diff --git a/.coveragerc b/.coveragerc index 902f374..ef618b2 100644 --- a/.coveragerc +++ b/.coveragerc @@ -15,4 +15,7 @@ exclude_lines = raise TypeError warnings.warn only on win32 + sys.platform == 'win32' + except ImportError + ModuleNotFoundError if __name__ == .__main__.: \ No newline at end of file diff --git a/.gitignore b/.gitignore index 292a61e..267c15a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ new_test_pypi_release.bash new_pypi_release.bash coverage.html/* - +_autosummary/ .idea/ venv/ *__pycache__* diff --git a/.travis.yml b/.travis.yml index d428833..59771a3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,11 +7,20 @@ matrix: include: - os: linux dist: xenial - sudo: false + sudo: true + - os: osx + osx_image: xcode10.3 # default is 9.x, which fails + sudo: true - os: osx - # osx_image: xcode10.2 + osx_image: xcode11 sudo: true +#addons: +# homebrew: +# packages: +# - cmake +# - gcc@9 + env: global: - CACHE_DIR="$HOME/virtualenv" @@ -22,21 +31,29 @@ before_install: - travis/install-conda.sh - export PATH="$MINICONDA_DIR/bin:$PATH" - hash -r - - travis/install-pip.sh + - conda install -y numpy # install optimized numpy first + - conda install -y llvm libgcc || true # Try to fix build errors on MacOS + - pip install pybind11 # so that nmslib can build + - travis/install-pip.sh # install all the other requirements + - travis/install-build-puffinn.sh # install from cache or build first + - travis/install-build-ngtpy.sh # build and install NGT (others might be added) install: - - python setup.py build - - python setup.py install + - python3 setup.py build + - python3 setup.py install before_script: - flake8 --exit-zero . script: - - python setup.py test - - pytest --cov=skhubness + - pytest skhubness --cov=skhubness after_success: - - codecov + # Only on Linux are all libraries supported, and thus tested + - if [ "$TRAVIS_OS_NAME" = "linux" ]; then codecov; fi + +before_cache: + - brew cleanup || true cache: - pip @@ -46,6 +63,7 @@ cache: - "$HOME/.cache/pip" - "$HOME/virtualenv" - "$HOME/miniconda" + - "$HOME/Library/Caches/Homebrew" branches: only: diff --git a/README.md b/README.md index 4f03bc8..eabcf2d 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,18 @@ pip install scikit-hubness Dependencies are installed automatically, if necessary. `scikit-hubness` requires `numpy`, `scipy` and `scikit-learn`. Approximate nearest neighbor search and approximate hubness reduction -additionally requires `nmslib` and/or `falconn`. +additionally require at least one of the following packages: +* [`nmslib`](https://github.com/nmslib/nmslib) + for hierarchical navigable small-world graphs ('hnsw') +* [`ngtpy`](https://github.com/yahoojapan/NGT/) + for nearest neighbor graphs ('onng') +* [`puffinn`](https://github.com/puffinn/puffinn) + for locality-sensitive hashing ('lsh') +* [`falconn`](https://github.com/FALCONN-LIB/FALCONN) + for alternative LSH ('falconn_lsh'), or +* [`annoy`](https://github.com/spotify/annoy) + for random projection forests ('rptree'). + Some modules require `tqdm` or `joblib`. All these packages are available from open repositories, such as [PyPI](https://pypi.org). @@ -65,6 +76,12 @@ http://scikit-hubness.readthedocs.io/en/latest/user_guide/installation.html). Documentation is available online: http://scikit-hubness.readthedocs.io/en/latest/index.html + +## What's new + +See the [changelog](docs/changelog.md) to find what's new in the latest package version.
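The backends listed in the README above plug into a scikit-learn-style estimator API. A minimal sketch of how such a backend and a hubness reduction method might be selected (toy data; the estimator and its `algorithm`, `algorithm_params`, and `hubness` parameters are documented in the docstring changes later in this diff):

```python
from sklearn.datasets import make_classification
from skhubness.neighbors import KNeighborsClassifier

# Toy data; any (n_samples, n_features) float array works here.
X, y = make_classification(n_samples=500, n_features=30, random_state=0)

knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='hnsw',                        # or 'lsh', 'falconn_lsh', 'onng', 'rptree'
    algorithm_params={'n_candidates': 100},  # retrieve 100 candidates before reordering
    hubness='local_scaling',                 # or 'mutual_proximity', 'dis_sim_local', None
)
knn.fit(X, y)
print(knn.score(X, y))                       # mean accuracy on the training data
```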
+ + ## Quickstart Users of `scikit-hubness` may want to diff --git a/docs/_autosummary/skhubness.analysis.Hubness.rst b/docs/_autosummary/skhubness.analysis.Hubness.rst deleted file mode 100644 index 90b3d9e..0000000 --- a/docs/_autosummary/skhubness.analysis.Hubness.rst +++ /dev/null @@ -1,30 +0,0 @@ -skhubness.analysis.Hubness -========================== - -.. currentmodule:: skhubness.analysis - -.. autoclass:: Hubness - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~Hubness.__init__ - ~Hubness.antihub_occurrence - ~Hubness.atkinson_index - ~Hubness.estimate - ~Hubness.fit_transform - ~Hubness.gini_index - ~Hubness.hub_occurrence - ~Hubness.robinhood_index - ~Hubness.skewness_truncnorm - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.BallTree.rst b/docs/_autosummary/skhubness.neighbors.BallTree.rst deleted file mode 100644 index b2cde23..0000000 --- a/docs/_autosummary/skhubness.neighbors.BallTree.rst +++ /dev/null @@ -1,42 +0,0 @@ -skhubness.neighbors.BallTree -============================ - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: BallTree - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~BallTree.__init__ - ~BallTree.get_arrays - ~BallTree.get_n_calls - ~BallTree.get_tree_stats - ~BallTree.kernel_density - ~BallTree.query - ~BallTree.query_radius - ~BallTree.reset_n_calls - ~BallTree.two_point_correlation - - - - - - .. rubric:: Attributes - - .. autosummary:: - - ~BallTree.data - ~BallTree.idx_array - ~BallTree.node_bounds - ~BallTree.node_data - ~BallTree.sample_weight - ~BallTree.sum_weight - ~BallTree.valid_metrics - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.DistanceMetric.rst b/docs/_autosummary/skhubness.neighbors.DistanceMetric.rst deleted file mode 100644 index 79ee5b9..0000000 --- a/docs/_autosummary/skhubness.neighbors.DistanceMetric.rst +++ /dev/null @@ -1,26 +0,0 @@ -skhubness.neighbors.DistanceMetric -================================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: DistanceMetric - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~DistanceMetric.__init__ - ~DistanceMetric.dist_to_rdist - ~DistanceMetric.get_metric - ~DistanceMetric.pairwise - ~DistanceMetric.rdist_to_dist - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.HNSW.rst b/docs/_autosummary/skhubness.neighbors.HNSW.rst deleted file mode 100644 index b9e0ca8..0000000 --- a/docs/_autosummary/skhubness.neighbors.HNSW.rst +++ /dev/null @@ -1,30 +0,0 @@ -skhubness.neighbors.HNSW -======================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: HNSW - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~HNSW.__init__ - ~HNSW.fit - ~HNSW.kneighbors - - - - - - .. rubric:: Attributes - - .. autosummary:: - - ~HNSW.valid_metrics - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.KDTree.rst b/docs/_autosummary/skhubness.neighbors.KDTree.rst deleted file mode 100644 index 46e15d6..0000000 --- a/docs/_autosummary/skhubness.neighbors.KDTree.rst +++ /dev/null @@ -1,42 +0,0 @@ -skhubness.neighbors.KDTree -========================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: KDTree - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. 
autosummary:: - - ~KDTree.__init__ - ~KDTree.get_arrays - ~KDTree.get_n_calls - ~KDTree.get_tree_stats - ~KDTree.kernel_density - ~KDTree.query - ~KDTree.query_radius - ~KDTree.reset_n_calls - ~KDTree.two_point_correlation - - - - - - .. rubric:: Attributes - - .. autosummary:: - - ~KDTree.data - ~KDTree.idx_array - ~KDTree.node_bounds - ~KDTree.node_data - ~KDTree.sample_weight - ~KDTree.sum_weight - ~KDTree.valid_metrics - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.KNeighborsClassifier.rst b/docs/_autosummary/skhubness.neighbors.KNeighborsClassifier.rst deleted file mode 100644 index 09eb1eb..0000000 --- a/docs/_autosummary/skhubness.neighbors.KNeighborsClassifier.rst +++ /dev/null @@ -1,31 +0,0 @@ -skhubness.neighbors.KNeighborsClassifier -======================================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: KNeighborsClassifier - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~KNeighborsClassifier.__init__ - ~KNeighborsClassifier.fit - ~KNeighborsClassifier.get_params - ~KNeighborsClassifier.kcandidates - ~KNeighborsClassifier.kneighbors - ~KNeighborsClassifier.kneighbors_graph - ~KNeighborsClassifier.predict - ~KNeighborsClassifier.predict_proba - ~KNeighborsClassifier.score - ~KNeighborsClassifier.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.KNeighborsRegressor.rst b/docs/_autosummary/skhubness.neighbors.KNeighborsRegressor.rst deleted file mode 100644 index 0c43902..0000000 --- a/docs/_autosummary/skhubness.neighbors.KNeighborsRegressor.rst +++ /dev/null @@ -1,30 +0,0 @@ -skhubness.neighbors.KNeighborsRegressor -======================================= - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: KNeighborsRegressor - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~KNeighborsRegressor.__init__ - ~KNeighborsRegressor.fit - ~KNeighborsRegressor.get_params - ~KNeighborsRegressor.kcandidates - ~KNeighborsRegressor.kneighbors - ~KNeighborsRegressor.kneighbors_graph - ~KNeighborsRegressor.predict - ~KNeighborsRegressor.score - ~KNeighborsRegressor.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.KernelDensity.rst b/docs/_autosummary/skhubness.neighbors.KernelDensity.rst deleted file mode 100644 index 9055085..0000000 --- a/docs/_autosummary/skhubness.neighbors.KernelDensity.rst +++ /dev/null @@ -1,28 +0,0 @@ -skhubness.neighbors.KernelDensity -================================= - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: KernelDensity - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~KernelDensity.__init__ - ~KernelDensity.fit - ~KernelDensity.get_params - ~KernelDensity.sample - ~KernelDensity.score - ~KernelDensity.score_samples - ~KernelDensity.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.LSH.rst b/docs/_autosummary/skhubness.neighbors.LSH.rst deleted file mode 100644 index c6044be..0000000 --- a/docs/_autosummary/skhubness.neighbors.LSH.rst +++ /dev/null @@ -1,31 +0,0 @@ -skhubness.neighbors.LSH -======================= - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: LSH - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~LSH.__init__ - ~LSH.fit - ~LSH.kneighbors - ~LSH.radius_neighbors - - - - - - .. rubric:: Attributes - - .. 
autosummary:: - - ~LSH.valid_metrics - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.LocalOutlierFactor.rst b/docs/_autosummary/skhubness.neighbors.LocalOutlierFactor.rst deleted file mode 100644 index 25d271b..0000000 --- a/docs/_autosummary/skhubness.neighbors.LocalOutlierFactor.rst +++ /dev/null @@ -1,37 +0,0 @@ -skhubness.neighbors.LocalOutlierFactor -====================================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: LocalOutlierFactor - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~LocalOutlierFactor.__init__ - ~LocalOutlierFactor.fit - ~LocalOutlierFactor.get_params - ~LocalOutlierFactor.kcandidates - ~LocalOutlierFactor.kneighbors - ~LocalOutlierFactor.kneighbors_graph - ~LocalOutlierFactor.set_params - - - - - - .. rubric:: Attributes - - .. autosummary:: - - ~LocalOutlierFactor.decision_function - ~LocalOutlierFactor.fit_predict - ~LocalOutlierFactor.predict - ~LocalOutlierFactor.score_samples - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.NearestCentroid.rst b/docs/_autosummary/skhubness.neighbors.NearestCentroid.rst deleted file mode 100644 index 0868b35..0000000 --- a/docs/_autosummary/skhubness.neighbors.NearestCentroid.rst +++ /dev/null @@ -1,27 +0,0 @@ -skhubness.neighbors.NearestCentroid -=================================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: NearestCentroid - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~NearestCentroid.__init__ - ~NearestCentroid.fit - ~NearestCentroid.get_params - ~NearestCentroid.predict - ~NearestCentroid.score - ~NearestCentroid.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.NearestNeighbors.rst b/docs/_autosummary/skhubness.neighbors.NearestNeighbors.rst deleted file mode 100644 index e7fb30f..0000000 --- a/docs/_autosummary/skhubness.neighbors.NearestNeighbors.rst +++ /dev/null @@ -1,30 +0,0 @@ -skhubness.neighbors.NearestNeighbors -==================================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: NearestNeighbors - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~NearestNeighbors.__init__ - ~NearestNeighbors.fit - ~NearestNeighbors.get_params - ~NearestNeighbors.kcandidates - ~NearestNeighbors.kneighbors - ~NearestNeighbors.kneighbors_graph - ~NearestNeighbors.radius_neighbors - ~NearestNeighbors.radius_neighbors_graph - ~NearestNeighbors.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.NeighborhoodComponentsAnalysis.rst b/docs/_autosummary/skhubness.neighbors.NeighborhoodComponentsAnalysis.rst deleted file mode 100644 index 8a75742..0000000 --- a/docs/_autosummary/skhubness.neighbors.NeighborhoodComponentsAnalysis.rst +++ /dev/null @@ -1,27 +0,0 @@ -skhubness.neighbors.NeighborhoodComponentsAnalysis -================================================== - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: NeighborhoodComponentsAnalysis - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. 
autosummary:: - - ~NeighborhoodComponentsAnalysis.__init__ - ~NeighborhoodComponentsAnalysis.fit - ~NeighborhoodComponentsAnalysis.fit_transform - ~NeighborhoodComponentsAnalysis.get_params - ~NeighborhoodComponentsAnalysis.set_params - ~NeighborhoodComponentsAnalysis.transform - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.RadiusNeighborsClassifier.rst b/docs/_autosummary/skhubness.neighbors.RadiusNeighborsClassifier.rst deleted file mode 100644 index e1598a6..0000000 --- a/docs/_autosummary/skhubness.neighbors.RadiusNeighborsClassifier.rst +++ /dev/null @@ -1,30 +0,0 @@ -skhubness.neighbors.RadiusNeighborsClassifier -============================================= - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: RadiusNeighborsClassifier - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~RadiusNeighborsClassifier.__init__ - ~RadiusNeighborsClassifier.fit - ~RadiusNeighborsClassifier.get_params - ~RadiusNeighborsClassifier.kcandidates - ~RadiusNeighborsClassifier.predict - ~RadiusNeighborsClassifier.radius_neighbors - ~RadiusNeighborsClassifier.radius_neighbors_graph - ~RadiusNeighborsClassifier.score - ~RadiusNeighborsClassifier.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.RadiusNeighborsRegressor.rst b/docs/_autosummary/skhubness.neighbors.RadiusNeighborsRegressor.rst deleted file mode 100644 index 787a3c1..0000000 --- a/docs/_autosummary/skhubness.neighbors.RadiusNeighborsRegressor.rst +++ /dev/null @@ -1,30 +0,0 @@ -skhubness.neighbors.RadiusNeighborsRegressor -============================================ - -.. currentmodule:: skhubness.neighbors - -.. autoclass:: RadiusNeighborsRegressor - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~RadiusNeighborsRegressor.__init__ - ~RadiusNeighborsRegressor.fit - ~RadiusNeighborsRegressor.get_params - ~RadiusNeighborsRegressor.kcandidates - ~RadiusNeighborsRegressor.predict - ~RadiusNeighborsRegressor.radius_neighbors - ~RadiusNeighborsRegressor.radius_neighbors_graph - ~RadiusNeighborsRegressor.score - ~RadiusNeighborsRegressor.set_params - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.kneighbors_graph.rst b/docs/_autosummary/skhubness.neighbors.kneighbors_graph.rst deleted file mode 100644 index eceb5a0..0000000 --- a/docs/_autosummary/skhubness.neighbors.kneighbors_graph.rst +++ /dev/null @@ -1,6 +0,0 @@ -skhubness.neighbors.kneighbors\_graph -===================================== - -.. currentmodule:: skhubness.neighbors - -.. autofunction:: kneighbors_graph \ No newline at end of file diff --git a/docs/_autosummary/skhubness.neighbors.radius_neighbors_graph.rst b/docs/_autosummary/skhubness.neighbors.radius_neighbors_graph.rst deleted file mode 100644 index 7bd0f89..0000000 --- a/docs/_autosummary/skhubness.neighbors.radius_neighbors_graph.rst +++ /dev/null @@ -1,6 +0,0 @@ -skhubness.neighbors.radius\_neighbors\_graph -============================================ - -.. currentmodule:: skhubness.neighbors - -.. autofunction:: radius_neighbors_graph \ No newline at end of file diff --git a/docs/_autosummary/skhubness.reduction.LocalScaling.rst b/docs/_autosummary/skhubness.reduction.LocalScaling.rst deleted file mode 100644 index 34b3ded..0000000 --- a/docs/_autosummary/skhubness.reduction.LocalScaling.rst +++ /dev/null @@ -1,24 +0,0 @@ -skhubness.reduction.LocalScaling -================================ - -.. 
currentmodule:: skhubness.reduction - -.. autoclass:: LocalScaling - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~LocalScaling.__init__ - ~LocalScaling.fit - ~LocalScaling.transform - - - - - - \ No newline at end of file diff --git a/docs/_autosummary/skhubness.reduction.MutualProximity.rst b/docs/_autosummary/skhubness.reduction.MutualProximity.rst deleted file mode 100644 index dd2fb22..0000000 --- a/docs/_autosummary/skhubness.reduction.MutualProximity.rst +++ /dev/null @@ -1,24 +0,0 @@ -skhubness.reduction.MutualProximity -=================================== - -.. currentmodule:: skhubness.reduction - -.. autoclass:: MutualProximity - - - .. automethod:: __init__ - - - .. rubric:: Methods - - .. autosummary:: - - ~MutualProximity.__init__ - ~MutualProximity.fit - ~MutualProximity.transform - - - - - - \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..33a397b --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,35 @@ +# Changelog + +## [Next release] +... + +## [0.21.0a8] - 2019-09-12 +### Added +- Approximate nearest neighbor search + * LSH by an additional provider, [`puffinn`](https://github.com/puffinn/puffinn) (Linux only, at the moment) + * ANNG provided by [`ngtpy`](https://github.com/yahoojapan/NGT/) (Linux, MacOS) + * Random projection forests provided by [`annoy`](https://github.com/spotify/annoy) (Linux, MacOS, Windows) + +### Fixes +- Several minor issues +- Several documentation issues + + +## [0.21.0a7] - 2019-07-17 + +The first alpha release of `scikit-hubness` to appear in this changelog. +It already contains the following features: + +- Hubness estimation (exact or approximate) +- Hubness reduction (exact or approximate) + * Mutual proximity + * Local scaling + * DisSim Local +- Approximate nearest neighbor search + * HNSW provided by [nmslib](https://github.com/nmslib/nmslib) + * LSH provided by [falconn](https://github.com/FALCONN-LIB/FALCONN) + +[Next release]: https://github.com/VarIr/scikit-hubness/compare/release-0.21.0a8...HEAD +[0.21.0a8]: https://github.com/VarIr/scikit-hubness/compare/v0.21.0-alpha.7...release-0.21.0a8 +[0.21.0a7]: https://github.com/VarIr/scikit-hubness/releases/tag/v0.21.0-alpha.7 + diff --git a/docs/conf.py b/docs/conf.py index 8ebe6f5..586c130 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,12 @@ sys.path.insert(0, os.path.abspath('../')) import mock -MOCK_MODULES = ['falconn', 'nmslib', +MOCK_MODULES = ['falconn', + 'nmslib', + 'annoy', + 'ngt', + 'ngtpy', + 'puffinn', ] for mod_name in MOCK_MODULES: sys.modules[mod_name] = mock.Mock() diff --git a/docs/documentation.rst b/docs/documentation.rst index fcf6830..c0dd9ec 100644 --- a/docs/documentation.rst +++ b/docs/documentation.rst @@ -43,11 +43,14 @@ Neighbors: :mod:`skhubness.neighbors` neighbors.HNSW neighbors.KNeighborsClassifier neighbors.KNeighborsRegressor - neighbors.LSH + neighbors.FalconnLSH neighbors.NearestCentroid neighbors.NearestNeighbors + neighbors.ONNG + neighbors.PuffinnLSH neighbors.RadiusNeighborsClassifier neighbors.RadiusNeighborsRegressor + neighbors.RandomProjectionTree neighbors.kneighbors_graph neighbors.radius_neighbors_graph neighbors.KernelDensity @@ -71,6 +74,4 @@ Reduction: :mod:`skhubness.reduction` reduction.MutualProximity reduction.LocalScaling - - - + reduction.DisSimLocal diff --git a/docs/user_guide/installation.rst b/docs/user_guide/installation.rst index 969106c..c16cafd 100644 --- a/docs/user_guide/installation.rst +++ 
b/docs/user_guide/installation.rst @@ -11,10 +11,28 @@ The current release of `scikit-hubness` can be installed from PyPI: pip install scikit-hubness +Dependencies +------------ + +All strict dependencies of `scikit-hubness` are automatically installed +by `pip`. Some optional dependencies (certain ANN libraries) may not +yet be available from PyPI. If you require one of these libraries, +please refer to the library's documentation for building instructions. +For example, at the time of writing, `puffinn` was not available on PyPI. +Building and installing is straightforward: + +.. code-block:: bash + + git clone https://github.com/puffinn/puffinn.git + cd puffinn + python3 setup.py build + pip install . + + From Source ----------- -You can always grab the latest version directly from GitHub: +You can always grab the latest version of `scikit-hubness` directly from GitHub: .. code-block:: bash @@ -35,6 +53,20 @@ Supported platforms - MacOS X - Windows -Note, that some functionality of `scikit-hubness` is not available on Windows -(e.g. locality-sensitive hashing (LSH) is provided by `falconn`, -which itself does not support Windows. Please use HNSW instead). +Note that not all approximate nearest neighbor libraries used in `scikit-hubness` +are available on all platforms. The table below indicates which libraries and +algorithms are currently supported on your operating system. + ++---------+-------------+-------+-------+---------+ | library | algorithm | Linux | MacOS | Windows | ++---------+-------------+-------+-------+---------+ | nmslib | hnsw | x | x | x | ++---------+-------------+-------+-------+---------+ | annoy | rptree | x | x | x | ++---------+-------------+-------+-------+---------+ | ngtpy | onng | x | x | | ++---------+-------------+-------+-------+---------+ | falconn | falconn_lsh | x | x | | ++---------+-------------+-------+-------+---------+ | puffinn | lsh | x | | | ++---------+-------------+-------+-------+---------+ diff --git a/requirements-win.txt b/requirements-win.txt index ed1d85c..3392d78 100644 --- a/requirements-win.txt +++ b/requirements-win.txt @@ -5,7 +5,9 @@ pandas joblib>=0.12 tqdm nmslib +annoy # falconn # DOES NOT support Windows +# ngtpy # DOES NOT support Windows pytest pytest-cov codecov diff --git a/requirements.txt b/requirements.txt index d819d64..b2d8f5c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,9 @@ scikit-learn>=0.21 pandas joblib>=0.12 tqdm +annoy nmslib +ngt falconn pytest pytest-cov diff --git a/setup.py b/setup.py index c4e8467..f8808ca 100644 --- a/setup.py +++ b/setup.py @@ -78,7 +78,9 @@ def find_version(*file_paths): 'pybind11', # Required for nmslib build 'joblib >= 0.12', 'nmslib', + 'annoy', 'falconn;platform_system!="Windows"', # falconn is not available on Windows; see also PEP 508 + 'ngt;platform_system!="Windows"', # NGT is not available on Windows ], extras_require={ # Install using the 'extras' syntax: $ pip install sampleproject[dev] # 'dev': ['check-manifest'], diff --git a/skhubness/__init__.py b/skhubness/__init__.py index 5f4cf93..df091a8 100644 --- a/skhubness/__init__.py +++ b/skhubness/__init__.py @@ -3,7 +3,7 @@ """ Python package for nearest neighbor retrieval in high-dimensional space.""" -__version__ = '0.21.0a7' +__version__ = '0.21.0a8' from . import analysis from . 
import data diff --git a/skhubness/analysis/estimation.py b/skhubness/analysis/estimation.py index 0450707..6091961 100644 --- a/skhubness/analysis/estimation.py +++ b/skhubness/analysis/estimation.py @@ -48,11 +48,11 @@ class Hubness(BaseEstimator): - """ Hubness characteristics of data set. + """ Examine hubness characteristics of data. Parameters ---------- - k : int + k: int Neighborhood size return_value: str, default = "k_skewness" @@ -62,24 +62,24 @@ class Hubness(BaseEstimator): or check `skhubness.analysis.VALID_HUBNESS_MEASURE` for available measures. - hub_size : float + hub_size: float Hubs are defined as objects with k-occurrence > hub_size * k. - metric : string, one of ['euclidean', 'cosine', 'precomputed'] + metric: string, one of ['euclidean', 'cosine', 'precomputed'] Metric to use for distance computation. Currently, only Euclidean, cosine, and precomputed distances are supported. - store_k_neighbors : bool + store_k_neighbors: bool Whether to save the k-neighbor lists. Requires O(n_test * k) memory. - store_k_occurrence : bool + store_k_occurrence: bool Whether to save the k-occurrence. Requires O(n_test) memory. - algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm: {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'lsh' will use :class:`FalconnLSH` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. @@ -89,7 +89,7 @@ class Hubness(BaseEstimator): Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -97,9 +97,8 @@ class Hubness(BaseEstimator): with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm - # TODO add all supported hubness reduction methods - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` @@ -113,64 +112,64 @@ class Hubness(BaseEstimator): a mutual proximity variant is used, which models distance distributions with independent Gaussians. - random_state : int, RandomState instance or None, optional + random_state: int, RandomState instance or None, optional If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. - shuffle_equal : bool, optional + shuffle_equal: bool, optional If true and metric='precomputed', shuffle neighbors with identical distances to avoid artifact hubness. NOTE: This is especially useful for secondary distance measures with a finite number of possible values, e.g. SNN or MP empiric. - n_jobs : int, optional + n_jobs: int, optional CURRENTLY IGNORED. Number of processes for parallel computations. - `1`: Don't use multiprocessing. 
- `-1`: Use all CPUs - verbose : int, optional + verbose: int, optional Level of output messages Attributes ---------- - k_skewness : float + k_skewness: float Hubness, measured as skewness of k-occurrence histogram [1]_ - k_skewness_truncnorm : float + k_skewness_truncnorm: float Hubness, measured as skewness of truncated normal distribution fitted with k-occurrence histogram - atkinson_index : float + atkinson_index: float Hubness, measured as the Atkinson index of k-occurrence distribution - gini_index : float + gini_index: float Hubness, measured as the Gini index of k-occurrence distribution - robinhood_index : float + robinhood_index: float Hubness, measured as Robin Hood index of k-occurrence distribution [2]_ - antihubs : int + antihubs: int Indices to antihubs - antihub_occurrence : float + antihub_occurrence: float Proportion of antihubs in data set - hubs : int + hubs: int Indices to hubs - hub_occurrence : float + hub_occurrence: float Proportion of k-nearest neighbor slots occupied by hubs - groupie_ratio : float + groupie_ratio: float Proportion of objects with the largest hub in their neighborhood - k_occurrence : ndarray + k_occurrence: ndarray Reverse neighbor count for each object - k_neighbors : ndarray + k_neighbors: ndarray Indices to k-nearest neighbors for each object References @@ -358,11 +357,11 @@ def _k_neighbors_precomputed_sparse(self, X: csr_matrix, n_samples: int = None) Parameters ---------- - X : sparse, shape = [n_test, n_indexed] + X: sparse, shape = [n_test, n_indexed] Sparse distance matrix. Only non-zero elements may be considered neighbors. - n_samples : int + n_samples: int Number of sampled indexed objects, e.g. in approximate hubness reduction. If None, this is inferred from the first row of X. @@ -415,7 +414,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float: Parameters ---------- - k_occurrence : ndarray + k_occurrence: ndarray Reverse nearest neighbor count for each object. """ clip_left = 0 @@ -433,9 +432,9 @@ def _calc_gini_index(k_occurrence: np.ndarray, limiting='memory') -> float: Parameters ---------- - k_occurrence : ndarray + k_occurrence: ndarray Reverse nearest neighbor count for each object. - limiting : 'memory' or 'cpu' + limiting: 'memory' or 'cpu' If 'cpu', use fast implementation with high memory usage, if 'memory', use slightly slower, but memory-efficient implementation, otherwise use naive implementation (slow, low memory usage) @@ -462,7 +461,7 @@ def _calc_robinhood_index(k_occurrence: np.ndarray) -> float: Parameters ---------- - k_occurrence : ndarray + k_occurrence: ndarray Reverse nearest neighbor count for each object. Notes ----- @@ -489,9 +488,9 @@ def _calc_atkinson_index(k_occurrence: np.ndarray, eps: float = .5) -> float: Parameters ---------- - k_occurrence : ndarray + k_occurrence: ndarray Reverse nearest neighbor count for each object. - eps : float, default = 0.5 + eps: float, default = 0.5 'Income' weight. Turns the index into a normative measure. """ if eps == 1: @@ -509,7 +508,7 @@ def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> (np.array, float): Parameters ---------- - k_occurrence : ndarray + k_occurrence: ndarray Reverse nearest neighbor count for each object. 
""" antihubs = np.argwhere(k_occurrence == 0).ravel() @@ -522,13 +521,13 @@ def _calc_hub_occurrence(k: int, k_occurrence: np.ndarray, n_test: int, hub_size Parameters ---------- - k : int + k: int Specifies the number of nearest neighbors - k_occurrence : ndarray + k_occurrence: ndarray Reverse nearest neighbor count for each object. - n_test : int + n_test: int Number of queries (or objects in a test set) - hub_size : float + hub_size: float Factor to determine hubs """ hubs = np.argwhere(k_occurrence >= hub_size * k).ravel() @@ -544,18 +543,18 @@ def score(self, X: np.ndarray = None, y=None, has_self_distances: bool = False) Parameters ---------- - X : ndarray, shape (n_query, n_features) or (n_query, n_indexed) + X: ndarray, shape (n_query, n_features) or (n_query, n_indexed) Array of query vectors, or distance, if self.metric == 'precomputed' - y : ignored + y: ignored - has_self_distances : bool, default = False + has_self_distances: bool, default = False Define, whether a precomputed distance matrix contains self distances, which need to be excluded. Returns ------- - hubness_measure : float or dict + hubness_measure: float or dict Return the hubness measure as indicated by `return_value`. Additional hubness indices are provided as attributes (e.g. :func:`robinhood_index_`). diff --git a/skhubness/analysis/tests/test_estimation.py b/skhubness/analysis/tests/test_estimation.py index 92582ef..89399c1 100644 --- a/skhubness/analysis/tests/test_estimation.py +++ b/skhubness/analysis/tests/test_estimation.py @@ -17,6 +17,7 @@ def test_estimator(): + """ Check that Hubness is a valid scikit-learn estimator. """ check_estimator(Hubness) @@ -67,6 +68,7 @@ def test_return_k_occurrence(store_k_occurrence): def test_limiting_factor(): + """ Different implementations of Gini index calculation should give the same result. 
""" X, _ = make_classification() hub = Hubness(store_k_occurrence=True, return_value='k_occurrence') hub.fit(X) @@ -278,76 +280,3 @@ def test_hubness_independent_on_data_set_size(hubness_measure): np.testing.assert_allclose(value[-1], value[0], rtol=0.1) else: np.testing.assert_allclose(value[-1], value[0], rtol=2e-1) - - -# def test_hubness_from_sparse_precomputed_matrix(self): -# # Generate high-dimensional data -# X, y = make_classification(n_samples=1000, -# n_features=100, -# n_informative=100, -# n_redundant=0, -# n_repeated=0, -# random_state=123) -# X = X.astype(np.float32) -# y = y.astype(np.int32) -# for hr_algorithm in ['mpg', 'ls', 'dsl']: -# for sampling_algorithm in ['hnsw', 'lsh']: # ['hnsw', 'lsh']:# -# for n_samples in [50, 100]: -# print(f'Test {hr_algorithm}, {sampling_algorithm}, ' -# f'with {n_samples} samples.') -# self.hubness_from_sparse_precomputed_matrix( -# X, y, hr_algorithm, sampling_algorithm, n_samples) - - -# def hubness_from_sparse_precomputed_matrix(self, X, y, hr, -# sample, n_samples): -# # Make train-test split -# X_train, X_test, y_train, _ = train_test_split(X, y) -# # Obtain a sparse distance matrix -# ahr = ApproximateHubnessReduction( -# hr_algorithm=hr, sampling_algorithm=sample, n_samples=n_samples) -# ahr.fit(X_train, y_train) -# _ = ahr.transform(X_test) -# D_test_csr = ahr.sec_dist_sparse_ -# # Hubness in sparse matrix -# hub = Hubness(k=10, -# metric='precomputed', -# return_k_neighbors=True, -# shuffle_equal=False, -# verbose=self.verbose) -# hub.score(D_test_csr) -# Sk_trunc_sparse = hub.k_skewness_truncnorm_ -# Sk_sparse = hub.k_skewness_ -# k_neigh_sparse = hub.k_neighbors_ -# # Hubness in dense matrix -# try: -# D_test_dense = D_test_csr.toarray() -# except AttributeError: -# return # Without sampling, the distance matrix is not sparse -# D_test_dense[D_test_dense == 0] = np.finfo(np.float32).max -# hub_dense = Hubness(k=10, -# metric='precomputed', -# return_k_neighbors=True, -# shuffle_equal=False) -# hub_dense.score(D_test_dense) -# Sk_trunc_dense = hub_dense.k_skewness_truncnorm_ -# Sk_dense = hub_dense.k_skewness_ -# k_neigh_dense = hub_dense.k_neighbors_ -# if hr in ['MP', 'MPG']: -# decimal = 1 -# else: -# decimal = 5 -# try: -# np.testing.assert_array_equal( -# k_neigh_dense.ravel(), k_neigh_sparse) -# except AssertionError: -# s1 = k_neigh_dense.sum() -# s2 = k_neigh_sparse.sum() -# sm = max(s1, s2) -# print(f'k_neighbors not identical, but close: ' -# f'{s1}, {s2}, {s1/s2}.') -# np.testing.assert_allclose(s2 / sm, s1 / sm, rtol=1e-2) -# np.testing.assert_array_almost_equal( -# Sk_sparse, Sk_dense, decimal=decimal) -# np.testing.assert_array_almost_equal( -# Sk_trunc_sparse, Sk_trunc_dense, decimal=decimal) diff --git a/skhubness/data/__init__.py b/skhubness/data/__init__.py index e68b350..6437dfe 100644 --- a/skhubness/data/__init__.py +++ b/skhubness/data/__init__.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: BSD-3-Clause - +""" +The :mod:`skhubness.data` package provides example data sets. 
+""" from .load_dataset import load_dexter __all__ = ['load_dexter'] diff --git a/skhubness/neighbors/__init__.py b/skhubness/neighbors/__init__.py index 3e0189a..a7b9ab8 100644 --- a/skhubness/neighbors/__init__.py +++ b/skhubness/neighbors/__init__.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- # SPDX-License-Identifier: BSD-3-Clause - """ The :mod:`skhubness.neighbors` package is a drop-in replacement for :mod:`sklearn.neighbors`, providing all of its features, while adding transparent support for hubness reduction @@ -11,12 +10,21 @@ from .classification import KNeighborsClassifier, RadiusNeighborsClassifier from .graph import kneighbors_graph, radius_neighbors_graph from .hnsw import HNSW +from .approximate_neighbors import UnavailableANN +try: + from .lsh import FalconnLSH +except ImportError: + FalconnLSH = UnavailableANN try: - from .lsh import LSH -except (ImportError, ModuleNotFoundError): - from .approximate_neighbors import UnavailableANN - LSH = UnavailableANN + from .lsh import PuffinnLSH +except ImportError: + PuffinnLSH = UnavailableANN from .kd_tree import KDTree +try: + from .onng import ONNG +except ImportError: + ONNG = UnavailableANN +from .random_projection_trees import RandomProjectionTree from .dist_metrics import DistanceMetric from .regression import KNeighborsRegressor, RadiusNeighborsRegressor from .nearest_centroid import NearestCentroid @@ -28,19 +36,23 @@ __all__ = ['BallTree', 'DistanceMetric', + 'FalconnLSH', 'KDTree', 'HNSW', 'KNeighborsClassifier', 'KNeighborsRegressor', - 'LSH', 'NearestCentroid', 'NearestNeighbors', + 'PuffinnLSH', + 'ONNG', 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RandomProjectionTree', 'kneighbors_graph', 'radius_neighbors_graph', 'KernelDensity', 'LocalOutlierFactor', 'NeighborhoodComponentsAnalysis', 'VALID_METRICS', - 'VALID_METRICS_SPARSE'] + 'VALID_METRICS_SPARSE', + ] diff --git a/skhubness/neighbors/approximate_neighbors.py b/skhubness/neighbors/approximate_neighbors.py index 73fba33..43a0599 100644 --- a/skhubness/neighbors/approximate_neighbors.py +++ b/skhubness/neighbors/approximate_neighbors.py @@ -2,12 +2,25 @@ from abc import ABC, abstractmethod from multiprocessing import cpu_count +from typing import Union, Tuple import warnings +import numpy as np class ApproximateNearestNeighbor(ABC): - """ Abstract base class for approximate nearest neighbor search methods. """ + """ Abstract base class for approximate nearest neighbor search methods. + Parameters + ---------- + n_candidates: int, default = 5 + Number of neighbors to retrieve + metric: str, default = 'euclidean' + Distance metric, allowed are "angular", "euclidean", "manhattan", "hamming", "dot" + n_jobs: int, default = 1 + Number of parallel jobs + verbose: int, default = 0 + Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. + """ def __init__(self, n_candidates: int = 5, metric: str = 'sqeuclidean', n_jobs: int = 1, verbose: int = 0, *args, **kwargs): self.n_candidates = n_candidates @@ -21,10 +34,32 @@ def __init__(self, n_candidates: int = 5, metric: str = 'sqeuclidean', @abstractmethod def fit(self, X, y=None): + """ Setup ANN index from training data. + + Parameters + ---------- + X: np.array + Data to be indexed + y: any + Ignored + """ pass # pragma: no cover @abstractmethod - def kneighbors(self, X=None, n_candidates=None, return_distance=True): + def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve k nearest neighbors. 
+ + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + n_candidates: int or None, optional, default = None + Number of neighbors to retrieve. + If None, use the value passed during construction. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices. + """ pass # pragma: no cover diff --git a/skhubness/neighbors/base.py b/skhubness/neighbors/base.py index de269ce..11415ec 100644 --- a/skhubness/neighbors/base.py +++ b/skhubness/neighbors/base.py @@ -17,7 +17,6 @@ # License: BSD 3 clause (C) INRIA, University of Amsterdam from functools import partial -import sys import warnings import numpy as np @@ -35,25 +34,34 @@ from sklearn.utils.validation import check_is_fitted from joblib import Parallel, delayed, effective_n_jobs +from .approximate_neighbors import ApproximateNearestNeighbor, UnavailableANN from .hnsw import HNSW +from .random_projection_trees import RandomProjectionTree from ..reduction import NoHubnessReduction, LocalScaling, MutualProximity, DisSimLocal -# LSH library falconn does not support Windows -ON_PLATFORM_WINDOWS = sys.platform == 'win32' -if ON_PLATFORM_WINDOWS: - from .approximate_neighbors import UnavailableANN - LSH = UnavailableANN -else: - from .lsh import LSH - +try: + from .lsh import FalconnLSH +except ImportError: + FalconnLSH = UnavailableANN +try: + from .lsh import PuffinnLSH +except ImportError: + PuffinnLSH = UnavailableANN +try: + from .onng import ONNG +except ImportError: + ONNG = UnavailableANN __all__ = ['KNeighborsMixin', 'NeighborsBase', 'RadiusNeighborsMixin', 'SupervisedFloatMixin', 'SupervisedIntegerMixin', 'UnsupervisedMixin', 'VALID_METRICS', 'VALID_METRICS_SPARSE', ] -VALID_METRICS = dict(lsh=LSH.valid_metrics if not ON_PLATFORM_WINDOWS else [], +VALID_METRICS = dict(lsh=PuffinnLSH.valid_metrics if not issubclass(PuffinnLSH, UnavailableANN) else [], + falconn_lsh=FalconnLSH.valid_metrics if not issubclass(FalconnLSH, UnavailableANN) else [], + onng=ONNG.valid_metrics if not issubclass(ONNG, UnavailableANN) else [], hnsw=HNSW.valid_metrics, + rptree=RandomProjectionTree.valid_metrics, ball_tree=BallTree.valid_metrics, kd_tree=KDTree.valid_metrics, # The following list comes from the @@ -68,13 +76,19 @@ 'yule', 'wminkowski'])) VALID_METRICS_SPARSE = dict(lsh=[], + falconn_lsh=[], + onng=[], hnsw=[], + rptree=[], ball_tree=[], kd_tree=[], brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - {'haversine'}), ) +ALG_WITHOUT_RADIUS_QUERY = ['hnsw', 'lsh', 'rptree', 'onng', ] +ANN_ALG = ['hnsw', 'lsh', 'falconn_lsh', 'rptree', 'onng', ] + def _check_weights(weights): """Check to make sure weights are valid""" @@ -175,8 +189,7 @@ def _check_hubness_algorithm(self): def _check_algorithm_metric(self): if self.algorithm not in ['auto', 'brute', - 'kd_tree', 'ball_tree', - 'lsh', 'hnsw']: + 'kd_tree', 'ball_tree'] + ANN_ALG: raise ValueError("unrecognized algorithm: '%s'" % self.algorithm) if self.algorithm == 'auto': @@ -191,7 +204,7 @@ def _check_algorithm_metric(self): alg_check = self.algorithm if callable(self.metric): - if self.algorithm in ['kd_tree', 'lsh', 'hnsw']: + if self.algorithm in ['kd_tree'] + ANN_ALG: # callable metric is only valid for brute force and ball_tree raise ValueError(f"{self.algorithm} algorithm does not support callable metric '{self.metric}'") elif self.metric not in VALID_METRICS[alg_check]: @@ -274,13 +287,20 @@ def _fit(self, X): self._fit_method = 'kd_tree' 
return self - elif isinstance(X, (LSH, HNSW)): + elif isinstance(X, ApproximateNearestNeighbor): self._tree = None - if isinstance(X, LSH): + if isinstance(X, PuffinnLSH): self._fit_X = X.X_train_ self._fit_method = 'lsh' + elif isinstance(X, FalconnLSH): + self._fit_X = X.X_train_ + self._fit_method = 'falconn_lsh' + elif isinstance(X, ONNG): + self._fit_method = 'onng' elif isinstance(X, HNSW): self._fit_method = 'hnsw' + elif isinstance(X, RandomProjectionTree): + self._fit_method = 'rptree' self._index = X # TODO enable hubness reduction here ... @@ -346,13 +366,25 @@ def _fit(self, X): self._tree = None self._index = None elif self._fit_method == 'lsh': - self._index = LSH(verbose=self.verbose, **self.algorithm_params) + self._index = PuffinnLSH(verbose=self.verbose, **self.algorithm_params) + self._index.fit(X) + self._tree = None + elif self._fit_method == 'falconn_lsh': + self._index = FalconnLSH(verbose=self.verbose, **self.algorithm_params) + self._index.fit(X) + self._tree = None + elif self._fit_method == 'onng': + self._index = ONNG(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None elif self._fit_method == 'hnsw': self._index = HNSW(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None + elif self._fit_method == 'rptree': + self._index = RandomProjectionTree(verbose=self.verbose, **self.algorithm_params) + self._index.fit(X) + self._tree = None # because it's a tree, but not an sklearn tree... else: raise ValueError(f"algorithm = '{self.algorithm}' not recognized") @@ -446,7 +478,10 @@ class from an array representing our data set and ask who's check_is_fitted(self, "_fit_method") if n_neighbors is None: - n_neighbors = self.algorithm_params['n_candidates'] + try: + n_neighbors = self.algorithm_params['n_candidates'] + except KeyError: + n_neighbors = 1 if self.hubness is None else 100 elif n_neighbors <= 0: raise ValueError(f"Expected n_neighbors > 0. 
Got {n_neighbors}") else: @@ -508,7 +543,7 @@ class from an array representing our data set and ask who's X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) - elif self._fit_method in ['lsh']: + elif self._fit_method in ['lsh', 'falconn_lsh', 'rptree', 'onng', ]: # assume joblib>=0.12 delayed_query = delayed(self._index.kneighbors) parallel_kwargs = {"prefer": "threads"} @@ -752,7 +787,7 @@ class from an array representing our data set and ask who's else: results = np.hstack(results) - elif self._fit_method in ['lsh']: + elif self._fit_method in ['falconn_lsh']: # assume joblib>=0.12 delayed_query = delayed(self._index.radius_neighbors) parallel_kwargs = {"prefer": "threads"} @@ -762,13 +797,13 @@ class from an array representing our data set and ask who's for s in gen_even_slices(X.shape[0], n_jobs) ) - elif self._fit_method in ['hnsw']: - raise ValueError(f'nmslib/hnsw does not support radius queries.') + elif self._fit_method in ALG_WITHOUT_RADIUS_QUERY: + raise ValueError(f'{self._fit_method} does not support radius queries.') else: raise ValueError(f"internal: _fit_method={self._fit_method} not recognized.") - if self._fit_method in ['lsh', 'hnsw']: + if self._fit_method in ANN_ALG: if return_distance: # dist, neigh_ind = tuple(zip(*results)) # results = np.hstack(dist), np.hstack(neigh_ind) diff --git a/skhubness/neighbors/classification.py b/skhubness/neighbors/classification.py index 6e6af24..3baf21c 100644 --- a/skhubness/neighbors/classification.py +++ b/skhubness/neighbors/classification.py @@ -33,36 +33,41 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, Parameters ---------- - n_neighbors : int, optional (default = 5) + n_neighbors: int, optional (default = 5) Number of neighbors to use by default for :meth:`kneighbors` queries. - weights : str or callable, optional (default = 'uniform') + weights: str or callable, optional (default = 'uniform') weight function used in prediction. Possible values: - - 'uniform' : uniform weights. All points in each neighborhood + - 'uniform': uniform weights. All points in each neighborhood are weighted equally. - - 'distance' : weight points by the inverse of their distance. + - 'distance': weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - - [callable] : a user-defined function which accepts an + - [callable]: a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights. - algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'hnsw', 'lsh', 'falconn_lsh', 'onng', 'rptree', + 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'lsh' will use :class:`PuffinnLSH` + - 'falconn_lsh' will use :class:`FalconnLSH` + - 'onng' will use :class:`ONNG` + - 'rptree' will use :class:`RandomProjectionTree` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. + - 'auto' will attempt to decide the most appropriate exact algorithm + based on the values passed to :meth:`fit` method. This will not + select an approximate nearest neighbor algorithm. 
Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -70,9 +75,8 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm - # TODO add all supported hubness reduction methods - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` @@ -86,27 +90,27 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, a mutual proximity variant is used, which models distance distributions with independent Gaussians. - leaf_size : int, optional (default = 30) + leaf_size: int, optional (default = 30) Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p: integer, optional (default = 2) Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric: string or callable, default 'minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. - metric_params : dict, optional (default = None) + metric_params: dict, optional (default = None) Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. @@ -172,13 +176,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ + X: array-like, shape (n_query, n_features), \ or (n_query, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of shape [n_samples] or [n_samples, n_outputs] + y: array of shape [n_samples] or [n_samples, n_outputs] Class labels for each data sample. """ X = check_array(X, accept_sparse='csr') @@ -211,14 +215,16 @@ def predict(self, X): def predict_proba(self, X): """Return probability estimates for the test data X. + Parameters ---------- - X : array-like, shape (n_query, n_features), \ + X: array-like, shape (n_query, n_features), \ or (n_query, n_indexed) if metric == 'precomputed' Test samples. + Returns ------- - p : array of shape = [n_samples, n_classes], or a list of n_outputs + p: array of shape = [n_samples, n_classes], or a list of n_outputs of such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by lexicographic order. 
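As the docstring above states, `predict_proba` returns one row per query with classes in lexicographic order, so its row-wise argmax agrees with `predict`. A short sketch (data set and parameter values are illustrative; exact search with the default `algorithm='auto'` is assumed here):

```python
from sklearn.datasets import load_digits
from skhubness.neighbors import KNeighborsClassifier

X, y = load_digits(return_X_y=True)

# k-NN classification with mutual proximity hubness reduction
knn = KNeighborsClassifier(n_neighbors=5, hubness='mutual_proximity')
knn.fit(X, y)

proba = knn.predict_proba(X[:3])                   # shape (3, n_classes); rows sum to 1
print(proba.argmax(axis=1) == knn.predict(X[:3]))  # argmax agrees with predict for 0..9 labels
```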
@@ -271,27 +277,28 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, Parameters ---------- - radius : float, optional (default = 1.0) + radius: float, optional (default = 1.0) Range of parameter space to use by default for :meth:`radius_neighbors` queries. - weights : str or callable + weights: str or callable weight function used in prediction. Possible values: - - 'uniform' : uniform weights. All points in each neighborhood + - 'uniform': uniform weights. All points in each neighborhood are weighted equally. - - 'distance' : weight points by the inverse of their distance. + - 'distance': weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - - [callable] : a user-defined function which accepts an + - [callable]: a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights. Uniform weights are used by default. - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm: {'auto', 'falconn_lsh', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: + - 'falconn_lsh' will use :class:`FalconnLSH` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. @@ -301,7 +308,7 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -309,9 +316,8 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm - # TODO add all supported hubness reduction methods - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` @@ -325,32 +331,32 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, a mutual proximity variant is used, which models distance distributions with independent Gaussians. - leaf_size : int, optional (default = 30) + leaf_size: int, optional (default = 30) Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p: integer, optional (default = 2) Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric: string or callable, default 'minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. 
- outlier_label : int, optional (default = None) + outlier_label: int, optional (default = None) Label, which is given for outlier samples (samples with no neighbors on given radius). If set to None, ValueError is raised, when outlier is detected. - metric_params : dict, optional (default = None) + metric_params: dict, optional (default = None) Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. @@ -404,14 +410,16 @@ def __init__(self, radius=1.0, weights='uniform', def predict(self, X): """Predict the class labels for the provided data + Parameters ---------- - X : array-like, shape (n_query, n_features), \ + X: array-like, shape (n_query, n_features), \ or (n_query, n_indexed) if metric == 'precomputed' Test samples. + Returns ------- - y : array of shape [n_samples] or [n_samples, n_outputs] + y: array of shape [n_samples] or [n_samples, n_outputs] Class labels for each data sample. """ X = check_array(X, accept_sparse='csr') diff --git a/skhubness/neighbors/graph.py b/skhubness/neighbors/graph.py index beb263e..8325835 100644 --- a/skhubness/neighbors/graph.py +++ b/skhubness/neighbors/graph.py @@ -41,28 +41,29 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', include_self=False, n_jobs=None): """Computes the (weighted) graph of k-Neighbors for points in X - Read more in the `scikit-learn User Guide - `_. + Read more in the + `scikit-learn User Guide `_ Parameters ---------- - X : array-like or BallTree, shape = [n_samples, n_features] + X: array-like or BallTree, shape = [n_samples, n_features] Sample data, in the form of a numpy array or a precomputed :class:`BallTree`. - n_neighbors : int + n_neighbors: int Number of neighbors for each sample. - mode : {'connectivity', 'distance'}, optional + mode: {'connectivity', 'distance'}, optional Type of returned matrix: 'connectivity' will return the connectivity matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm: {'auto', 'hnsw', 'lsh', 'falconn_lsh', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'lsh' will use :class:`PuffinnLSH` + - 'falconn_lsh' will use :class:`FalconnLSH` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. @@ -72,7 +73,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -80,12 +81,13 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. 
- # TODO add all supported hubness reduction methods - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm + - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` - 'dis_sim_local' or 'dsl' will use :class:`DisSimLocal` + If None, no hubness reduction will be performed (=vanilla kNN). hubness_params: dict, optional @@ -94,26 +96,26 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', a mutual proximity variant is used, which models distance distributions with independent Gaussians. - metric : string, default 'minkowski' + metric: string, default 'minkowski' The distance metric used to calculate the k-Neighbors for each sample point. The DistanceMetric class gives a list of available metrics. The default distance is 'euclidean' ('minkowski' metric with the p param equal to 2.) - p : int, default 2 + p: int, default 2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional + metric_params: dict, optional additional keyword arguments for the metric function. - include_self : bool, default=False. + include_self: bool, default=False. Whether or not to mark each sample as the first nearest neighbor to itself. If `None`, then True is used for mode='connectivity' and False for mode='distance' as this will preserve backwards compatibility. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. @@ -121,7 +123,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A: sparse matrix in CSR format, shape = [n_samples, n_samples] A[i, j] is assigned the weight of edge that connects i to j. Examples @@ -165,28 +167,28 @@ def radius_neighbors_graph(X, radius, mode='connectivity', Neighborhoods are restricted to the points at a distance lower than radius. - Read more in the `scikit-learn User Guide - `_. + Read more in the + `scikit-learn User Guide `_ Parameters ---------- - X : array-like or BallTree, shape = [n_samples, n_features] + X: array-like or BallTree, shape = [n_samples, n_features] Sample data, in the form of a numpy array or a precomputed :class:`BallTree`. - radius : float + radius: float Radius of neighborhoods. - mode : {'connectivity', 'distance'}, optional + mode: {'connectivity', 'distance'}, optional Type of returned matrix: 'connectivity' will return the connectivity matrix with ones and zeros, and 'distance' will return the distances between neighbors according to the given metric. - algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm: {'auto', 'hnsw', 'falconn_lsh', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'falconn_lsh' will use :class:`FalconnLSH` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search.
@@ -196,7 +198,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -204,12 +206,13 @@ def radius_neighbors_graph(X, radius, mode='connectivity', with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - # TODO add all supported hubness reduction methods - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm + - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` - 'dis_sim_local' or 'dsl' will use :class:`DisSimLocal` + If None, no hubness reduction will be performed (=vanilla kNN). hubness_params: dict, optional @@ -218,26 +221,26 @@ def radius_neighbors_graph(X, radius, mode='connectivity', a mutual proximity variant is used, which models distance distributions with independent Gaussians. - metric : string, default 'minkowski' + metric: string, default 'minkowski' The distance metric used to calculate the neighbors within a given radius for each sample point. The DistanceMetric class gives a list of available metrics. The default distance is 'euclidean' ('minkowski' metric with the p param equal to 2.) - p : int, default 2 + p: int, default 2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional + metric_params: dict, optional additional keyword arguments for the metric function. - include_self : bool, default=False + include_self: bool, default=False Whether or not to mark each sample as the first nearest neighbor to itself. If `None`, then True is used for mode='connectivity' and False for mode='distance' as this will preserve backwards compatibility. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See @@ -246,7 +249,7 @@ def radius_neighbors_graph(X, radius, mode='connectivity', Returns ------- - A : sparse matrix in CSR format, shape = [n_samples, n_samples] + A: sparse matrix in CSR format, shape = [n_samples, n_samples] A[i, j] is assigned the weight of edge that connects i to j. Examples diff --git a/skhubness/neighbors/hnsw.py b/skhubness/neighbors/hnsw.py index 0d532d2..ee3c41d 100644 --- a/skhubness/neighbors/hnsw.py +++ b/skhubness/neighbors/hnsw.py @@ -3,16 +3,44 @@ # PEP 563: Postponed Evaluation of Annotations from __future__ import annotations - +from typing import Tuple, Union import numpy as np from sklearn.utils.validation import check_is_fitted, check_array import nmslib from .approximate_neighbors import ApproximateNearestNeighbor +from ..utils.check import check_n_candidates __all__ = ['HNSW'] class HNSW(ApproximateNearestNeighbor): + """Wrapper for using nmslib + + Hierarchical navigable small-world graphs are data structures + that allow for approximate nearest neighbor search.
+ Here, an implementation from nmslib is used. + + Parameters + ---------- + n_candidates: int, default = 5 + Number of neighbors to retrieve + metric: str, default = 'euclidean' + Distance metric, allowed are 'euclidean', 'squared_euclidean', 'cosine' + (including the aliases listed in `valid_metrics` below) + method: str, default = 'hnsw' + ANN method to use. Currently, only 'hnsw' is supported. + post_processing: int, default = 2 + More post-processing means longer index creation + and higher retrieval accuracy. + n_jobs: int, default = 1 + Number of parallel jobs + verbose: int, default = 0 + Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. + + Attributes + ---------- + valid_metrics: + List of valid distance metrics/measures + """ valid_metrics = ['euclidean', 'l2', 'minkowski', 'squared_euclidean', 'sqeuclidean', 'cosine', 'cosinesimil'] @@ -28,7 +56,20 @@ def __init__(self, n_candidates: int = 5, metric: str = 'euclidean', self.space = None def fit(self, X, y=None) -> HNSW: - """ Setup the HNSW index.""" + """ Set up the HNSW index from training data. + + Parameters + ---------- + X: np.array + Data to be indexed + y: any + Ignored + + Returns + ------- + self: HNSW + An instance of HNSW with a built graph + """ X = check_array(X) method = self.method @@ -56,7 +97,22 @@ def fit(self, X, y=None) -> HNSW: return self - def kneighbors(self, X: np.ndarray = None, n_candidates: int = None, return_distance: bool = True): + def kneighbors(self, X: np.ndarray = None, + n_candidates: int = None, + return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve k nearest neighbors. + + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + n_candidates: int or None, optional, default = None + Number of neighbors to retrieve. + If None, use the value passed during construction. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices. + """ check_is_fitted(self, ["index_", ]) if X is None: @@ -65,11 +121,7 @@ def kneighbors(self, X: np.ndarray = None, n_candidates: int = None, return_dist # Check the n_neighbors parameter if n_candidates is None: n_candidates = self.n_candidates - elif n_candidates <= 0: - raise ValueError(f"Expected n_neighbors > 0. Got {n_candidates:d}") - else: - if not np.issubdtype(type(n_candidates), np.integer): - raise TypeError(f"n_neighbors does not take {type(n_candidates)} value, enter integer value") + n_candidates = check_n_candidates(n_candidates) # Fetch the neighbor candidates neigh_ind_dist = self.index_.knnQueryBatch(X, diff --git a/skhubness/neighbors/lof.py b/skhubness/neighbors/lof.py index 0b26947..81ac494 100644 --- a/skhubness/neighbors/lof.py +++ b/skhubness/neighbors/lof.py @@ -42,16 +42,21 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, If n_neighbors is larger than the number of samples provided, all samples will be used.
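A short sketch of the HNSW wrapper's fit/kneighbors cycle documented in the hnsw.py hunk above (toy data; `nmslib` must be installed):

```python
import numpy as np
from skhubness.neighbors import HNSW

X = np.random.RandomState(42).rand(100, 16)
ann = HNSW(n_candidates=10, metric='euclidean')
ann.fit(X)
# With X=None, the query runs against the indexed objects themselves
dist, ind = ann.kneighbors(n_candidates=5, return_distance=True)
```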
- algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'hnsw', 'lsh', 'falconn_lsh', 'onng', 'rptree', + 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'lsh' will use :class:`PuffinnLSH` + - 'falconn_lsh' will use :class:`FalconnLSH` + - 'onng' will use :class:`ONNG` + - 'rptree' will use :class:`RandomProjectionTree` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. + - 'auto' will attempt to decide the most appropriate exact algorithm + based on the values passed to :meth:`fit` method. This will not + select an approximate nearest neighbor algorithm. Note: fitting on sparse input will override the setting of this parameter, using brute force. @@ -66,7 +71,6 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm - # TODO add all supported hubness reduction methods - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` @@ -80,13 +84,13 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, a mutual proximity variant is used, which models distance distributions with independent Gaussians. - leaf_size : int, optional (default=30) + leaf_size: int, optional (default=30) Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - metric : string or callable, default 'minkowski' + metric: string or callable, default 'minkowski' metric used for the distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -114,16 +118,16 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, metrics: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html - p : integer, optional (default=2) + p: integer, optional (default=2) Parameter for the Minkowski metric from :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional (default=None) + metric_params: dict, optional (default=None) Additional keyword arguments for the metric function. - contamination : 'auto' or float, optional (default='auto') + contamination: 'auto' or float, optional (default='auto') The amount of contamination of the data set, i.e. the proportion of outliers in the data set. When fitting this is used to define the threshold on the scores of the samples. @@ -136,14 +140,14 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, The default value of ``contamination`` changed from 0.1 to ``'auto'``. - novelty : boolean, default False + novelty: boolean, default False By default, LocalOutlierFactor is only meant to be used for outlier detection (novelty=False). Set novelty to True if you want to use LocalOutlierFactor for novelty detection. 
In this case be aware that that you should only use predict, decision_function and score_samples on new unseen data and not on the training set. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. @@ -153,7 +157,7 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, Attributes ---------- - negative_outlier_factor_ : numpy array, shape (n_samples,) + negative_outlier_factor_: numpy array, shape (n_samples,) The opposite LOF of the training samples. The higher, the more normal. Inliers tend to have a LOF score close to 1 (``negative_outlier_factor_`` close to -1), while outliers tend to have a larger LOF score. @@ -163,10 +167,10 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, It is the average of the ratio of the local reachability density of a sample and those of its k-nearest neighbors. - n_neighbors_ : integer + n_neighbors_: integer The actual number of neighbors used for :meth:`kneighbors` queries. - offset_ : float + offset_: float Offset used to obtain binary labels from the raw scores. Observations having a negative_outlier_factor smaller than `offset_` are detected as abnormal. @@ -205,16 +209,16 @@ def fit_predict(self): Parameters ---------- - X : array-like, shape (n_samples, n_features), default=None + X: array-like, shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. - y : Ignored + y: Ignored not used, present for API consistency by convention. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier: array, shape (n_samples,) Returns -1 for anomalies/outliers and 1 for inliers. """ @@ -236,13 +240,13 @@ def _fit_predict(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features), default=None + X: array-like, shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier: array, shape (n_samples,) Returns -1 for anomalies/outliers and 1 for inliers. """ @@ -256,16 +260,16 @@ def fit(self, X, y=None) -> LocalOutlierFactor: Parameters ---------- - X : {array-like, sparse matrix, BallTree, KDTree} + X: {array-like, sparse matrix, BallTree, KDTree} Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. - y : Ignored + y: Ignored not used, present for API consistency by convention. Returns ------- - self : object + self: object """ if self.contamination != 'auto': if not(0. < self.contamination <= .5): @@ -313,13 +317,13 @@ def predict(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X: array-like, shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. to the training samples. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier: array, shape (n_samples,) Returns -1 for anomalies/outliers and +1 for inliers. """ if not self.novelty: @@ -338,14 +342,14 @@ def _predict(self, X=None): Parameters ---------- - X : array-like, shape (n_samples, n_features), default=None + X: array-like, shape (n_samples, n_features), default=None The query sample or samples to compute the Local Outlier Factor w.r.t. 
to the training samples. If None, makes prediction on the training data without considering them as their own neighbors. Returns ------- - is_inlier : array, shape (n_samples,) + is_inlier: array, shape (n_samples,) Returns -1 for anomalies/outliers and +1 for inliers. """ check_is_fitted(self, ["offset_", "negative_outlier_factor_", @@ -376,13 +380,13 @@ def decision_function(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X: array-like, shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - shifted_opposite_lof_scores : array, shape (n_samples,) + shifted_opposite_lof_scores: array, shape (n_samples,) The shifted opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers. @@ -412,13 +416,13 @@ def _decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X: array-like, shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - shifted_opposite_lof_scores : array, shape (n_samples,) + shifted_opposite_lof_scores: array, shape (n_samples,) The shifted opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers. @@ -443,13 +447,13 @@ def score_samples(self): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X: array-like, shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - opposite_lof_scores : array, shape (n_samples,) + opposite_lof_scores: array, shape (n_samples,) The opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. """ @@ -479,13 +483,13 @@ def _score_samples(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X: array-like, shape (n_samples, n_features) The query sample or samples to compute the Local Outlier Factor w.r.t. the training samples. Returns ------- - opposite_lof_scores : array, shape (n_samples,) + opposite_lof_scores: array, shape (n_samples,) The opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. """ @@ -512,17 +516,17 @@ def _local_reachability_density(self, distances_X, neighbors_indices): Parameters ---------- - distances_X : array, shape (n_query, self.n_neighbors) + distances_X: array, shape (n_query, self.n_neighbors) Distances to the neighbors (in the training samples `self._fit_X`) of each query point to compute the LRD. - neighbors_indices : array, shape (n_query, self.n_neighbors) + neighbors_indices: array, shape (n_query, self.n_neighbors) Neighbors indices (of each query point) among training samples self._fit_X. Returns ------- - local_reachability_density : array, shape (n_samples,) + local_reachability_density: array, shape (n_samples,) The local reachability density of each sample. 
""" dist_k = self._distances_fit_X_[neighbors_indices, diff --git a/skhubness/neighbors/lsh.py b/skhubness/neighbors/lsh.py index fc66cd7..f18ceb7 100644 --- a/skhubness/neighbors/lsh.py +++ b/skhubness/neighbors/lsh.py @@ -5,30 +5,278 @@ from __future__ import annotations from functools import partial +import logging +import sys +from typing import Tuple, Union import warnings import numpy as np -from sklearn.metrics import euclidean_distances +from sklearn.base import BaseEstimator +from sklearn.metrics import euclidean_distances, pairwise_distances from sklearn.metrics.pairwise import cosine_distances -from sklearn.utils.validation import check_is_fitted, check_array -import falconn +from sklearn.utils.validation import check_is_fitted, check_array, check_X_y +try: + import puffinn +except ImportError: + logging.warning("The package 'puffinn' is not available.") # pragma: no cover +try: + import falconn +except ImportError: + logging.warning("The package 'falconn' is not available.") # pragma: no cover from tqdm.auto import tqdm - from .approximate_neighbors import ApproximateNearestNeighbor -__all__ = ['LSH'] +from ..utils.check import check_n_candidates + +__all__ = ['FalconnLSH', 'PuffinnLSH', ] + + +class PuffinnLSH(BaseEstimator, ApproximateNearestNeighbor): + """ Wrap Puffinn LSH for scikit-learn compatibility. + + Parameters + ---------- + n_candidates: int, default = 5 + Number of neighbors to retrieve + metric: str, default = 'euclidean' + Distance metric, allowed are "angular", "jaccard". + Other metrics are partially supported, such as 'euclidean', 'sqeuclidean'. + In these cases, 'angular' distances are used to find the candidate set + of neighbors with LSH among all indexed objects, and (squared) Euclidean + distances are subsequently only computed for the candidates. + memory: int, default = 1GB + Max memory usage + recall: float, default = 0.90 + Probability of finding the true nearest neighbors among the candidates + n_jobs: int, default = 1 + Number of parallel jobs + verbose: int, default = 0 + Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. + + Attributes + ---------- + valid_metrics: + List of valid distance metrics/measures + """ + valid_metrics = ["angular", "cosine", "euclidean", "sqeuclidean", "minkowski", + "jaccard", + ] + metric_map = {'euclidean': 'angular', + 'sqeuclidean': 'angular', + 'minkowski': 'angular', + 'cosine': 'angular', + } + + def __init__(self, n_candidates: int = 5, + metric: str = 'euclidean', + memory: int = 1024**3, + recall: float = 0.9, + n_jobs: int = 1, + verbose: int = 0, + ): + super().__init__(n_candidates=n_candidates, + metric=metric, + n_jobs=n_jobs, + verbose=verbose, + ) + self.memory = memory + self.recall = recall + + def fit(self, X, y=None) -> PuffinnLSH: + """ Build the puffinn LSH index and insert data from X. + + Parameters + ---------- + X: np.array + Data to be indexed + y: any + Ignored + + Returns + ------- + self: Puffinn + An instance of Puffinn with a built index + """ + if y is None: + X = check_array(X) + else: + X, y = check_X_y(X, y) + self.y_train_ = y + + if self.metric not in self.valid_metrics: + warnings.warn(f'Invalid metric "{self.metric}". 
Using "euclidean" instead') + self.metric = 'euclidean' + try: + self.effective_metric = self.metric_map[self.metric] + except KeyError: + self.effective_metric = self.metric + + # Reduce default memory consumption for unit tests + if "pytest" in sys.modules: + self.memory = 3*1024**2 + + # Construct the index + index = puffinn.Index(self.effective_metric, + X.shape[1], + self.memory, + ) + + if self.verbose: + iter_X = tqdm(X, desc='Indexing', total=len(X)) + else: + iter_X = X + for v in iter_X: + index.insert(v.tolist()) + index.rebuild(num_threads=self.n_jobs) + self.index_ = index + self.X_train_ = X # remove, once we can retrieve vectors from the index itself + + return self + + def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve k nearest neighbors. + + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + n_candidates: int or None, optional, default = None + Number of neighbors to retrieve. + If None, use the value passed during construction. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices. + """ + check_is_fitted(self, 'index_') + + if n_candidates is None: + n_candidates = self.n_candidates + n_candidates = check_n_candidates(n_candidates) + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. + if X is None: + X = self.X_train_ + n_neighbors = n_candidates + 1 + start = 1 + else: + X = check_array(X) + n_neighbors = n_candidates + start = 0 -class LSH(ApproximateNearestNeighbor): + n_test = X.shape[0] + dtype = X.dtype + + # If chosen metric is not among the natively support ones, reorder the neighbors + reorder = True if self.metric not in ('angular', 'cosine', 'jaccard') else False + + # If fewer candidates than required are found for a query, + # we save index=-1 and distance=NaN + neigh_ind = -np.ones((n_test, n_candidates), + dtype=np.int32) + if return_distance or reorder: + neigh_dist = np.empty_like(neigh_ind, + dtype=dtype) * np.nan + metric = 'cosine' if self.metric == 'angular' else self.metric + + index = self.index_ + + if self.verbose: + enumerate_X = tqdm(enumerate(X), + desc='Querying', + total=n_test, + ) + else: + enumerate_X = enumerate(X) + for i, x in enumerate_X: + # Find the approximate nearest neighbors. + # Each of the true `n_candidates` nearest neighbors + # has at least `recall` chance of being found. + ind = index.search(x.tolist(), + n_neighbors, + self.recall, + ) + + ind = ind[start:] + neigh_ind[i, :len(ind)] = ind + if return_distance or reorder: + neigh_dist[i, :len(ind)] = pairwise_distances(x.reshape(1, -1), + self.X_train_[ind], + metric=metric, + ) + + if reorder: + sort = np.argsort(neigh_dist, axis=1) + neigh_dist = np.take_along_axis(neigh_dist, sort, axis=1) + neigh_ind = np.take_along_axis(neigh_ind, sort, axis=1) + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind + + +class FalconnLSH(ApproximateNearestNeighbor): + """Wrapper for using falconn LSH + + Falconn is an approximate nearest neighbor library, + that uses multiprobe locality-sensitive hashing. + + Parameters + ---------- + n_candidates: int, default = 5 + Number of neighbors to retrieve + radius: float or None, optional, default = None + Retrieve neighbors within this radius. + Can be negative: See Notes. 
+ metric: str, default = 'euclidean' + Distance metric, allowed are 'euclidean', 'squared_euclidean', 'cosine', + and 'neg_inner' (see `valid_metrics` below for the accepted aliases) + num_probes: int, default = 50 + The number of buckets the query algorithm probes. + The more probes, the higher the accuracy, + but also the slower the queries. + n_jobs: int, default = 1 + Number of parallel jobs + verbose: int, default = 0 + Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. + + Attributes + ---------- + valid_metrics: + List of valid distance metrics/measures + + Notes + ----- + From the falconn docs: radius can be negative, and for the distance function + 'negative_inner_product' it actually makes sense. + """ valid_metrics = ['euclidean', 'l2', 'minkowski', 'squared_euclidean', 'sqeuclidean', 'cosine', 'neg_inner', 'NegativeInnerProduct'] def __init__(self, n_candidates: int = 5, radius: float = 1., metric: str = 'euclidean', num_probes: int = 50, n_jobs: int = 1, verbose: int = 0): - super().__init__(n_candidates=n_candidates, metric=metric, n_jobs=n_jobs, verbose=verbose) + super().__init__(n_candidates=n_candidates, + metric=metric, + n_jobs=n_jobs, + verbose=verbose, + ) self.num_probes = num_probes self.radius = radius - def fit(self, X: np.ndarray, y: np.ndarray = None) -> LSH: - """ Setup the LSH index from training data. """ + def fit(self, X: np.ndarray, y: np.ndarray = None) -> FalconnLSH: + """ Set up the LSH index from training data. + + Parameters + ---------- + X: np.array + Data to be indexed + y: any + Ignored + + Returns + ------- + self: FalconnLSH + An instance of FalconnLSH with a built index + """ X = check_array(X, dtype=[np.float32, np.float64]) if self.metric in ['euclidean', 'l2', 'minkowski']: @@ -57,7 +305,22 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> LSH: return self - def kneighbors(self, X: np.ndarray = None, n_candidates: int = None, return_distance: bool = True): + def kneighbors(self, X: np.ndarray = None, + n_candidates: int = None, + return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve k nearest neighbors. + + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + n_candidates: int or None, optional, default = None + Number of neighbors to retrieve. + If None, use the value passed during construction. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices. + """ check_is_fitted(self, ["index_", 'X_train_']) # Check the n_neighbors parameter @@ -129,8 +392,21 @@ def kneighbors(self, X: np.ndarray = None, n_candidates: int = None, return_dist else: return neigh_ind - def radius_neighbors(self, X: np.ndarray = None, radius: float = None, return_distance: bool = True): - """ TODO add docstring + def radius_neighbors(self, X: np.ndarray = None, + radius: float = None, + return_distance: bool = True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve neighbors within a certain radius. + + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + radius: float or None, optional, default = None + Retrieve neighbors within this radius. + Can be negative: See Notes. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices.
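Both LSH wrappers above share the fit/kneighbors interface; only the falconn backend additionally offers radius queries. A minimal sketch (toy data; assumes the respective libraries are installed, with puffinn being unavailable on Windows and macOS in this CI matrix):

```python
import numpy as np
from skhubness.neighbors import FalconnLSH, PuffinnLSH

X = np.random.RandomState(0).rand(200, 8).astype(np.float32)

lsh = FalconnLSH(n_candidates=5, metric='euclidean', num_probes=50)
lsh.fit(X)
dist, ind = lsh.kneighbors(X[:3])
# Radius queries are only supported by the falconn backend:
rad_dist, rad_ind = lsh.radius_neighbors(X[:3], radius=1.0)

puffinn_lsh = PuffinnLSH(n_candidates=5, metric='euclidean')
puffinn_lsh.fit(X)
p_dist, p_ind = puffinn_lsh.kneighbors(X[:3])
```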
Notes ----- diff --git a/skhubness/neighbors/onng.py b/skhubness/neighbors/onng.py new file mode 100644 index 0000000..83e9cb0 --- /dev/null +++ b/skhubness/neighbors/onng.py @@ -0,0 +1,254 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: BSD-3-Clause +# Author: Roman Feldbauer (adaptations for scikit-hubness) +# PEP 563: Postponed Evaluation of Annotations +from __future__ import annotations +import logging +from typing import Union, Tuple +try: + import ngtpy +except (ImportError, ModuleNotFoundError): + logging.warning("The package 'ngtpy' is not available.") # pragma: no cover +import numpy as np +from sklearn.base import BaseEstimator +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y +from tqdm.auto import tqdm +from .approximate_neighbors import ApproximateNearestNeighbor +from ..utils.check import check_n_candidates +from ..utils.io import create_tempfile_preferably_in_dir + +__all__ = ['ONNG', + ] + + +class ONNG(BaseEstimator, ApproximateNearestNeighbor): + """Wrapper for ngtpy and ONNG + + Parameters + ---------- + n_candidates: int, default = 5 + Number of neighbors to retrieve + metric: str, default = 'euclidean' + Distance metric, allowed are 'manhattan', 'L1', 'euclidean', 'L2', 'minkowski', + 'Angle', 'Normalized Angle', 'Hamming', 'Jaccard', 'Cosine' or 'Normalized Cosine'. + index_dir: str, default = 'auto' + Store the index in the given directory. + If None, keep the index in main memory (NON pickleable index); + if index_dir is a string, it is interpreted as a directory to store the index into; + if 'auto', create a temp dir for the index, preferably in /dev/shm on Linux. + Note: The directory/the index will NOT be deleted automatically. + edge_size_for_creation: int, default = 40 + Number of edges created per node during graph construction. + Larger values typically increase accuracy at the cost of index size and build time. + edge_size_for_search: int, default = 10 + Number of edges followed during search. + Larger values typically increase accuracy at the cost of query time. + n_jobs: int, default = 1 + Number of parallel jobs + verbose: int, default = 0 + Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. + + Attributes + ---------- + valid_metrics: + List of valid distance metrics/measures + + Notes + ----- + ONNG stores the index to a directory specified in `index_dir`. + The index is persistent, and will NOT be deleted automatically. + It is the user's responsibility to take care of deletion + when required. + """ + valid_metrics = ['manhattan', 'L1', 'euclidean', 'L2', 'minkowski', 'sqeuclidean', + 'Angle', 'Normalized Angle', 'Cosine', 'Normalized Cosine', 'Hamming', 'Jaccard'] + internal_distance_type = {'manhattan': 'L1', + 'euclidean': 'L2', + 'minkowski': 'L2', + 'sqeuclidean': 'L2', + } + + def __init__(self, n_candidates: int = 5, + metric: str = 'euclidean', + index_dir: str = 'auto', + edge_size_for_creation: int = 40, + edge_size_for_search: int = 10, + n_jobs: int = 1, + verbose: int = 0): + super().__init__(n_candidates=n_candidates, + metric=metric, + n_jobs=n_jobs, + verbose=verbose, + ) + self.index_dir = index_dir + self.edge_size_for_creation = edge_size_for_creation + self.edge_size_for_search = edge_size_for_search + + def fit(self, X, y=None) -> ONNG: + """ Build the ngtpy.Index and insert data from X.
+ + Parameters + ---------- + X: np.array + Data to be indexed + y: any + Ignored + + Returns + ------- + self: ONNG + An instance of ONNG with a built index + """ + if y is None: + X = check_array(X) + else: + X, y = check_X_y(X, y) + self.y_train_ = y + + self.n_samples_train_ = X.shape[0] + self.n_features_ = X.shape[1] + self.X_dtype_ = X.dtype + + # Map common distance names to names used by ngt + try: + self.effective_metric_ = ONNG.internal_distance_type[self.metric] + except KeyError: + self.effective_metric_ = self.metric + if self.effective_metric_ not in ONNG.valid_metrics: + raise ValueError(f'Unknown distance/similarity measure: {self.effective_metric_}. ' + f'Please use one of: {ONNG.valid_metrics}.') + + # Set up a directory to save the index to + if self.index_dir in ['auto']: + index_path = create_tempfile_preferably_in_dir(prefix='skhubness_', + suffix='.onng', + directory='/dev/shm') + logging.warning(f'The index will be stored in {index_path}. ' + f'It will NOT be deleted automatically, when this instance is destructed.') + elif isinstance(self.index_dir, str): + index_path = create_tempfile_preferably_in_dir(prefix='skhubness_', + suffix='.onng', + directory=self.index_dir) + elif self.index_dir is None: + index_path = create_tempfile_preferably_in_dir(prefix='skhubness_', + suffix='.onng') + else: + raise TypeError(f'ONNG requires to write an index to the filesystem. ' + f'Please provide a valid path with parameter `index_dir`.') + + # Create the ONNG index, insert data + # TODO add ngt optimizer + ngtpy.create(path=index_path, + dimension=self.n_features_, + edge_size_for_creation=self.edge_size_for_creation, + edge_size_for_search=self.edge_size_for_search, + distance_type=self.effective_metric_, + ) + index_obj = ngtpy.Index(index_path) + index_obj.batch_insert(X, num_threads=self.n_jobs) + + # Keep index in memory or store in path + if self.index_dir is None: + self.index_ = index_obj + else: + index_obj.save() + self.index_ = index_path + + return self + + def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve k nearest neighbors. + + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + n_candidates: int or None, optional, default = None + Number of neighbors to retrieve. + If None, use the value passed during construction. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices. + """ + check_is_fitted(self, 'index_') + if X is not None: + X = check_array(X) + + n_test = self.n_samples_train_ if X is None else X.shape[0] + dtype = self.X_dtype_ if X is None else X.dtype + + if n_candidates is None: + n_candidates = self.n_candidates + n_candidates = check_n_candidates(n_candidates) + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. 
+ if X is None: + n_neighbors = n_candidates + 1 + start = 1 + else: + n_neighbors = n_candidates + start = 0 + + # If fewer candidates than required are found for a query, + # we save index=-1 and distance=NaN + neigh_ind = -np.ones((n_test, n_candidates), + dtype=np.int32) + if return_distance: + neigh_dist = np.empty_like(neigh_ind, + dtype=dtype) * np.nan + + if isinstance(self.index_, str): + index = ngtpy.Index(self.index_) + else: + index = self.index_ + + if X is None: + if self.verbose: + query_ind = tqdm(range(n_test), + desc='Query ONNG', + total=n_test, + ) + else: + query_ind = range(n_test) + for i in query_ind: + query = index.get_object(i) + response = index.search(query=query, + size=n_neighbors, + with_distance=return_distance, + ) + if return_distance: + ind, dist = [np.array(arr) for arr in zip(*response)] + else: + ind = response + ind = ind[start:] + neigh_ind[i, :len(ind)] = ind + if return_distance: + dist = dist[start:] + neigh_dist[i, :len(dist)] = dist + else: # if X was provided + if self.verbose: + enumerate_X = tqdm(enumerate(X), + desc='Query ONNG', + total=n_test, + ) + else: + enumerate_X = enumerate(X) + for i, x in enumerate_X: + response = index.search(query=x, + size=n_neighbors, + with_distance=return_distance, + ) + if return_distance: + ind, dist = [np.array(arr) for arr in zip(*response)] + else: + ind = response + ind = ind[start:] + neigh_ind[i, :len(ind)] = ind + if return_distance: + dist = dist[start:] + neigh_dist[i, :len(dist)] = dist + + if return_distance and self.metric == 'sqeuclidean': + neigh_dist **= 2 + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind diff --git a/skhubness/neighbors/random_projection_trees.py b/skhubness/neighbors/random_projection_trees.py new file mode 100644 index 0000000..8343334 --- /dev/null +++ b/skhubness/neighbors/random_projection_trees.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: BSD-3-Clause +# Author: Tom Dupre la Tour (original work) +# Roman Feldbauer (adaptations for scikit-hubness) +# PEP 563: Postponed Evaluation of Annotations +from __future__ import annotations + +import logging +from typing import Union, Tuple + +try: + import annoy +except ImportError: + logging.warning("The package 'annoy' is not available.") # pragma: no cover + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y +from tqdm.auto import tqdm +from .approximate_neighbors import ApproximateNearestNeighbor +from ..utils.check import check_n_candidates +from ..utils.io import create_tempfile_preferably_in_dir + +__all__ = ['RandomProjectionTree', + ] + + +class RandomProjectionTree(BaseEstimator, ApproximateNearestNeighbor): + """Wrapper for using annoy.AnnoyIndex + + Annoy is an approximate nearest neighbor library + that builds a forest of random projection trees. + + Parameters + ---------- + n_candidates: int, default = 5 + Number of neighbors to retrieve + metric: str, default = 'euclidean' + Distance metric, allowed are "angular", "euclidean", "manhattan", "hamming", "dot" + n_trees: int, default = 10 + Build a forest of n_trees trees. More trees give higher precision when querying, + but they are more expensive in terms of build time and index size. + search_k: int, default = -1 + Query will inspect search_k nodes. A larger value will give more accurate results, + but will take more time.
+ mmap_dir: str, default = 'auto' + Memory-map the index to the given directory. + This is required to make the class pickleable. + If None, keep everything in main memory (NON pickleable index); + if mmap_dir is a string, it is interpreted as a directory to store the index into; + if 'auto', create a temp dir for the index, + preferably in /dev/shm on Linux. + n_jobs: int, default = 1 + Number of parallel jobs + verbose: int, default = 0 + Verbosity level. If verbose > 0, show tqdm progress bar on indexing and querying. + + Attributes + ---------- + valid_metrics: + List of valid distance metrics/measures + """ + valid_metrics = ["angular", "euclidean", "manhattan", "hamming", "dot", "minkowski"] + + def __init__(self, n_candidates: int = 5, metric: str = 'euclidean', + n_trees: int = 10, search_k: int = -1, mmap_dir: str = 'auto', + n_jobs: int = 1, verbose: int = 0): + super().__init__(n_candidates=n_candidates, + metric=metric, + n_jobs=n_jobs, + verbose=verbose, + ) + self.n_trees = n_trees + self.search_k = search_k + self.mmap_dir = mmap_dir + + def fit(self, X, y=None) -> RandomProjectionTree: + """ Build the annoy.Index and insert data from X. + + Parameters + ---------- + X: np.array + Data to be indexed + y: any + Ignored + + Returns + ------- + self: RandomProjectionTree + An instance of RandomProjectionTree with a built index + """ + if y is None: + X = check_array(X) + else: + X, y = check_X_y(X, y) + self.y_train_ = y + + self.n_samples_fit_ = X.shape[0] + self.n_features_ = X.shape[1] + self.X_dtype_ = X.dtype + if self.metric == 'minkowski': # for compatibility + self.metric = 'euclidean' + metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean' + self.effective_metric_ = metric + annoy_index = annoy.AnnoyIndex(X.shape[1], metric=metric) + if self.mmap_dir == 'auto': + self.annoy_ = create_tempfile_preferably_in_dir(prefix='skhubness_', + suffix='.annoy', + directory='/dev/shm') + logging.warning(f'The index will be stored in {self.annoy_}. ' + f'It will NOT be deleted automatically, when this instance is destructed.') + elif isinstance(self.mmap_dir, str): + self.annoy_ = create_tempfile_preferably_in_dir(prefix='skhubness_', + suffix='.annoy', + directory=self.mmap_dir) + else: # e.g. None + self.mmap_dir = None + + if self.verbose: + enumerate_X = tqdm(enumerate(X), + desc='Build RPtree', + total=len(X), + ) + else: + enumerate_X = enumerate(X) + for i, x in enumerate_X: + annoy_index.add_item(i, x.tolist()) + annoy_index.build(self.n_trees) + + if self.mmap_dir is None: + self.annoy_ = annoy_index + else: + annoy_index.save(self.annoy_) + + return self + + def kneighbors(self, X=None, n_candidates=None, return_distance=True) -> Union[Tuple[np.array, np.array], np.array]: + """ Retrieve k nearest neighbors. + + Parameters + ---------- + X: np.array or None, optional, default = None + Query objects. If None, search among the indexed objects. + n_candidates: int or None, optional, default = None + Number of neighbors to retrieve. + If None, use the value passed during construction. + return_distance: bool, default = True + If return_distance, will return distances and indices to neighbors. + Else, only return the indices.
+ """ + check_is_fitted(self, 'annoy_') + if X is not None: + X = check_array(X) + + n_test = self.n_samples_fit_ if X is None else X.shape[0] + dtype = self.X_dtype_ if X is None else X.dtype + + if n_candidates is None: + n_candidates = self.n_candidates + n_candidates = check_n_candidates(n_candidates) + + # For compatibility reasons, as each sample is considered as its own + # neighbor, one extra neighbor will be computed. + if X is None: + n_neighbors = n_candidates + 1 + start = 1 + else: + n_neighbors = n_candidates + start = 0 + + # If fewer candidates than required are found for a query, + # we save index=-1 and distance=NaN + neigh_ind = -np.ones((n_test, n_candidates), + dtype=np.int32) + neigh_dist = np.empty_like(neigh_ind, + dtype=dtype) * np.nan + + # Load memory-mapped annoy.Index, unless it's already in main memory + if isinstance(self.annoy_, str): + annoy_index = annoy.AnnoyIndex(self.n_features_, metric=self.effective_metric_) + annoy_index.load(self.annoy_) + elif isinstance(self.annoy_, annoy.AnnoyIndex): + annoy_index = self.annoy_ + assert isinstance(annoy_index, annoy.AnnoyIndex), f'Internal error: unexpected type for annoy index' + + if X is None: + if self.verbose: + n_items = annoy_index.get_n_items() + query_ind = tqdm(range(n_items), + desc='Query RPtree', + total=n_items, + ) + else: + query_ind = range(annoy_index.get_n_items()) + for i in query_ind: + ind, dist = annoy_index.get_nns_by_item( + i, n_neighbors, self.search_k, + include_distances=True, + ) + ind = ind[start:] + dist = dist[start:] + neigh_ind[i, :len(ind)] = ind + neigh_dist[i, :len(dist)] = dist + else: # if X was provided + if self.verbose: + enumerate_X = tqdm(enumerate(X), + desc='Query RPtree', + total=len(X), + ) + else: + enumerate_X = enumerate(X) + for i, x in enumerate_X: + ind, dist = annoy_index.get_nns_by_vector( + x.tolist(), n_neighbors, self.search_k, + include_distances=True, + ) + ind = ind[start:] + dist = dist[start:] + neigh_ind[i, :len(ind)] = ind + neigh_dist[i, :len(dist)] = dist + + if self.metric == 'sqeuclidean': + neigh_dist **= 2 + + if return_distance: + return neigh_dist, neigh_ind + else: + return neigh_ind diff --git a/skhubness/neighbors/regression.py b/skhubness/neighbors/regression.py index bd4fde0..fbf3a4b 100644 --- a/skhubness/neighbors/regression.py +++ b/skhubness/neighbors/regression.py @@ -37,38 +37,43 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, Parameters ---------- - n_neighbors : int, optional (default = 5) + n_neighbors: int, optional (default = 5) Number of neighbors to use by default for :meth:`kneighbors` queries. - weights : str or callable + weights: str or callable weight function used in prediction. Possible values: - - 'uniform' : uniform weights. All points in each neighborhood + - 'uniform': uniform weights. All points in each neighborhood are weighted equally. - - 'distance' : weight points by the inverse of their distance. + - 'distance': weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - - [callable] : a user-defined function which accepts an + - [callable]: a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights. Uniform weights are used by default. 
- algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'hnsw', 'lsh', 'falconn_lsh', 'onng', 'rptree', + 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'lsh' will use :class:`PuffinnLSH` + - 'falconn_lsh' will use :class:`FalconnLSH` + - 'onng' will use :class:`ONNG` + - 'rptree' will use :class:`RandomProjectionTree` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. + - 'auto' will attempt to decide the most appropriate exact algorithm + based on the values passed to :meth:`fit` method. This will not + select an approximate nearest neighbor algorithm. Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -76,9 +81,8 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm - # TODO add all supported hubness reduction methods - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` @@ -92,27 +96,27 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, a mutual proximity variant is used, which models distance distributions with independent Gaussians. - leaf_size : int, optional (default = 30) + leaf_size: int, optional (default = 30) Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p: integer, optional (default = 2) Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric : string or callable, default 'minkowski' + metric: string or callable, default 'minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. - metric_params : dict, optional (default = None) + metric_params: dict, optional (default = None) Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See scikit-learn @@ -175,13 +179,13 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), \ + X: array-like, shape (n_query, n_features), \ or (n_query, n_indexed) if metric == 'precomputed' Test samples. 
Returns ------- - y : array of int, shape = [n_samples] or [n_samples, n_outputs] + y: array of int, shape = [n_samples] or [n_samples, n_outputs] Target values """ if issparse(X) and self.metric == 'precomputed': @@ -227,29 +231,28 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Parameters ---------- - radius : float, optional (default = 1.0) + radius: float, optional (default = 1.0) Range of parameter space to use by default for :meth:`radius_neighbors` queries. - weights : str or callable + weights: str or callable weight function used in prediction. Possible values: - - 'uniform' : uniform weights. All points in each neighborhood + - 'uniform': uniform weights. All points in each neighborhood are weighted equally. - - 'distance' : weight points by the inverse of their distance. + - 'distance': weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - - [callable] : a user-defined function which accepts an + - [callable]: a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights. Uniform weights are used by default. - algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm: {'auto', 'falconn_lsh', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'falconn_lsh' will use :class:`FalconnLSH` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. @@ -259,7 +262,7 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -267,9 +270,8 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm - # TODO add all supported hubness reduction methods - 'mutual_proximity' or 'mp' will use :class:`MutualProximity` - 'local_scaling' or 'ls' will use :class:`LocalScaling` @@ -283,27 +285,27 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, a mutual proximity variant is used, which models distance distributions with independent Gaussians. - leaf_size : int, optional (default = 30) + leaf_size: int, optional (default = 30) Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - p : integer, optional (default = 2) + p: integer, optional (default = 2) Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. 
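As with the classifiers, the regressors accept the ANN and hubness options documented above; a toy sketch (assumes `annoy` is installed for algorithm='rptree'; data and parameters are made up):

```python
import numpy as np
from skhubness.neighbors import KNeighborsRegressor

rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = X.sum(axis=1)
# Approximate neighbors via random projection trees, plus hubness reduction
reg = KNeighborsRegressor(n_neighbors=5,
                          algorithm='rptree',
                          hubness='mutual_proximity')
reg.fit(X, y)
y_pred = reg.predict(X[:5])
```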
- metric : string or callable, default 'minkowski' + metric: string or callable, default 'minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics. - metric_params : dict, optional (default = None) + metric_params: dict, optional (default = None) Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See scikit-learn @@ -358,12 +360,12 @@ def predict(self, X): Parameters ---------- - X : array-like, shape (n_query, n_features), or (n_query, n_indexed) if metric == 'precomputed' + X: array-like, shape (n_query, n_features), or (n_query, n_indexed) if metric == 'precomputed' Test samples. Returns ------- - y : array of float, shape = [n_samples] or [n_samples, n_outputs] + y: array of float, shape = [n_samples] or [n_samples, n_outputs] Target values """ X = check_array(X, accept_sparse='csr') diff --git a/skhubness/neighbors/tests/test_lof.py b/skhubness/neighbors/tests/test_lof.py index 61595f3..5596361 100644 --- a/skhubness/neighbors/tests/test_lof.py +++ b/skhubness/neighbors/tests/test_lof.py @@ -38,12 +38,20 @@ ) # lsh uses FALCONN, which does not support Windows -if sys.platform == 'win32': - APPROXIMATE_ALGORITHMS = ('hnsw', # only on win32 +if sys.platform == 'win32': # pragma: no cover + APPROXIMATE_ALGORITHMS = ('hnsw', # pragma: no cover + 'rptree', + ) +elif sys.platform == 'darwin': + APPROXIMATE_ALGORITHMS = ('falconn_lsh', + 'hnsw', + 'rptree', ) else: APPROXIMATE_ALGORITHMS = ('lsh', + 'falconn_lsh', 'hnsw', + 'rptree', ) HUBNESS_ALGORITHMS = ('mp', 'ls', @@ -68,6 +76,9 @@ def test_lof(algorithm): score = clf.fit(X).negative_outlier_factor_ assert_array_equal(clf._fit_X, X) + if algorithm in ['lsh']: + pytest.xfail(f'puffinn is known to fail this test.') + # Assert largest outlier score is smaller than smallest inlier score: assert np.min(score[:-2]) > np.max(score[-2:]) @@ -118,6 +129,10 @@ def test_lof_values(algorithm): # check predict() assert_array_almost_equal(-clf1.negative_outlier_factor_, [s_0, s_1, s_1]) assert_array_almost_equal(-clf2.negative_outlier_factor_, [s_0, s_1, s_1]) + + if algorithm in ['lsh']: + pytest.xfail(f'puffinn is known to fail this test...') + # check predict(one sample not in train) assert_array_almost_equal(-clf1.score_samples([[2., 2.]]), [s_0]) assert_array_almost_equal(-clf2.score_samples([[2., 2.]]), [s_0]) diff --git a/skhubness/neighbors/tests/test_lsh.py b/skhubness/neighbors/tests/test_lsh.py index 5bb2a6e..f4d9cfc 100644 --- a/skhubness/neighbors/tests/test_lsh.py +++ b/skhubness/neighbors/tests/test_lsh.py @@ -3,17 +3,30 @@ import pytest import sys from sklearn.datasets import make_classification +from sklearn.preprocessing import Normalizer from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_array_equal -from skhubness.neighbors import LSH +from skhubness.neighbors import FalconnLSH, PuffinnLSH +# Exclude libraries that are not available on specific platforms +if sys.platform == 'win32': + LSH_METHODS = () + LSH_WITH_RADIUS = () +elif sys.platform == 'darwin': + LSH_METHODS = (FalconnLSH, ) + LSH_WITH_RADIUS = (FalconnLSH, ) +else: + LSH_METHODS = (FalconnLSH, 
PuffinnLSH, ) + LSH_WITH_RADIUS = (FalconnLSH, ) -@pytest.mark.skipif(sys.platform == 'win32', reason='Currently no LSH supported on Windows.') + +@pytest.mark.parametrize('LSH', LSH_METHODS) @pytest.mark.parametrize('metric', ['euclidean', 'cosine']) @pytest.mark.parametrize('n_jobs', [-1, 1, None]) @pytest.mark.parametrize('verbose', [0, 1]) -def test_kneighbors_with_or_without_self_hit(metric, n_jobs, verbose): - X, y = make_classification() +def test_kneighbors_with_or_without_self_hit(LSH: callable, metric, n_jobs, verbose): + X, y = make_classification(random_state=235) + X = Normalizer().fit_transform(X) lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose) lsh.fit(X, y) neigh_dist, neigh_ind = lsh.kneighbors(return_distance=True) @@ -31,12 +44,13 @@ def test_kneighbors_with_or_without_self_hit(metric, n_jobs, verbose): neigh_dist_self[:, 1:]) -@pytest.mark.skipif(sys.platform == 'win32', reason='Currently no LSH supported on Windows.') +@pytest.mark.parametrize('LSH', LSH_WITH_RADIUS) @pytest.mark.parametrize('metric', ['euclidean', 'cosine']) @pytest.mark.parametrize('n_jobs', [-1, 1, None]) @pytest.mark.parametrize('verbose', [0, 1]) -def test_radius_neighbors_with_or_without_self_hit(metric, n_jobs, verbose): +def test_radius_neighbors_with_or_without_self_hit(LSH, metric, n_jobs, verbose): X, y = make_classification() + X = Normalizer().fit_transform(X) lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose) lsh.fit(X, y) radius = lsh.kneighbors(n_candidates=3)[0][:, 2].max() @@ -57,31 +71,35 @@ def test_radius_neighbors_with_or_without_self_hit(metric, n_jobs, verbose): neigh_dist_self[i][1:4]) -@pytest.mark.skipif(sys.platform == 'win32', reason='Currently no LSH supported on Windows.') -def test_squared_euclidean_same_neighbors_as_euclidean(): - X, y = make_classification() +@pytest.mark.parametrize('LSH', LSH_METHODS) +def test_squared_euclidean_same_neighbors_as_euclidean(LSH): + X, y = make_classification(random_state=234) + X = Normalizer().fit_transform(X) lsh = LSH(metric='minkowski') lsh.fit(X, y) neigh_dist_eucl, neigh_ind_eucl = lsh.kneighbors() - radius = neigh_dist_eucl[:, 2].max() - rad_dist_eucl, rad_ind_eucl = lsh.radius_neighbors(radius=radius) - lsh = LSH(metric='sqeuclidean') - lsh.fit(X, y) - neigh_dist_sqeucl, neigh_ind_sqeucl = lsh.kneighbors() - rad_dist_sqeucl, rad_ind_sqeucl = lsh.radius_neighbors(radius=radius**2) + lsh_sq = LSH(metric='sqeuclidean') + lsh_sq.fit(X, y) + neigh_dist_sqeucl, neigh_ind_sqeucl = lsh_sq.kneighbors() assert_array_equal(neigh_ind_eucl, neigh_ind_sqeucl) assert_array_almost_equal(neigh_dist_eucl ** 2, neigh_dist_sqeucl) - for i in range(len(rad_ind_eucl)): - assert_array_equal(rad_ind_eucl[i], rad_ind_sqeucl[i]) - assert_array_almost_equal(rad_dist_eucl[i] ** 2, rad_dist_sqeucl[i]) + + if LSH in LSH_WITH_RADIUS: + radius = neigh_dist_eucl[:, 2].max() + rad_dist_eucl, rad_ind_eucl = lsh.radius_neighbors(radius=radius) + rad_dist_sqeucl, rad_ind_sqeucl = lsh_sq.radius_neighbors(radius=radius**2) + for i in range(len(rad_ind_eucl)): + assert_array_equal(rad_ind_eucl[i], rad_ind_sqeucl[i]) + assert_array_almost_equal(rad_dist_eucl[i] ** 2, rad_dist_sqeucl[i]) -@pytest.mark.skipif(sys.platform == 'win32', reason='Currently no LSH supported on Windows.') +@pytest.mark.parametrize('LSH', LSH_METHODS) @pytest.mark.parametrize('metric', ['invalid', 'manhattan', 'l1', 'chebyshev']) -def test_warn_on_invalid_metric(metric): - X, y = make_classification() +def test_warn_on_invalid_metric(LSH, metric): + X, y = 
make_classification(random_state=24643) + X = Normalizer().fit_transform(X) lsh = LSH(metric='euclidean') lsh.fit(X, y) neigh_dist, neigh_ind = lsh.kneighbors() diff --git a/skhubness/neighbors/tests/test_neighbors.py b/skhubness/neighbors/tests/test_neighbors.py index 42ac62d..438bdfa 100644 --- a/skhubness/neighbors/tests/test_neighbors.py +++ b/skhubness/neighbors/tests/test_neighbors.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: BSD-3-Clause from itertools import product +import os from pickle import PicklingError +import platform import sys import warnings @@ -36,6 +38,8 @@ from sklearn.utils._joblib import parallel_backend from skhubness import neighbors +from skhubness.neighbors.base import ALG_WITHOUT_RADIUS_QUERY +from skhubness.utils.platform import available_ann_algorithms_on_current_platform rng = np.random.RandomState(0) # load and shuffle iris dataset @@ -60,14 +64,8 @@ 'auto', ) -# lsh uses FALCONN, which does not support Windows -if sys.platform == 'win32': - APPROXIMATE_ALGORITHMS = ('hnsw', # only on win32 - ) -else: - APPROXIMATE_ALGORITHMS = ('lsh', - 'hnsw', - ) +APPROXIMATE_ALGORITHMS = available_ann_algorithms_on_current_platform() +NO_RADIUS = ALG_WITHOUT_RADIUS_QUERY HUBNESS_ALGORITHMS = ('mp', 'ls', 'dsl', @@ -88,6 +86,22 @@ neighbors.radius_neighbors_graph = ignore_warnings( neighbors.radius_neighbors_graph) +# Are we running on travis-ci? And on which OS? +is_travis = 'TRAVIS' in os.environ +current_os = platform.system() + +# Skip certain tests, e.g. on certain platforms, when libraries don't support them +FALCONN_LSH_NOT_ON_WIN = pytest.param( + 'falconn_lsh', marks=pytest.mark.skipif(sys.platform == 'win32', reason='falconn does not support Windows')) +ONNG_NOT_ON_WIN = pytest.param( + 'onng', marks=pytest.mark.skipif(sys.platform == 'win32', reason='NGT (ONNG) does not support Windows')) +HNSW_HAS_NO_RADIUS_QUERY = pytest.param('hnsw', marks=pytest.mark.xfail( + reason="hnsw does not support radius queries")) +ANNOY_HAS_NO_RADIUS_QUERY = pytest.param('rptree', marks=pytest.mark.xfail( + reason="annoy (rptree) does not support radius queries")) +NGT_HAS_NO_RADIUS_QUERY = pytest.param('onng', marks=pytest.mark.xfail( + reason="NGT (ONNG) does not support radius queries")) + def _weight_func(dist): """ Weight function to replace lambda d: d ** -2. 
@@ -148,8 +162,13 @@ def test_unsupervised_kneighbors(hubness_and_params, results_approx = neigh.kneighbors(test, return_distance=True) assert_array_equal(results_approx_nodist, results_approx[1]) - assert_array_almost_equal(results_approx[0], results[1][0]) - assert_array_almost_equal(results_approx[1], results[1][1]) + if algorithm in ['rptree', 'lsh']: # quite imprecise + assert_array_almost_equal(results_approx[0], results[1][0], decimal=0) + for i in range(len(results_approx[1])): + assert np.intersect1d(results_approx[1][i], results[1][1][i]).size >= 2 + else: + assert_array_almost_equal(results_approx[0], results[1][0], decimal=6) + assert_array_almost_equal(results_approx[1], results[1][1]) @pytest.mark.parametrize('hubness_and_params', HUBNESS_ALGORITHMS_WITH_PARAMS) @@ -176,16 +195,22 @@ def test_unsupervised_inputs(hubness_and_params): inputs = [nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X), neighbors.HNSW(n_candidates=1).fit(X), + neighbors.RandomProjectionTree(n_candidates=1).fit(X), ] if sys.platform != 'win32': - inputs += [neighbors.LSH(n_candidates=1).fit(X), ] + inputs += [neighbors.FalconnLSH(n_candidates=1).fit(X), ] + if sys.platform == 'linux': + inputs += [neighbors.PuffinnLSH(n_candidates=1).fit(X), ] for input_ in inputs: nbrs.fit(input_) dist2, ind2 = nbrs.kneighbors(X) - assert_array_almost_equal(dist1, dist2) - assert_array_almost_equal(ind1, ind2) + if not isinstance(input_, neighbors.PuffinnLSH): + assert_array_almost_equal(dist1, dist2) + assert_array_almost_equal(ind1, ind2) + else: + assert np.intersect1d(ind1, ind2).size >= 8 def test_n_neighbors_datatype(): @@ -300,13 +325,9 @@ def test_unsupervised_radius_neighbors(n_samples=20, n_features=5, random_state=0): # Test unsupervised radius-based query rng = np.random.RandomState(random_state) - - from sklearn.metrics import euclidean_distances X = rng.rand(n_samples, n_features) - D = euclidean_distances(X, squared=False) - test = rng.rand(n_query_pts, n_features) - test_dist = euclidean_distances(test, X, squared=False) + for p in P: results = [] @@ -344,10 +365,10 @@ def test_unsupervised_radius_neighbors(n_samples=20, n_features=5, neigh = neighbors.NearestNeighbors(radius=radius, algorithm=algorithm, algorithm_params={'n_candidates': 5}, - p=p) + p=2) neigh.fit(X) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: with pytest.raises(ValueError): ind1 = neigh.radius_neighbors(test, return_distance=False) continue @@ -401,11 +422,17 @@ def test_kneighbors_classifier(hubness_and_params, knn.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = knn.predict(X[:n_test_pts] + epsilon) - assert_array_equal(y_pred, y[:n_test_pts]) + if algorithm in ['lsh']: + assert np.sum(y_pred != y[:n_test_pts]) <= 1 + else: + assert_array_equal(y_pred, y[:n_test_pts]) # Test prediction with y_str knn.fit(X, y_str) y_pred = knn.predict(X[:n_test_pts] + epsilon) - assert_array_equal(y_pred, y_str[:n_test_pts]) + if algorithm in ['lsh']: + assert np.sum(y_pred != y_str[:n_test_pts]) <= 1 + else: + assert_array_equal(y_pred, y_str[:n_test_pts]) @pytest.mark.parametrize('hubness_and_params', HUBNESS_ALGORITHMS_WITH_PARAMS) @@ -494,7 +521,7 @@ def test_radius_neighbors_classifier(hubness_and_params, algorithm=algorithm) neigh.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: with pytest.raises(ValueError): y_pred = neigh.predict(X[:n_test_pts] + epsilon) continue @@ -530,7 +557,7 @@ def 
test_radius_neighbors_classifier_when_no_neighbors(hubness_and_params): hubness=hubness, hubness_params=hub_params, outlier_label=outlier_label) clf.fit(X, y) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: with pytest.raises(ValueError): prediction = clf.predict(z1) continue @@ -573,7 +600,7 @@ def test_radius_neighbors_classifier_outlier_labeling(hubness_and_params): hubness=hub, hubness_params=params, outlier_label=-1) clf.fit(X, y) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, clf.predict, z1) continue assert_array_equal(correct_labels1, clf.predict(z1)) @@ -604,7 +631,7 @@ def test_radius_neighbors_classifier_zero_distance(hubness_and_params): hubness=hub, hubness_params=h_params, algorithm=algorithm) clf.fit(X, y) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, clf.predict, z1) else: assert_array_equal(correct_labels1, clf.predict(z1)) @@ -635,7 +662,7 @@ def test_neighbors_regressors_zero_distance(): weights=weights, algorithm=algorithm) rnn.fit(X, y) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, rnn.predict, z) else: assert_array_almost_equal(rnn_correct_labels, rnn.predict(z)) @@ -646,7 +673,10 @@ def test_neighbors_regressors_zero_distance(): weights=weights, algorithm=algorithm) knn.fit(X, y) - assert_array_almost_equal(corr_labels, knn.predict(z)) + if algorithm in ['lsh']: + assert_array_almost_equal(corr_labels, knn.predict(z), decimal=0) + else: + assert_array_almost_equal(corr_labels, knn.predict(z), decimal=6) def test_radius_neighbors_boundary_handling(): @@ -666,7 +696,7 @@ def test_radius_neighbors_boundary_handling(): algorithm_params={'n_candidates': 2}, algorithm=algorithm, ).fit(X) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, nbrs.radius_neighbors, [[0.0]]) continue results = nbrs.radius_neighbors([[0.0]], return_distance=False) @@ -691,7 +721,7 @@ def test_RadiusNeighborsClassifier_multioutput(): for algorithm, weights in product(EXACT_ALGORITHMS + APPROXIMATE_ALGORITHMS, weights): # skip - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: with pytest.raises(ValueError): neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm)\ .fit(X_train, y_train)\ @@ -900,7 +930,7 @@ def test_radius_neighbors_regressor(n_samples=40, algorithm=algorithm) neigh.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, neigh.predict, X[:n_test_pts] + epsilon) continue y_pred = neigh.predict(X[:n_test_pts] + epsilon) @@ -924,11 +954,17 @@ def test_radius_neighbors_regressor(n_samples=40, @pytest.mark.parametrize('algorithm', list(EXACT_ALGORITHMS) - + [pytest.param('lsh', + + [pytest.param('falconn_lsh', marks=pytest.mark.skipif(sys.platform == 'win32', reason='falconn does not support Windows')), ] + + [pytest.param('lsh', + marks=pytest.mark.xfail(reason="puffinn does not support radius queries")), ] + [pytest.param('hnsw', - marks=pytest.mark.xfail(reason="hnsw does not support radius queries")), ]) + marks=pytest.mark.xfail(reason="hnsw does not support radius queries")), ] + + [pytest.param('rptree', + marks=pytest.mark.xfail(reason="rptree does not support radius queries")), ] + + [FALCONN_LSH_NOT_ON_WIN, ] + + [HNSW_HAS_NO_RADIUS_QUERY, ANNOY_HAS_NO_RADIUS_QUERY, NGT_HAS_NO_RADIUS_QUERY, ]) @pytest.mark.parametrize('weights', [None, 'uniform']) def 
test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(algorithm, weights): # Test radius neighbors in multi-output regression (uniform weight) @@ -960,11 +996,16 @@ def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(algorithm, wei @pytest.mark.parametrize('algorithm', list(EXACT_ALGORITHMS) - + [pytest.param('lsh', - marks=pytest.mark.skipif(sys.platform == 'win32', - reason='falconn does not support Windows')), ] + + [pytest.param('falconn_lsh', marks=pytest.mark.skipif( + sys.platform == 'win32', reason='falconn does not support Windows')), ] + + [pytest.param('lsh', marks=pytest.mark.xfail( + reason="puffinn does not support radius queries")), ] + [pytest.param('hnsw', marks=pytest.mark.xfail( - reason="hnsw does not support radius queries")), ]) + reason="hnsw does not support radius queries")), ] + + [pytest.param('rptree', marks=pytest.mark.xfail( + reason="rptree does not support radius queries")), ] + + [FALCONN_LSH_NOT_ON_WIN, ] + + [HNSW_HAS_NO_RADIUS_QUERY, ANNOY_HAS_NO_RADIUS_QUERY, NGT_HAS_NO_RADIUS_QUERY]) @pytest.mark.parametrize('weights', ['uniform', 'distance', _weight_func]) def test_RadiusNeighborsRegressor_multioutput(algorithm, weights, n_samples=40, @@ -1041,9 +1082,12 @@ def test_neighbors_iris(algorithm, hubness_algorithm_and_params): ) clf.fit(iris.data, iris.target) y_pred = clf.predict(iris.data) - if hubness == 'dsl' or (algorithm == 'hnsw' and hubness in ['mp']): + + if hubness is None and algorithm == 'onng': + assert np.mean(y_pred == iris.target) > 0.85, f'Below 85% accuracy' + elif hubness == 'dsl' or (algorithm == 'hnsw' and hubness in ['mp']) or algorithm in ['onng', 'lsh']: # Spurious small errors occur - assert np.mean(y_pred == iris.target) > 0.95, f'Below 95% accuracy' + assert np.mean(y_pred == iris.target) > 0.90, f'Below 90% accuracy' else: assert_array_equal(y_pred, iris.target) @@ -1096,6 +1140,8 @@ def test_kneighbors_graph(algorithm, hubness_and_params): include_self=True, ) assert_array_equal(A.toarray(), np.eye(A.shape[0])) + if algorithm in ['lsh']: + pytest.xfail(f'Puffinn uses cosine distances and thus fails for some corner cases.') A = neighbors.kneighbors_graph(X, 1, mode='distance', algorithm=algorithm, @@ -1468,11 +1514,11 @@ def test_k_and_radius_neighbors_train_is_not_query(algorithm): dist, ind = nn.kneighbors(test_data) assert_array_equal(dist, [[1], [0]]) assert_array_equal(ind, [[1], [1]]) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, nn.radius_neighbors, [[2], [1]], radius=1.5) else: dist, ind = nn.radius_neighbors([[2], [1]], radius=1.5) - # sklearn does not guarantee sorted radius neighbors, but LSH sorts automatically, + # sklearn does not guarantee sorted radius neighbors, but FalconnLSH sorts automatically, # so we make sure, that all results here are sorted dist_true = [[1], [0, 1]] ind_true = [[1], [1, 0]] @@ -1487,7 +1533,7 @@ def test_k_and_radius_neighbors_train_is_not_query(algorithm): assert_array_equal( nn.kneighbors_graph([[2], [1]], mode='distance').A, np.array([[0., 1.], [0., 0.]])) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, nn.radius_neighbors_graph, [[2], [1]], radius=1.5) else: rng = nn.radius_neighbors_graph([[2], [1]], radius=1.5) @@ -1506,7 +1552,8 @@ def test_k_and_radius_neighbors_X_None(algorithm): dist, ind = nn.kneighbors() assert_array_equal(dist, [[1], [1]]) assert_array_equal(ind, [[1], [0]]) - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: + assert_raises(ValueError, nn.radius_neighbors, 
None, radius=1.5) else: dist, ind = nn.radius_neighbors(None, radius=1.5) @@ -1516,7 +1563,7 @@ def test_k_and_radius_neighbors_X_None(algorithm): # Test the graph variants. graphs = [] graphs += [nn.kneighbors_graph(None), ] - if algorithm in ['hnsw']: + if algorithm in NO_RADIUS: assert_raises(ValueError, nn.radius_neighbors_graph, None, radius=1.5) else: graphs += [nn.radius_neighbors_graph(None, radius=1.5), ] @@ -1540,6 +1587,9 @@ def test_k_and_radius_neighbors_duplicates(algorithm): nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) nn.fit([[0], [1]]) + if algorithm in ['lsh']: + pytest.xfail(f'puffinn uses cosine distances that require this test to be changed.') + # Do not do anything special to duplicates. kng = nn.kneighbors_graph([[0], [1]], mode='distance') assert_array_equal( @@ -1548,8 +1598,8 @@ def test_k_and_radius_neighbors_duplicates(algorithm): assert_array_equal(kng.data, [0., 0.]) assert_array_equal(kng.indices, [0, 1]) - if algorithm in ['hnsw']: - assert_raises(ValueError, nn.radius_neighbors, [[0], [1]], radius=1.5) + if algorithm in NO_RADIUS: + pytest.xfail(f'Method {algorithm} does not support radius queries.') else: dist, ind = [np.stack(x) for x in nn.radius_neighbors([[0], [1]], radius=1.5)] sort = np.argsort(dist) @@ -1563,7 +1613,7 @@ def test_k_and_radius_neighbors_duplicates(algorithm): rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode='distance') - if algorithm in ['lsh']: + if algorithm in ['falconn_lsh']: assert_array_equal(rng.A, [[0, 1], [1, 0]]) assert_array_equal(rng.indices, [0, 1, 1, 0]) assert_array_equal(rng.data, [0, 1, 0, 1]) @@ -1635,12 +1685,16 @@ def test_same_knn_parallel(algorithm): @pytest.mark.parametrize('algorithm', list(EXACT_ALGORITHMS) - + [pytest.param('lsh', - marks=pytest.mark.skipif(sys.platform == 'win32', - reason='falconn does not support Windows')), ] + + [pytest.param('falconn_lsh', marks=pytest.mark.skipif( + sys.platform == 'win32', reason='falconn does not support Windows')), ] + + [pytest.param('lsh', marks=pytest.mark.xfail( + reason="puffinn does not support radius queries")), ] + [pytest.param('hnsw', marks=pytest.mark.xfail( - reason="hnsw does not support radius queries")), - ]) + reason="hnsw does not support radius queries")), ] + + [pytest.param('rptree', marks=pytest.mark.xfail( + reason="rptree does not support radius queries")), ] + + [FALCONN_LSH_NOT_ON_WIN, ] + + [HNSW_HAS_NO_RADIUS_QUERY, ANNOY_HAS_NO_RADIUS_QUERY, NGT_HAS_NO_RADIUS_QUERY]) def test_same_radius_neighbors_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) @@ -1681,10 +1735,15 @@ def test_knn_forcing_backend(backend, algorithm): algorithm=algorithm, n_jobs=3) clf.fit(X_train, y_train) - if algorithm in ['lsh'] and backend in ['multiprocessing', 'loky']: + if algorithm in ['falconn_lsh'] and backend in ['multiprocessing', 'loky']: # can't pickle _falconn.LSHConstructionParameters objects assert_raises((TypeError, PicklingError, ), clf.predict, X_test) else: + if algorithm in ['lsh'] and backend in ['loky']: + pytest.skip(f'puffinn does not work with loky.') + if algorithm in ['rptree'] and backend in ['loky'] and current_os in ['Linux'] and is_travis: + pytest.skip(f'Annoy with backend loky on linux does not work on travis ' + f'(but apparently all other configs work...') clf.predict(X_test) clf.kneighbors(X_test) clf.kneighbors_graph(X_test, mode='distance').toarray() diff --git a/skhubness/neighbors/tests/test_onng.py 
b/skhubness/neighbors/tests/test_onng.py new file mode 100644 index 0000000..87b8afa --- /dev/null +++ b/skhubness/neighbors/tests/test_onng.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: BSD-3-Clause +import sys +import pytest +import numpy as np +from sklearn.datasets import make_classification +from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.testing import assert_array_equal, assert_array_almost_equal +from sklearn.utils.testing import assert_raises +from skhubness.neighbors import ONNG, NearestNeighbors + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +@pytest.mark.parametrize('n_candidates', [1, 2, 5, 99, 100, 1000, ]) +@pytest.mark.parametrize('set_in_constructor', [True, False]) +@pytest.mark.parametrize('return_distance', [True, False]) +@pytest.mark.parametrize('search_among_indexed', [True, False]) +@pytest.mark.parametrize('verbose', [True, False]) +def test_return_correct_number_of_neighbors(n_candidates: int, + set_in_constructor: bool, + return_distance: bool, + search_among_indexed: bool, + verbose: bool): + n_samples = 100 + X, y = make_classification(n_samples=n_samples) + ann = ONNG(n_candidates=n_candidates, verbose=verbose)\ + if set_in_constructor else ONNG(verbose=verbose) + ann.fit(X, y) + X_query = None if search_among_indexed else X + neigh = ann.kneighbors(X_query, return_distance=return_distance) if set_in_constructor\ + else ann.kneighbors(X_query, n_candidates=n_candidates, return_distance=return_distance) + + if return_distance: + dist, neigh = neigh + assert dist.shape == neigh.shape, f'Shape of distances and indices matrices do not match.' + if n_candidates > n_samples: + assert np.all(np.isnan(dist[:, n_samples:])), f'Returned distances for invalid neighbors' + + assert neigh.shape[1] == n_candidates, f'Wrong number of neighbors returned.' + if n_candidates > n_samples: + assert np.all(neigh[:, n_samples:] == -1), f'Returned indices for invalid neighbors' + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +@pytest.mark.parametrize('metric', ['invalid', None]) +def test_invalid_metric(metric): + X, y = make_classification(n_samples=10, n_features=10) + ann = ONNG(metric=metric) + with assert_raises(ValueError): + _ = ann.fit(X, y) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +@pytest.mark.parametrize('metric', ONNG.valid_metrics) +@pytest.mark.parametrize('n_jobs', [-1, 1, None]) +@pytest.mark.parametrize('verbose', [0, 1]) +def test_kneighbors_with_or_without_distances(metric, n_jobs, verbose): + n_samples = 100 + X = np.random.RandomState(1235232).rand(n_samples, 2) + ann = ONNG(metric=metric, + n_jobs=n_jobs, + verbose=verbose, + ) + ann.fit(X) + neigh_dist_self, neigh_ind_self = ann.kneighbors(X, return_distance=True) + ind_only_self = ann.kneighbors(X, return_distance=False) + + # Identical neighbors retrieved, whether dist or not + assert_array_equal(neigh_ind_self, ind_only_self) + + # Is the first hit always the object itself? + # Less strict test for inaccurate distances + if metric in ['Hamming', 'Jaccard', 'Normalized Cosine', 'Normalized Angle']: + assert np.intersect1d(neigh_ind_self[:, 0], np.arange(len(neigh_ind_self))).size >= 75 + else: + assert_array_equal(neigh_ind_self[:, 0], np.arange(len(neigh_ind_self))) + + if metric in ['Hamming', 'Jaccard']: # quite inaccurate... 
+ assert neigh_dist_self[:, 0].mean() <= 0.016 + elif metric in ['Normalized Angle']: + assert_array_almost_equal(neigh_dist_self[:, 0], np.zeros(len(neigh_dist_self)), decimal=3) + else: # distances in [0, inf] + assert_array_almost_equal(neigh_dist_self[:, 0], np.zeros(len(neigh_dist_self))) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +@pytest.mark.parametrize('metric', ONNG.valid_metrics) +def test_kneighbors_with_or_without_self_hit(metric): + X = np.random.RandomState(1245544).rand(50, 2) + n_candidates = 5 + ann = ONNG(metric=metric, + n_candidates=n_candidates, + ) + ann.fit(X) + ind_self = ann.kneighbors(X, n_candidates=n_candidates+1, return_distance=False) + ind_no_self = ann.kneighbors(n_candidates=n_candidates, return_distance=False) + + if metric in ['Hamming', 'Jaccard']: # just inaccurate... + assert (ind_self[:, 0] == np.arange(len(ind_self))).sum() >= 46 + assert np.setdiff1d(ind_self[:, 1:], ind_no_self).size <= 10 + else: + assert_array_equal(ind_self[:, 0], np.arange(len(ind_self))) + assert_array_equal(ind_self[:, 1:], ind_no_self) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +def test_squared_euclidean_same_neighbors_as_euclidean(): + X, y = make_classification() + ann = ONNG(metric='euclidean') + ann.fit(X, y) + neigh_dist_eucl, neigh_ind_eucl = ann.kneighbors(X) + + ann = ONNG(metric='sqeuclidean') + ann.fit(X, y) + neigh_dist_sqeucl, neigh_ind_sqeucl = ann.kneighbors(X) + + assert_array_equal(neigh_ind_eucl, neigh_ind_sqeucl) + assert_array_almost_equal(neigh_dist_eucl ** 2, neigh_dist_sqeucl) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +def test_same_neighbors_as_with_exact_nn_search(): + X = np.random.RandomState(42).randn(10, 2) + + nn = NearestNeighbors() + nn_dist, nn_neigh = nn.fit(X).kneighbors(return_distance=True) + + ann = ONNG() + ann_dist, ann_neigh = ann.fit(X).kneighbors(return_distance=True) + + assert_array_almost_equal(ann_dist, nn_dist, decimal=5) + assert_array_almost_equal(ann_neigh, nn_neigh, decimal=0) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +def test_is_valid_estimator_in_persistent_memory(): + check_estimator(ONNG) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +@pytest.mark.xfail(reason='ngtpy.Index can not be pickled as of v1.7.6') +def test_is_valid_estimator_in_main_memory(): + check_estimator(ONNG(index_dir=None)) + + +@pytest.mark.skipif(sys.platform == 'win32', reason='ONNG not supported on Windows.') +@pytest.mark.parametrize('index_dir', [tuple(), 0, 'auto', '/dev/shm', '/tmp', None]) +def test_memory_mapped(index_dir): + X, y = make_classification(n_samples=10, + n_features=5, + random_state=123, + ) + ann = ONNG(index_dir=index_dir) + if isinstance(index_dir, str) or index_dir is None: + ann.fit(X, y) + _ = ann.kneighbors(X) + _ = ann.kneighbors() + else: + with np.testing.assert_raises(TypeError): + ann.fit(X, y) diff --git a/skhubness/neighbors/tests/test_rptree.py b/skhubness/neighbors/tests/test_rptree.py new file mode 100644 index 0000000..aacac47 --- /dev/null +++ b/skhubness/neighbors/tests/test_rptree.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: BSD-3-Clause + +import pytest +import numpy as np +from sklearn.datasets import make_classification +from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.testing import assert_array_equal, assert_array_almost_equal +from 
sklearn.utils.testing import assert_raises +from skhubness.neighbors import RandomProjectionTree, NearestNeighbors + + +@pytest.mark.parametrize('n_candidates', [1, 2, 5, 99, 100, 1000, ]) +@pytest.mark.parametrize('set_in_constructor', [True, False]) +@pytest.mark.parametrize('return_distance', [True, False]) +@pytest.mark.parametrize('search_among_indexed', [True, False]) +@pytest.mark.parametrize('verbose', [True, False]) +def test_return_correct_number_of_neighbors(n_candidates: int, + set_in_constructor: bool, + return_distance: bool, + search_among_indexed: bool, + verbose: bool): + n_samples = 100 + X, y = make_classification(n_samples=n_samples) + ann = RandomProjectionTree(n_candidates=n_candidates, verbose=verbose)\ + if set_in_constructor else RandomProjectionTree(verbose=verbose) + ann.fit(X, y) + X_query = None if search_among_indexed else X + neigh = ann.kneighbors(X_query, return_distance=return_distance) if set_in_constructor\ + else ann.kneighbors(X_query, n_candidates=n_candidates, return_distance=return_distance) + + if return_distance: + dist, neigh = neigh + assert dist.shape == neigh.shape, f'Shape of distances and indices matrices do not match.' + if n_candidates > n_samples: + assert np.all(np.isnan(dist[:, n_samples:])), f'Returned distances for invalid neighbors' + + assert neigh.shape[1] == n_candidates, f'Wrong number of neighbors returned.' + if n_candidates > n_samples: + assert np.all(neigh[:, n_samples:] == -1), f'Returned indices for invalid neighbors' + + +@pytest.mark.parametrize('metric', ['invalid', None]) +def test_invalid_metric(metric): + X, y = make_classification(n_samples=10, n_features=10) + ann = RandomProjectionTree(metric=metric) + with assert_raises(Exception): # annoy raises ValueError or TypeError + _ = ann.fit(X, y) + + +@pytest.mark.parametrize('metric', RandomProjectionTree.valid_metrics) +@pytest.mark.parametrize('n_jobs', [-1, 1, None]) +@pytest.mark.parametrize('verbose', [0, 1]) +def test_kneighbors_with_or_without_distances(metric, n_jobs, verbose): + n_samples = 100 + X, y = make_classification(n_samples=n_samples, + random_state=123, + ) + ann = RandomProjectionTree(metric=metric, + n_jobs=n_jobs, + verbose=verbose, + ) + ann.fit(X, y) + neigh_dist_self, neigh_ind_self = ann.kneighbors(X, return_distance=True) + ind_only_self = ann.kneighbors(X, return_distance=False) + + # Identical neighbors retrieved, whether dist or not + assert_array_equal(neigh_ind_self, ind_only_self) + + # Is the first hit always the object itself? 
+ # Less strict test for dot/hamming distances + if metric in ['dot']: + assert np.setdiff1d(neigh_ind_self[:, 0], np.arange(len(neigh_ind_self))).size <= n_samples // 10 + elif metric in ['hamming']: + assert np.setdiff1d(neigh_ind_self[:, 0], np.arange(len(neigh_ind_self))).size <= n_samples // 100 + else: + assert_array_equal(neigh_ind_self[:, 0], np.arange(len(neigh_ind_self))) + + if metric in ['dot', 'angular']: + pass # does not guarantee self distance 0 + else: # distances in [0, inf] + assert_array_almost_equal(neigh_dist_self[:, 0], np.zeros(len(neigh_dist_self))) + + +@pytest.mark.parametrize('metric', RandomProjectionTree.valid_metrics) +def test_kneighbors_with_or_without_self_hit(metric): + X, y = make_classification(random_state=1234435) + n_candidates = 5 + ann = RandomProjectionTree(metric=metric, + n_candidates=n_candidates, + ) + ann.fit(X, y) + ind_self = ann.kneighbors(X, n_candidates=n_candidates+1, return_distance=False) + ind_no_self = ann.kneighbors(n_candidates=n_candidates, return_distance=False) + + if metric in ['dot']: # dot is just inaccurate... + assert (ind_self[:, 0] == np.arange(len(ind_self))).sum() > 92 + assert np.setdiff1d(ind_self[:, 1:], ind_no_self).size <= 10 + else: + assert_array_equal(ind_self[:, 0], np.arange(len(ind_self))) + assert_array_equal(ind_self[:, 1:], ind_no_self) + + +def test_squared_euclidean_same_neighbors_as_euclidean(): + X, y = make_classification() + ann = RandomProjectionTree(metric='euclidean') + ann.fit(X, y) + neigh_dist_eucl, neigh_ind_eucl = ann.kneighbors(X) + + ann = RandomProjectionTree(metric='sqeuclidean') + ann.fit(X, y) + neigh_dist_sqeucl, neigh_ind_sqeucl = ann.kneighbors(X) + + assert_array_equal(neigh_ind_eucl, neigh_ind_sqeucl) + assert_array_almost_equal(neigh_dist_eucl ** 2, neigh_dist_sqeucl) + + +def test_same_neighbors_as_with_exact_nn_search(): + X = np.random.RandomState(42).randn(10, 2) + + nn = NearestNeighbors() + nn_dist, nn_neigh = nn.fit(X).kneighbors(return_distance=True) + + ann = RandomProjectionTree() + ann_dist, ann_neigh = ann.fit(X).kneighbors(return_distance=True) + + assert_array_almost_equal(ann_dist, nn_dist, decimal=5) + assert_array_almost_equal(ann_neigh, nn_neigh, decimal=0) + + +def test_is_valid_estimator(): + check_estimator(RandomProjectionTree) + + +@pytest.mark.parametrize('mmap_dir', [None, 'auto', '/dev/shm', '/tmp']) +def test_memory_mapped(mmap_dir): + X, y = make_classification(n_samples=10, + n_features=5, + random_state=123, + ) + ann = RandomProjectionTree(mmap_dir=mmap_dir) + ann.fit(X, y) + _ = ann.kneighbors(X) + _ = ann.kneighbors() diff --git a/skhubness/neighbors/unsupervised.py b/skhubness/neighbors/unsupervised.py index 08ba430..98006cd 100644 --- a/skhubness/neighbors/unsupervised.py +++ b/skhubness/neighbors/unsupervised.py @@ -12,33 +12,38 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin, UnsupervisedMixin): """Unsupervised learner for implementing neighbor searches. - Read more in the `scikit-learn User Guide - `_. + Read more in the + `scikit-learn User Guide `_ Parameters ---------- - n_neighbors : int, optional (default = 5) + n_neighbors: int, optional (default = 5) Number of neighbors to use by default for :meth:`kneighbors` queries. - radius : float, optional (default = 1.0) + radius: float, optional (default = 1.0) Range of parameter space to use by default for :meth:`radius_neighbors` queries. 
- algorithm : {'auto', 'hnsw', 'lsh', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'hnsw', 'lsh', 'falconn_lsh', 'onng', 'rptree', + 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: - 'hnsw' will use :class:`HNSW` - - 'lsh' will use :class:`LSH` + - 'lsh' will use :class:`PuffinnLSH` + - 'falconn_lsh' will use :class:`FalconnLSH` + - 'onng' will use :class:`ONNG` + - 'rptree' will use :class:`RandomProjectionTree` - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. + - 'auto' will attempt to decide the most appropriate exact algorithm + based on the values passed to :meth:`fit` method. This will not + select an approximate nearest neighbor algorithm. Note: fitting on sparse input will override the setting of this parameter, using brute force. - algorithm_params : dict, optional + algorithm_params: dict, optional Override default parameters of the NN algorithm. For example, with algorithm='lsh' and algorithm_params={n_candidates: 100} one hundred approximate neighbors are retrieved with LSH. @@ -46,12 +51,13 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, with hubness reduction. Finally, n_neighbors objects are used from the (optionally reordered) candidates. - # TODO add all supported hubness reduction methods - hubness : {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional + hubness: {'mutual_proximity', 'local_scaling', 'dis_sim_local', None}, optional Hubness reduction algorithm + - 'mutual_proximity' or 'mp' will use :class:`MutualProximity' - 'local_scaling' or 'ls' will use :class:`LocalScaling` - 'dis_sim_local' or 'dsl' will use :class:`DisSimLocal` + If None, no hubness reduction will be performed (=vanilla kNN). hubness_params: dict, optional @@ -60,13 +66,13 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, a mutual proximity variant is used, which models distance distributions with independent Gaussians. - leaf_size : int, optional (default = 30) + leaf_size: int, optional (default = 30) Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. - metric : string or callable, default 'minkowski' + metric: string or callable, default 'minkowski' metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. @@ -92,16 +98,16 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, See the documentation for scipy.spatial.distance for details on these metrics. - p : integer, optional (default = 2) + p: integer, optional (default = 2) Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - metric_params : dict, optional (default = None) + metric_params: dict, optional (default = None) Additional keyword arguments for the metric function. - n_jobs : int or None, optional (default=None) + n_jobs: int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
diff --git a/skhubness/reduction/base.py b/skhubness/reduction/base.py index b335e79..fc77765 100644 --- a/skhubness/reduction/base.py +++ b/skhubness/reduction/base.py @@ -20,6 +20,7 @@ def transform(self, neigh_dist, neigh_ind, X, assume_sorted, return_distance=Tru pass # pragma: no cover def fit_transform(self, neigh_dist, neigh_ind, X, assume_sorted=True, return_distance=True, *args, **kwargs): + """ Equivalent to call .fit().transform() """ self.fit(neigh_dist, neigh_ind, X, assume_sorted, *args, **kwargs) return self.transform(neigh_dist, neigh_ind, X, assume_sorted, return_distance) @@ -34,6 +35,7 @@ def fit(self, *args, **kwargs): pass # pragma: no cover def transform(self, neigh_dist, neigh_ind, X, assume_sorted=True, return_distance=True, *args, **kwargs): + """ Equivalent to call .fit().transform() """ if return_distance: return neigh_dist, neigh_ind else: diff --git a/skhubness/utils/check.py b/skhubness/utils/check.py new file mode 100644 index 0000000..383f3bc --- /dev/null +++ b/skhubness/utils/check.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: BSD-3-Clause +# Author: Roman Feldbauer +import numpy as np + +__all__ = ['check_n_candidates'] + + +def check_n_candidates(n_candidates): + # Check the n_neighbors parameter + if n_candidates <= 0: + raise ValueError(f"Expected n_neighbors > 0. Got {n_candidates:d}") + if not np.issubdtype(type(n_candidates), np.integer): + raise TypeError(f"n_neighbors does not take {type(n_candidates)} value, enter integer value") + return n_candidates diff --git a/skhubness/utils/io.py b/skhubness/utils/io.py new file mode 100644 index 0000000..0f5c4aa --- /dev/null +++ b/skhubness/utils/io.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: BSD-3-Clause +# Author: Roman Feldbauer +import logging +from tempfile import mkstemp, NamedTemporaryFile + +__all__ = ['create_tempfile_preferably_in_dir'] + + +def create_tempfile_preferably_in_dir(suffix=None, prefix=None, directory=None, persistent: bool = False, ): + """ Create a temporary file with precedence for directory if possible, in TMP otherwise. + For example, this is useful to try to save into /dev/shm. + """ + temp_file = mkstemp if persistent else NamedTemporaryFile + try: + handle = temp_file(suffix=suffix, prefix=prefix, dir=directory) + warn = False + except FileNotFoundError: + handle = temp_file(suffix=suffix, prefix=prefix, dir=None) + warn = True + + # Extract the path (as string) + try: + path = handle.name + except AttributeError: + _, path = handle + + if warn: + logging.warning(f'Could not create temp file in {directory}. 
' + f'Instead, the path is {path}.') + return path diff --git a/skhubness/utils/platform.py b/skhubness/utils/platform.py new file mode 100644 index 0000000..a2fe8d1 --- /dev/null +++ b/skhubness/utils/platform.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: BSD-3-Clause +import sys + + +def available_ann_algorithms_on_current_platform(): + """ Get approximate nearest neighbor algorithms available for the current platform/OS + + Currently, the algorithms are provided by the following libraries: + + * 'hnsw': nmslib + * 'rptree': annoy + * 'lsh': puffinn + * 'falconn_lsh': falconn + * 'onng': NGT + + Returns + ------- + algorithms: Tuple[str] + A tuple of available algorithms + """ + # Windows + if sys.platform == 'win32': # pragma: no cover + algorithms = ('hnsw', + 'rptree', + ) + # MacOS + elif sys.platform == 'darwin': + algorithms = ('falconn_lsh', + 'hnsw', + 'rptree', + 'onng', + ) + # Linux + elif sys.platform == 'linux': + algorithms = ('lsh', + 'falconn_lsh', + 'hnsw', + 'rptree', + 'onng', + ) + # others: undefined + else: # pragma: no cover + algorithms = () + + return algorithms diff --git a/skhubness/utils/tests/test_io.py b/skhubness/utils/tests/test_io.py new file mode 100644 index 0000000..81d94b5 --- /dev/null +++ b/skhubness/utils/tests/test_io.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: BSD-3-Clause +# Author: Roman Feldbauer +import os +import platform +import pytest +from skhubness.utils.io import create_tempfile_preferably_in_dir + + +@pytest.mark.parametrize('directory', [None, '/does/not/exist/kluawev']) +@pytest.mark.parametrize('persistent', [True, False]) +def test_tempfile(directory, persistent): + f = create_tempfile_preferably_in_dir(directory=directory, persistent=persistent) + assert isinstance(f, str) + if persistent and platform.system() != 'Windows': # locked by running process on Windows + os.remove(f) diff --git a/travis/install-build-ngtpy.sh b/travis/install-build-ngtpy.sh new file mode 100755 index 0000000..0898ba0 --- /dev/null +++ b/travis/install-build-ngtpy.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# Build external dependencies that cannot successfully install via pip or conda +# If you use this file as template, don't forget to `chmod a+x newfile` + +set -e + +# Check for the operating system and install NGT (C++ lib) and others +if [[ $(uname) == "Darwin" ]]; then + echo "Running under Mac OS X and CPU..." + sysctl machdep.cpu.brand_string + + # Setup environment + /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + echo "brew update && brew upgrade" + if brew ls --versions cmake > /dev/null; then + echo "cmake already installed" + else + brew install cmake + fi + xcode-select --install || true + echo "Install MacOS SDK header for 10.14..." 
+ open /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg || true + + if brew ls --versions gcc@9 > /dev/null; then + echo "gcc@9 already installed, upgrading" + brew upgrade gcc@9 + else + brew install gcc@9 + fi + ln -s ./gcc-9 /usr/local/bin/gcc + ln -s ./g++-9 /usr/local/bin/g++ + echo "Prepend /usr/local/bin to PATH" + export PATH=/usr/local/bin:$PATH + export CXX=g++ + export CC=gcc + + # Find the latest release of NGT + FILE=$(curl -s https://api.github.com/repos/yahoojapan/NGT/releases/latest | grep zipball_url | cut -d '"' -f 4) + if [ -z "${FILE}" ]; then + FILE="https://github.com/yahoojapan/NGT/archive/v1.7.9.zip" + echo "Could not fetch latest release, will use predefined one."; + else + echo "Latest release is '$FILE'"; + fi + wget "$FILE" + BNAME=$(basename "$FILE") + + # Install NGT C++ + rm -rf ./*NGT* + unzip "$BNAME" + cd ./*NGT* # could be NGT-v.x.x.x, or yahoojapan-NGT-v.x.x.x + mkdir build + cd build + which gcc + echo "$PATH" + # TODO work-around for https://github.com/yahoojapan/NGT/issues/34 + # enable AVX when bug is fixed + cmake -DNGT_AVX_DISABLED=ON .. + CXXFLAGS='-fpermissive' make + sudo make install + + # make library available + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib64:/usr/local/lib" + + # Install NGT Python bindings + cd ../python + rm -rf dist + python3 setup.py sdist # python somehow maps to python2... + pip3 install dist/ngt-*.tar.gz + +elif [[ $(uname -s) == Linux* ]]; then + echo "Running under Linux on CPU..." + cat /proc/cpuinfo + + # Find the latest release + FILE=$(curl -s https://api.github.com/repos/yahoojapan/NGT/releases/latest | grep zipball_url | cut -d '"' -f 4) + if [ -z "${FILE}" ]; then + FILE="https://github.com/yahoojapan/NGT/archive/v1.7.9.zip" + echo "Could not fetch latest release, will use predefined one."; + else + echo "Latest release is '$FILE'"; + fi + echo "Downloading $FILE" + wget "$FILE" + BNAME=$(basename "$FILE") + echo "Release is $BNAME" + + # Install NGT + rm -rf ./*NGT* + unzip "$BNAME" + cd ./*NGT* # could be NGT-v.x.x.x, or yahoojapan-NGT-v.x.x.x + mkdir build + cd build + which gcc + echo "$PATH" + cmake .. + make + sudo make install + + # make library available + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib64:/usr/local/lib" + sudo ldconfig + + # Install NGT Python bindings + cd ../python + rm -rf dist + python setup.py sdist + pip install dist/ngt-*.tar.gz + +elif [[ $(uname -s) == MINGW32_NT* ]]; then + echo "Running under Win x86-32" + echo "Nothing to build." + +elif [[ $(uname -s) == MINGW64_NT* ]]; then + echo "Running under Win x86-64" + echo "Nothing to build." + +elif [[ $(uname -s) == CYGWIN* ]]; then + echo "Running under Cygwin" + echo "Nothing to build." + +fi diff --git a/travis/install-build-puffinn.sh b/travis/install-build-puffinn.sh new file mode 100755 index 0000000..7479d4e --- /dev/null +++ b/travis/install-build-puffinn.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Build external dependencies that cannot successfully install via pip or conda +# If you use this file as template, don't forget to `chmod a+x newfile` + +set -e + +# Check for the operating system and install puffinn +if [[ $(uname) == "Darwin" ]]; then + echo "Running under Mac OS X and CPU..." + echo "Will not install puffinn, due to limited support for MacOS." + # git clone https://github.com/puffinn/puffinn.git + # cd puffinn + # python3 setup.py build + # pip install . + # cd .. + +elif [[ $(uname -s) == Linux* ]]; then + echo "Running under Linux on CPU..." 
+ # Trying to install puffinn from cache, + # and only build if this fails. + pip install puffinn || (\ + git clone https://github.com/puffinn/puffinn.git;\ + cd puffinn;\ + python3 setup.py build;\ + pip install . ;\ + cd ..) + +elif [[ $(uname -s) == MINGW32_NT* ]]; then + echo "Running under Win x86-32" + echo "Nothing to build." + +elif [[ $(uname -s) == MINGW64_NT* ]]; then + echo "Running under Win x86-64" + echo "Nothing to build." + +elif [[ $(uname -s) == CYGWIN* ]]; then + echo "Running under Cygwin" + echo "Nothing to build." + +fi diff --git a/travis/install-conda.sh b/travis/install-conda.sh index 75d73d6..7f618b6 100755 --- a/travis/install-conda.sh +++ b/travis/install-conda.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# If you use this file as template, don't forget to `chmod a+x newfile` set -e @@ -23,7 +24,8 @@ else # if it does not exist, we need to install miniconda conda config --set always_yes yes --set changeps1 no conda update -q conda conda info -a # for debugging - echo $TRAVIS_PYTHON_VERSION - conda create --yes -n test python=$TRAVIS_PYTHON_VERSION + echo "$TRAVIS_PYTHON_VERSION" + conda create --yes -n test python="$TRAVIS_PYTHON_VERSION" source activate test + fi diff --git a/travis/install-pip.sh b/travis/install-pip.sh index 2aea7ab..80dc588 100755 --- a/travis/install-pip.sh +++ b/travis/install-pip.sh @@ -1,10 +1,8 @@ #!/usr/bin/env bash +# If you use this file as template, don't forget to `chmod a+x newfile` set -e -echo "First install pybind11, so that nmslib build can succeed" -pip install pybind11 - echo "pip installing required python packages" pip install -r requirements.txt