From ca91a9a3fa0e8219461c94ac0c5dbfafbe1a8517 Mon Sep 17 00:00:00 2001 From: Mathieu Guillame-Bert Date: Wed, 8 May 2024 01:21:37 -0700 Subject: [PATCH] Release YDF 0.4.3 and TF-DF 1.9.1 PiperOrigin-RevId: 631693889 --- CHANGELOG.md | 6 ++ configure/setup.py | 5 +- documentation/known_issues.md | 1 + tensorflow_decision_forests/__init__.py | 2 +- .../keras/wrappers_pre_generated.py | 72 ++++++++++++++++--- .../tools/run_e2e_tfdf_test.sh | 67 +++++++++++++++++ tools/build_pip_package.sh | 19 ++--- tools/start_compile_docker.sh | 16 +++-- 8 files changed, 164 insertions(+), 24 deletions(-) create mode 100755 tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c7200b..eaa29c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 1.9.1 - 2024-05-07 + +### Fix + +- Solve dependency collision of YDF Proto between PYDF and TF-DF. + ## 1.9.0 - 2024-03-12 ### Fix diff --git a/configure/setup.py b/configure/setup.py index 19ea78d..fd4b97f 100644 --- a/configure/setup.py +++ b/configure/setup.py @@ -16,12 +16,13 @@ This file is used by tools/build_pip_package.sh. 
""" + import platform import setuptools from setuptools.command.install import install from setuptools.dist import Distribution -_VERSION = "1.9.0" +_VERSION = "1.9.1" with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() @@ -35,6 +36,7 @@ "wheel", "wurlitzer", "tf_keras~=2.16", + "ydf", ] @@ -54,6 +56,7 @@ def has_ext_modules(self): def is_pure(self): return False + try: from wheel.bdist_wheel import bdist_wheel as _bdist_wheel diff --git a/documentation/known_issues.md b/documentation/known_issues.md index e181ae1..ab4b097 100644 --- a/documentation/known_issues.md +++ b/documentation/known_issues.md @@ -54,6 +54,7 @@ The following table shows the compatibility between tensorflow_decision_forests | tensorflow --------------------------- | --------------- +1.9.1 | 2.16.1 1.9.0 | 2.16.1 1.8.0 - 1.8.1 | 2.15.0 1.6.0 - 1.7.0 | 2.14.0 diff --git a/tensorflow_decision_forests/__init__.py b/tensorflow_decision_forests/__init__.py index e99378d..1e367c9 100644 --- a/tensorflow_decision_forests/__init__.py +++ b/tensorflow_decision_forests/__init__.py @@ -51,7 +51,7 @@ """ -__version__ = "1.9.0" +__version__ = "1.9.1" __author__ = "Mathieu Guillame-Bert" compatible_tf_versions = ["2.16.1"] diff --git a/tensorflow_decision_forests/keras/wrappers_pre_generated.py b/tensorflow_decision_forests/keras/wrappers_pre_generated.py index b7684f0..4e8acd3 100644 --- a/tensorflow_decision_forests/keras/wrappers_pre_generated.py +++ b/tensorflow_decision_forests/keras/wrappers_pre_generated.py @@ -257,6 +257,18 @@ class CartModel(core.CoreModel): expressed in seconds. Each learning algorithm is free to use this parameter at it sees fit. Enabling maximum training duration makes the model training non-deterministic. Default: -1.0. + mhld_oblique_max_num_attributes: For MHLD oblique splits i.e. + `split_axis=MHLD_OBLIQUE`. Maximum number of attributes in the projection. + Increasing this value increases the training time. 
Decreasing this value + acts as a regularization. The value should be in [2, + num_numerical_features]. If the value is above the total number of + numerical features, the value is capped automatically. The value 1 is + allowed but results in ordinary (non-oblique) splits. Default: None. + mhld_oblique_sample_attributes: For MHLD oblique splits i.e. + `split_axis=MHLD_OBLIQUE`. If true, applies the attribute sampling + controlled by the "num_candidate_attributes" or + "num_candidate_attributes_ratio" parameters. If false, all the attributes + are tested. Default: None. min_examples: Minimum number of examples in a node. Default: 5. missing_value_policy: Method used to handle missing attribute values. - `GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean @@ -345,9 +357,11 @@ class CartModel(core.CoreModel): split_axis: What structure of split to consider for numerical features. - `AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This is the "classical" way to train a tree. Default value. - `SPARSE_OBLIQUE`: - Sparse oblique splits (i.e. splits one a small number of features) from - "Sparse Projection Oblique Random Forests", Tomita et al., 2020. Default: - "AXIS_ALIGNED". + Sparse oblique splits (i.e. random splits on a small number of features) + from "Sparse Projection Oblique Random Forests", Tomita et al., 2020. - + `MHLD_OBLIQUE`: Multi-class Hellinger Linear Discriminant splits from + "Classification Based on Multivariate Contrast Patterns", Canete-Sifuentes + et al., 2019. Default: "AXIS_ALIGNED". uplift_min_examples_in_treatment: For uplift models only. Minimum number of examples per treatment in a node. Default: 5. uplift_split_score: For uplift models only. Splitter score i.e. 
score @@ -402,6 +416,8 @@ def __init__( max_num_nodes: Optional[int] = None, maximum_model_size_in_memory_in_bytes: Optional[float] = -1.0, maximum_training_duration_seconds: Optional[float] = -1.0, + mhld_oblique_max_num_attributes: Optional[int] = None, + mhld_oblique_sample_attributes: Optional[bool] = None, min_examples: Optional[int] = 5, missing_value_policy: Optional[str] = "GLOBAL_IMPUTATION", num_candidate_attributes: Optional[int] = 0, @@ -445,6 +461,8 @@ def __init__( maximum_model_size_in_memory_in_bytes ), "maximum_training_duration_seconds": maximum_training_duration_seconds, + "mhld_oblique_max_num_attributes": mhld_oblique_max_num_attributes, + "mhld_oblique_sample_attributes": mhld_oblique_sample_attributes, "min_examples": min_examples, "missing_value_policy": missing_value_policy, "num_candidate_attributes": num_candidate_attributes, @@ -1124,6 +1142,18 @@ class GradientBoostedTreesModel(core.CoreModel): expressed in seconds. Each learning algorithm is free to use this parameter at it sees fit. Enabling maximum training duration makes the model training non-deterministic. Default: -1.0. + mhld_oblique_max_num_attributes: For MHLD oblique splits i.e. + `split_axis=MHLD_OBLIQUE`. Maximum number of attributes in the projection. + Increasing this value increases the training time. Decreasing this value + acts as a regularization. The value should be in [2, + num_numerical_features]. If the value is above the total number of + numerical features, the value is capped automatically. The value 1 is + allowed but results in ordinary (non-oblique) splits. Default: None. + mhld_oblique_sample_attributes: For MHLD oblique splits i.e. + `split_axis=MHLD_OBLIQUE`. If true, applies the attribute sampling + controlled by the "num_candidate_attributes" or + "num_candidate_attributes_ratio" parameters. If false, all the attributes + are tested. Default: None. min_examples: Minimum number of examples in a node. Default: 5. 
missing_value_policy: Method used to handle missing attribute values. - `GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean @@ -1232,9 +1262,11 @@ class GradientBoostedTreesModel(core.CoreModel): split_axis: What structure of split to consider for numerical features. - `AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This is the "classical" way to train a tree. Default value. - `SPARSE_OBLIQUE`: - Sparse oblique splits (i.e. splits one a small number of features) from - "Sparse Projection Oblique Random Forests", Tomita et al., 2020. Default: - "AXIS_ALIGNED". + Sparse oblique splits (i.e. random splits on a small number of features) + from "Sparse Projection Oblique Random Forests", Tomita et al., 2020. - + `MHLD_OBLIQUE`: Multi-class Hellinger Linear Discriminant splits from + "Classification Based on Multivariate Contrast Patterns", Canete-Sifuentes + et al., 2019. Default: "AXIS_ALIGNED". subsample: Ratio of the dataset (sampling without replacement) used to train individual trees for the random sampling method. 
If \\"subsample\\" is set and if \\"sampling_method\\" is NOT set or set to \\"NONE\\", then @@ -1324,6 +1356,8 @@ def __init__( max_num_nodes: Optional[int] = None, maximum_model_size_in_memory_in_bytes: Optional[float] = -1.0, maximum_training_duration_seconds: Optional[float] = -1.0, + mhld_oblique_max_num_attributes: Optional[int] = None, + mhld_oblique_sample_attributes: Optional[bool] = None, min_examples: Optional[int] = 5, missing_value_policy: Optional[str] = "GLOBAL_IMPUTATION", num_candidate_attributes: Optional[int] = -1, @@ -1397,6 +1431,8 @@ def __init__( maximum_model_size_in_memory_in_bytes ), "maximum_training_duration_seconds": maximum_training_duration_seconds, + "mhld_oblique_max_num_attributes": mhld_oblique_max_num_attributes, + "mhld_oblique_sample_attributes": mhld_oblique_sample_attributes, "min_examples": min_examples, "missing_value_policy": missing_value_policy, "num_candidate_attributes": num_candidate_attributes, @@ -2213,6 +2249,18 @@ class RandomForestModel(core.CoreModel): expressed in seconds. Each learning algorithm is free to use this parameter at it sees fit. Enabling maximum training duration makes the model training non-deterministic. Default: -1.0. + mhld_oblique_max_num_attributes: For MHLD oblique splits i.e. + `split_axis=MHLD_OBLIQUE`. Maximum number of attributes in the projection. + Increasing this value increases the training time. Decreasing this value + acts as a regularization. The value should be in [2, + num_numerical_features]. If the value is above the total number of + numerical features, the value is capped automatically. The value 1 is + allowed but results in ordinary (non-oblique) splits. Default: None. + mhld_oblique_sample_attributes: For MHLD oblique splits i.e. + `split_axis=MHLD_OBLIQUE`. If true, applies the attribute sampling + controlled by the "num_candidate_attributes" or + "num_candidate_attributes_ratio" parameters. If false, all the attributes + are tested. Default: None. 
min_examples: Minimum number of examples in a node. Default: 5. missing_value_policy: Method used to handle missing attribute values. - `GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean @@ -2315,9 +2363,11 @@ class RandomForestModel(core.CoreModel): split_axis: What structure of split to consider for numerical features. - `AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This is the "classical" way to train a tree. Default value. - `SPARSE_OBLIQUE`: - Sparse oblique splits (i.e. splits one a small number of features) from - "Sparse Projection Oblique Random Forests", Tomita et al., 2020. Default: - "AXIS_ALIGNED". + Sparse oblique splits (i.e. random splits on a small number of features) + from "Sparse Projection Oblique Random Forests", Tomita et al., 2020. - + `MHLD_OBLIQUE`: Multi-class Hellinger Linear Discriminant splits from + "Classification Based on Multivariate Contrast Patterns", Canete-Sifuentes + et al., 2019. Default: "AXIS_ALIGNED". uplift_min_examples_in_treatment: For uplift models only. Minimum number of examples per treatment in a node. Default: 5. uplift_split_score: For uplift models only. Splitter score i.e. 
score @@ -2380,6 +2430,8 @@ def __init__( max_num_nodes: Optional[int] = None, maximum_model_size_in_memory_in_bytes: Optional[float] = -1.0, maximum_training_duration_seconds: Optional[float] = -1.0, + mhld_oblique_max_num_attributes: Optional[int] = None, + mhld_oblique_sample_attributes: Optional[bool] = None, min_examples: Optional[int] = 5, missing_value_policy: Optional[str] = "GLOBAL_IMPUTATION", num_candidate_attributes: Optional[int] = 0, @@ -2433,6 +2485,8 @@ def __init__( maximum_model_size_in_memory_in_bytes ), "maximum_training_duration_seconds": maximum_training_duration_seconds, + "mhld_oblique_max_num_attributes": mhld_oblique_max_num_attributes, + "mhld_oblique_sample_attributes": mhld_oblique_sample_attributes, "min_examples": min_examples, "missing_value_policy": missing_value_policy, "num_candidate_attributes": num_candidate_attributes, diff --git a/tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh b/tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh new file mode 100755 index 0000000..79ef913 --- /dev/null +++ b/tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright 2021 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +# Converts a non-submitted CL to a standalone Bazel project in a local +# directory, compile the project and run the tests. 
+# +# Usage example: +# third_party/tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh + +set -vex + +LOCAL_DIR="/usr/local/google/home/${USER}/git/decision-forests" + +CL=$(hg exportedcl) +echo "Current CL: ${CL}" +echo "Make sure the CL is synced!" + +function export_project() { + COPYBARA="/google/bin/releases/copybara/public/copybara/copybara" + + # Test the copy bara configuration. + bazel test third_party/tensorflow_decision_forests:copybara_test + + echo "Export a Bazel project locally" + echo "==============================" + + rm -fr ${LOCAL_DIR} + ${COPYBARA} third_party/tensorflow_decision_forests/copy.bara.sky presubmit_piper_to_gerrit ${CL} \ + --dry-run --init-history --squash --force \ + --git-destination-path ${LOCAL_DIR} --ignore-noop + + /google/bin/releases/opensource/thirdparty/cross/cross ${LOCAL_DIR} +} + +echo "Test the project" +echo "================" + +run_all() { + cd ${LOCAL_DIR} + + # Start the Docker + sudo ./tools/start_compile_docker.sh /bin/bash + + # In the docker, you can now trigger the builder with the following line in + # the docker: + # RUN_TESTS=1 PY_VERSION=3.9 TF_VERSION=2.16.1 ./tools/test_bazel.sh + + # Alternatively, you can trigger the build directly with: + # sudo ./tools/start_compile_docker.sh "RUN_TESTS=1 PY_VERSION=3.8 TF_VERSION=2.10.0 ./tools/test_bazel.sh && chmod -R a+xrw . && /bin/bash" +} + +export_project +run_all diff --git a/tools/build_pip_package.sh b/tools/build_pip_package.sh index 043c5a9..1d8a84f 100755 --- a/tools/build_pip_package.sh +++ b/tools/build_pip_package.sh @@ -116,15 +116,16 @@ function assemble_files() { # Distribution server binaries cp ${SRCBIN}/keras/grpc_worker_main ${SRCPK}/tensorflow_decision_forests/keras/ - # YDF's proto wrappers. - YDFSRCBIN="bazel-bin/external/ydf/yggdrasil_decision_forests" - mkdir -p ${SRCPK}/yggdrasil_decision_forests - pushd ${YDFSRCBIN} - find . 
-name \*.py -exec rsync -R -arv {} ${SRCPK}/yggdrasil_decision_forests \; - popd - - # Add __init__.py to all exported Yggdrasil sub-directories. - find ${SRCPK}/yggdrasil_decision_forests -type d -exec touch {}/__init__.py \; + # Note: Starting with TF-DF 1.9.1, the YDF Protos are included by (P)YDF. + # TODO: Remove this block. + # # YDF's proto wrappers. + # YDFSRCBIN="bazel-bin/external/ydf/yggdrasil_decision_forests" + # mkdir -p ${SRCPK}/yggdrasil_decision_forests + # pushd ${YDFSRCBIN} + # find . -name \*.py -exec rsync -R -arv {} ${SRCPK}/yggdrasil_decision_forests \; + # popd + # # Add __init__.py to all exported Yggdrasil sub-directories. + # find ${SRCPK}/yggdrasil_decision_forests -type d -exec touch {}/__init__.py \; } # Build a pip package. diff --git a/tools/start_compile_docker.sh b/tools/start_compile_docker.sh index df67b72..12118e8 100755 --- a/tools/start_compile_docker.sh +++ b/tools/start_compile_docker.sh @@ -58,15 +58,23 @@ # ./tools/build_pip_package.sh ALL_VERSIONS_ALREADY_ASSEMBLED # # https://hub.docker.com/r/tensorflow/build/tags?page=1 -DOCKER=tensorflow/build:2.17-python3.9 # Current directory # Useful if Yggdrasil Decision Forests is available locally in a neighbor # directory. TFDF_DIRNAME=${PWD##*/} -# Download docker -docker pull ${DOCKER} +DOCKER_IMAGE=tensorflow/build:2.16-python3.9 +DOCKER_CONTAINER=compile_tfdf + +echo "Available containers:" +sudo sudo docker container ls -a --size + +set +e # Ignore error if the container already exists +CREATE_DOCKER_FLAGS="-i -t -p 8889:8889 --network host -v ${PWD}/..:/working_dir -w /working_dir/${TFDF_DIRNAME}" +sudo docker create ${CREATE_DOCKER_FLAGS} --name ${DOCKER_CONTAINER} ${DOCKER_IMAGE} +sudo docker start ${DOCKER_CONTAINER} +set -e # Start docker -docker run -it -v ${PWD}/..:/working_dir -w /working_dir/${TFDF_DIRNAME} ${DOCKER} $@ +sudo docker exec -it ${DOCKER_CONTAINER} /bin/bash -c $@