Release YDF 0.4.3 and TF-DF 1.9.1
PiperOrigin-RevId: 631693889
achoum authored and Copybara-Service committed May 8, 2024
1 parent 12f70f5 commit ca91a9a
Showing 8 changed files with 164 additions and 24 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## 1.9.1 - 2024-05-07

### Fix

- Solve dependency collision of YDF Proto between PYDF and TF-DF.

## 1.9.0 - 2024-03-12

### Fix
5 changes: 4 additions & 1 deletion configure/setup.py
@@ -16,12 +16,13 @@
This file is used by tools/build_pip_package.sh.
"""

import platform
import setuptools
from setuptools.command.install import install
from setuptools.dist import Distribution

_VERSION = "1.9.0"
_VERSION = "1.9.1"

with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
@@ -35,6 +36,7 @@
"wheel",
"wurlitzer",
"tf_keras~=2.16",
"ydf",
]


@@ -54,6 +56,7 @@ def has_ext_modules(self):
def is_pure(self):
return False


try:
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel

1 change: 1 addition & 0 deletions documentation/known_issues.md
@@ -54,6 +54,7 @@ The following table shows the compatibility between

tensorflow_decision_forests | tensorflow
--------------------------- | ---------------
1.9.1 | 2.16.1
1.9.0 | 2.16.1
1.8.0 - 1.8.1 | 2.15.0
1.6.0 - 1.7.0 | 2.14.0
2 changes: 1 addition & 1 deletion tensorflow_decision_forests/__init__.py
@@ -51,7 +51,7 @@
"""

__version__ = "1.9.0"
__version__ = "1.9.1"
__author__ = "Mathieu Guillame-Bert"

compatible_tf_versions = ["2.16.1"]
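Not part of the diff: a minimal sketch of how the version bump and the `compatible_tf_versions` list updated above can be inspected at runtime. It assumes tensorflow_decision_forests 1.9.1 and tensorflow 2.16.1 are installed, per the compatibility table in known_issues.md.

```python
# Minimal sketch (not part of this commit): print the versions touched by
# this release. Assumes TF-DF 1.9.1 and TensorFlow 2.16.1 are installed.
import tensorflow as tf
import tensorflow_decision_forests as tfdf

print("TF-DF:", tfdf.__version__)                      # expected "1.9.1"
print("TensorFlow:", tf.__version__)                   # expected "2.16.1"
print("Compatible TF versions:", tfdf.compatible_tf_versions)
```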
72 changes: 63 additions & 9 deletions tensorflow_decision_forests/keras/wrappers_pre_generated.py
@@ -257,6 +257,18 @@ class CartModel(core.CoreModel):
expressed in seconds. Each learning algorithm is free to use this
parameter as it sees fit. Enabling maximum training duration makes the
model training non-deterministic. Default: -1.0.
mhld_oblique_max_num_attributes: For MHLD oblique splits i.e.
`split_axis=MHLD_OBLIQUE`. Maximum number of attributes in the projection.
Increasing this value increases the training time. Decreasing this value
acts as a regularization. The value should be in [2,
num_numerical_features]. If the value is above the total number of
numerical features, the value is capped automatically. The value 1 is
allowed but results in ordinary (non-oblique) splits. Default: None.
mhld_oblique_sample_attributes: For MHLD oblique splits i.e.
`split_axis=MHLD_OBLIQUE`. If true, applies the attribute sampling
controlled by the "num_candidate_attributes" or
"num_candidate_attributes_ratio" parameters. If false, all the attributes
are tested. Default: None.
min_examples: Minimum number of examples in a node. Default: 5.
missing_value_policy: Method used to handle missing attribute values. -
`GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean
@@ -345,9 +357,11 @@ class CartModel(core.CoreModel):
split_axis: What structure of split to consider for numerical features. -
`AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This
is the "classical" way to train a tree. Default value. - `SPARSE_OBLIQUE`:
Sparse oblique splits (i.e. splits on a small number of features) from
"Sparse Projection Oblique Random Forests", Tomita et al., 2020. Default:
"AXIS_ALIGNED".
Sparse oblique splits (i.e. random splits on a small number of features)
from "Sparse Projection Oblique Random Forests", Tomita et al., 2020. -
`MHLD_OBLIQUE`: Multi-class Hellinger Linear Discriminant splits from
"Classification Based on Multivariate Contrast Patterns", Canete-Sifuentes
et al., 2019. Default: "AXIS_ALIGNED".
uplift_min_examples_in_treatment: For uplift models only. Minimum number of
examples per treatment in a node. Default: 5.
uplift_split_score: For uplift models only. Splitter score i.e. score
@@ -402,6 +416,8 @@ def __init__(
max_num_nodes: Optional[int] = None,
maximum_model_size_in_memory_in_bytes: Optional[float] = -1.0,
maximum_training_duration_seconds: Optional[float] = -1.0,
mhld_oblique_max_num_attributes: Optional[int] = None,
mhld_oblique_sample_attributes: Optional[bool] = None,
min_examples: Optional[int] = 5,
missing_value_policy: Optional[str] = "GLOBAL_IMPUTATION",
num_candidate_attributes: Optional[int] = 0,
@@ -445,6 +461,8 @@ def __init__(
maximum_model_size_in_memory_in_bytes
),
"maximum_training_duration_seconds": maximum_training_duration_seconds,
"mhld_oblique_max_num_attributes": mhld_oblique_max_num_attributes,
"mhld_oblique_sample_attributes": mhld_oblique_sample_attributes,
"min_examples": min_examples,
"missing_value_policy": missing_value_policy,
"num_candidate_attributes": num_candidate_attributes,
@@ -1124,6 +1142,18 @@ class GradientBoostedTreesModel(core.CoreModel):
expressed in seconds. Each learning algorithm is free to use this
parameter as it sees fit. Enabling maximum training duration makes the
model training non-deterministic. Default: -1.0.
mhld_oblique_max_num_attributes: For MHLD oblique splits i.e.
`split_axis=MHLD_OBLIQUE`. Maximum number of attributes in the projection.
Increasing this value increases the training time. Decreasing this value
acts as a regularization. The value should be in [2,
num_numerical_features]. If the value is above the total number of
numerical features, the value is capped automatically. The value 1 is
allowed but results in ordinary (non-oblique) splits. Default: None.
mhld_oblique_sample_attributes: For MHLD oblique splits i.e.
`split_axis=MHLD_OBLIQUE`. If true, applies the attribute sampling
controlled by the "num_candidate_attributes" or
"num_candidate_attributes_ratio" parameters. If false, all the attributes
are tested. Default: None.
min_examples: Minimum number of examples in a node. Default: 5.
missing_value_policy: Method used to handle missing attribute values. -
`GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean
@@ -1232,9 +1262,11 @@ class GradientBoostedTreesModel(core.CoreModel):
split_axis: What structure of split to consider for numerical features. -
`AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This
is the "classical" way to train a tree. Default value. - `SPARSE_OBLIQUE`:
Sparse oblique splits (i.e. splits on a small number of features) from
"Sparse Projection Oblique Random Forests", Tomita et al., 2020. Default:
"AXIS_ALIGNED".
Sparse oblique splits (i.e. random splits on a small number of features)
from "Sparse Projection Oblique Random Forests", Tomita et al., 2020. -
`MHLD_OBLIQUE`: Multi-class Hellinger Linear Discriminant splits from
"Classification Based on Multivariate Contrast Patterns", Canete-Sifuentes
et al., 2019. Default: "AXIS_ALIGNED".
subsample: Ratio of the dataset (sampling without replacement) used to train
individual trees for the random sampling method. If \\"subsample\\" is set
and if \\"sampling_method\\" is NOT set or set to \\"NONE\\", then
@@ -1324,6 +1356,8 @@ def __init__(
max_num_nodes: Optional[int] = None,
maximum_model_size_in_memory_in_bytes: Optional[float] = -1.0,
maximum_training_duration_seconds: Optional[float] = -1.0,
mhld_oblique_max_num_attributes: Optional[int] = None,
mhld_oblique_sample_attributes: Optional[bool] = None,
min_examples: Optional[int] = 5,
missing_value_policy: Optional[str] = "GLOBAL_IMPUTATION",
num_candidate_attributes: Optional[int] = -1,
@@ -1397,6 +1431,8 @@ def __init__(
maximum_model_size_in_memory_in_bytes
),
"maximum_training_duration_seconds": maximum_training_duration_seconds,
"mhld_oblique_max_num_attributes": mhld_oblique_max_num_attributes,
"mhld_oblique_sample_attributes": mhld_oblique_sample_attributes,
"min_examples": min_examples,
"missing_value_policy": missing_value_policy,
"num_candidate_attributes": num_candidate_attributes,
@@ -2213,6 +2249,18 @@ class RandomForestModel(core.CoreModel):
expressed in seconds. Each learning algorithm is free to use this
parameter as it sees fit. Enabling maximum training duration makes the
model training non-deterministic. Default: -1.0.
mhld_oblique_max_num_attributes: For MHLD oblique splits i.e.
`split_axis=MHLD_OBLIQUE`. Maximum number of attributes in the projection.
Increasing this value increases the training time. Decreasing this value
acts as a regularization. The value should be in [2,
num_numerical_features]. If the value is above the total number of
numerical features, the value is capped automatically. The value 1 is
allowed but results in ordinary (non-oblique) splits. Default: None.
mhld_oblique_sample_attributes: For MHLD oblique splits i.e.
`split_axis=MHLD_OBLIQUE`. If true, applies the attribute sampling
controlled by the "num_candidate_attributes" or
"num_candidate_attributes_ratio" parameters. If false, all the attributes
are tested. Default: None.
min_examples: Minimum number of examples in a node. Default: 5.
missing_value_policy: Method used to handle missing attribute values. -
`GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean
@@ -2315,9 +2363,11 @@ class RandomForestModel(core.CoreModel):
split_axis: What structure of split to consider for numerical features. -
`AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This
is the "classical" way to train a tree. Default value. - `SPARSE_OBLIQUE`:
Sparse oblique splits (i.e. splits on a small number of features) from
"Sparse Projection Oblique Random Forests", Tomita et al., 2020. Default:
"AXIS_ALIGNED".
Sparse oblique splits (i.e. random splits on a small number of features)
from "Sparse Projection Oblique Random Forests", Tomita et al., 2020. -
`MHLD_OBLIQUE`: Multi-class Hellinger Linear Discriminant splits from
"Classification Based on Multivariate Contrast Patterns", Canete-Sifuentes
et al., 2019. Default: "AXIS_ALIGNED".
uplift_min_examples_in_treatment: For uplift models only. Minimum number of
examples per treatment in a node. Default: 5.
uplift_split_score: For uplift models only. Splitter score i.e. score
@@ -2380,6 +2430,8 @@ def __init__(
max_num_nodes: Optional[int] = None,
maximum_model_size_in_memory_in_bytes: Optional[float] = -1.0,
maximum_training_duration_seconds: Optional[float] = -1.0,
mhld_oblique_max_num_attributes: Optional[int] = None,
mhld_oblique_sample_attributes: Optional[bool] = None,
min_examples: Optional[int] = 5,
missing_value_policy: Optional[str] = "GLOBAL_IMPUTATION",
num_candidate_attributes: Optional[int] = 0,
@@ -2433,6 +2485,8 @@ def __init__(
maximum_model_size_in_memory_in_bytes
),
"maximum_training_duration_seconds": maximum_training_duration_seconds,
"mhld_oblique_max_num_attributes": mhld_oblique_max_num_attributes,
"mhld_oblique_sample_attributes": mhld_oblique_sample_attributes,
"min_examples": min_examples,
"missing_value_policy": missing_value_policy,
"num_candidate_attributes": num_candidate_attributes,
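Not part of the diff: a minimal sketch of how the newly exposed MHLD oblique hyperparameters could be passed to one of the wrapped learners. The DataFrame `train_df`, its file `train.csv`, and the "label" column are illustrative assumptions; the keyword arguments are the ones added in this change.

```python
# Sketch under the assumptions above: enable MHLD oblique splits. CartModel,
# RandomForestModel, and GradientBoostedTreesModel accept the same arguments.
import pandas as pd
import tensorflow_decision_forests as tfdf

train_df = pd.read_csv("train.csv")  # hypothetical dataset with a "label" column
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="label")

model = tfdf.keras.GradientBoostedTreesModel(
    split_axis="MHLD_OBLIQUE",            # use MHLD oblique splits
    mhld_oblique_max_num_attributes=4,    # cap attributes per projection
    mhld_oblique_sample_attributes=True,  # reuse num_candidate_attributes sampling
)
model.fit(train_ds)
```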
67 changes: 67 additions & 0 deletions tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh
@@ -0,0 +1,67 @@
#!/bin/bash
# Copyright 2021 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



# Converts a non-submitted CL to a standalone Bazel project in a local
# directory, compiles the project, and runs the tests.
#
# Usage example:
# third_party/tensorflow_decision_forests/tools/run_e2e_tfdf_test.sh

set -vex

LOCAL_DIR="/usr/local/google/home/${USER}/git/decision-forests"

CL=$(hg exportedcl)
echo "Current CL: ${CL}"
echo "Make sure the CL is synced!"

function export_project() {
COPYBARA="/google/bin/releases/copybara/public/copybara/copybara"

# Test the Copybara configuration.
bazel test third_party/tensorflow_decision_forests:copybara_test

echo "Export a Bazel project locally"
echo "=============================="

rm -fr ${LOCAL_DIR}
${COPYBARA} third_party/tensorflow_decision_forests/copy.bara.sky presubmit_piper_to_gerrit ${CL} \
--dry-run --init-history --squash --force \
--git-destination-path ${LOCAL_DIR} --ignore-noop

/google/bin/releases/opensource/thirdparty/cross/cross ${LOCAL_DIR}
}

echo "Test the project"
echo "================"

run_all() {
cd ${LOCAL_DIR}

# Start the Docker
sudo ./tools/start_compile_docker.sh /bin/bash

# Inside the Docker container, you can now trigger the build with the
# following line:
# RUN_TESTS=1 PY_VERSION=3.9 TF_VERSION=2.16.1 ./tools/test_bazel.sh

# Alternatively, you can trigger the build directly with:
# sudo ./tools/start_compile_docker.sh "RUN_TESTS=1 PY_VERSION=3.8 TF_VERSION=2.10.0 ./tools/test_bazel.sh && chmod -R a+xrw . && /bin/bash"
}

export_project
run_all
19 changes: 10 additions & 9 deletions tools/build_pip_package.sh
@@ -116,15 +116,16 @@ function assemble_files() {
# Distribution server binaries
cp ${SRCBIN}/keras/grpc_worker_main ${SRCPK}/tensorflow_decision_forests/keras/

# YDF's proto wrappers.
YDFSRCBIN="bazel-bin/external/ydf/yggdrasil_decision_forests"
mkdir -p ${SRCPK}/yggdrasil_decision_forests
pushd ${YDFSRCBIN}
find . -name \*.py -exec rsync -R -arv {} ${SRCPK}/yggdrasil_decision_forests \;
popd

# Add __init__.py to all exported Yggdrasil sub-directories.
find ${SRCPK}/yggdrasil_decision_forests -type d -exec touch {}/__init__.py \;
# Note: Starting with TF-DF 1.9.1, the YDF protos are provided by (P)YDF.
# TODO: Remove this block.
# # YDF's proto wrappers.
# YDFSRCBIN="bazel-bin/external/ydf/yggdrasil_decision_forests"
# mkdir -p ${SRCPK}/yggdrasil_decision_forests
# pushd ${YDFSRCBIN}
# find . -name \*.py -exec rsync -R -arv {} ${SRCPK}/yggdrasil_decision_forests \;
# popd
# # Add __init__.py to all exported Yggdrasil sub-directories.
# find ${SRCPK}/yggdrasil_decision_forests -type d -exec touch {}/__init__.py \;
}

# Build a pip package.
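Context for the block commented out above (not part of the diff): since the proto wrappers are now provided by the `ydf` pip dependency instead of being bundled by TF-DF, importing a YDF proto should resolve against a single copy. The specific proto module below is an illustrative choice, not dictated by this commit.

```python
# Sketch: after installing tensorflow_decision_forests 1.9.1 (which now pulls
# in `ydf`), the yggdrasil_decision_forests proto wrappers are expected to be
# importable from the ydf-provided package rather than a TF-DF-bundled copy.
from yggdrasil_decision_forests.dataset import data_spec_pb2

spec = data_spec_pb2.DataSpecification()
print(spec.ByteSize())  # 0 for an empty proto
```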
16 changes: 12 additions & 4 deletions tools/start_compile_docker.sh
@@ -58,15 +58,23 @@
# ./tools/build_pip_package.sh ALL_VERSIONS_ALREADY_ASSEMBLED
#
# https://hub.docker.com/r/tensorflow/build/tags?page=1
DOCKER=tensorflow/build:2.17-python3.9

# Current directory
# Useful if Yggdrasil Decision Forests is available locally in a neighbor
# directory.
TFDF_DIRNAME=${PWD##*/}

# Download docker
docker pull ${DOCKER}
DOCKER_IMAGE=tensorflow/build:2.16-python3.9
DOCKER_CONTAINER=compile_tfdf

echo "Available containers:"
sudo docker container ls -a --size

set +e # Ignore errors if the container already exists
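# The container is created once and reused on later runs; "docker start" and
# "docker exec" below operate on the existing container.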
CREATE_DOCKER_FLAGS="-i -t -p 8889:8889 --network host -v ${PWD}/..:/working_dir -w /working_dir/${TFDF_DIRNAME}"
sudo docker create ${CREATE_DOCKER_FLAGS} --name ${DOCKER_CONTAINER} ${DOCKER_IMAGE}
sudo docker start ${DOCKER_CONTAINER}
set -e

# Start docker
docker run -it -v ${PWD}/..:/working_dir -w /working_dir/${TFDF_DIRNAME} ${DOCKER} $@
sudo docker exec -it ${DOCKER_CONTAINER} /bin/bash -c $@
