Skip to content

Commit

Permalink
higgs benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremiedb committed Oct 31, 2023
1 parent 6da57c7 commit cc1466e
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 6 deletions.
12 changes: 6 additions & 6 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
EvoTreesCUDAExt = "CUDA"

[compat]
BSON = "0.3"
CUDA = "3.0, 4.0, 5.0"
Expand All @@ -29,9 +35,6 @@ StatsBase = "0.32, 0.33, 0.34"
Tables = "1.9"
julia = "1.6"

[extensions]
EvoTreesCUDAExt = "CUDA"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Expand All @@ -43,6 +46,3 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
docs = ["Documenter"]
test = ["CUDA", "DataFrames", "Test", "MLJBase", "MLJTestInterface"]

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
93 changes: 93 additions & 0 deletions benchmarks/Higgs-logloss.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
using Revise
using Random
using CSV
using DataFrames
using StatsBase
using Statistics: mean, std
using EvoTrees
using Solage: Connectors
using AWS: AWSCredentials, AWSConfig, @service

# --- Data acquisition: pull the HIGGS dataset (Arrow format) from a private S3 bucket.
@service S3
# Credentials are read from dedicated environment variables; region is fixed.
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1")
bucket = "jeremiedb"

path = "share/data/higgs/HIGGS.arrow"
# Solage.Connectors is a project-local helper; presumably returns a DataFrame — TODO confirm.
df_tot = Connectors.read_arrow_aws(path; bucket="jeremiedb", aws_config)

# The first column of the HIGGS dataset is the binary target; remaining columns are features.
rename!(df_tot, "Column1" => "y")
feature_names = setdiff(names(df_tot), ["y"])
target_name = "y"

"""
    percent_rank(x::AbstractVector)

Map `x` to percentile ranks in the open interval (0, 1): each element's
tied rank (StatsBase `tiedrank`, where tied values share the average of
their ranks) divided by `length(x) + 1`.
"""
percent_rank(x::AbstractVector) = tiedrank(x) / (length(x) + 1)

# Rank-normalize every feature column in place to (0, 1) via percent_rank.
transform!(df_tot, feature_names .=> percent_rank .=> feature_names)

# Holdout split: the last 500k rows serve as BOTH the eval and test sets
# (identical row ranges) — everything before them is the training set.
dtrain = df_tot[1:end-500_000, :];
deval = df_tot[end-500_000+1:end, :];
dtest = df_tot[end-500_000+1:end, :];

# --- EvoTrees: binary-classification setup (logloss objective) trained on GPU.
config = EvoTreeRegressor(
loss=:logloss,
nrounds=5000,         # upper bound; early stopping below typically halts sooner
eta=0.15,             # learning rate
nbins=128,            # histogram bins (matches XGBoost max_bin below)
max_depth=9,
lambda=1.0,           # L2 regularization
gamma=0.0,            # min loss reduction to split
rowsample=0.8,
colsample=0.8,
min_weight=1,
rng=123,              # fixed seed for reproducibility
)

device = "gpu"
metric = "logloss"
# Fit with early stopping monitored on `deval`; timing logged for the benchmark.
@time m_evo = fit_evotree(config, dtrain; target_name, fnames=feature_names, deval, metric, device, early_stopping_rounds=200, print_every_n=100);

# --- EvoTrees: evaluate predicted probabilities on the test split.
p_test = m_evo(dtest);
@info extrema(p_test)
# Binary logloss: mean of -[y*log(p) + (1-y)*log(1-p)], written here as
# -y*log(p) + (y-1)*log(1-p).
logloss_test = mean(-dtest.y .* log.(p_test) .+ (dtest.y .- 1) .* log.(1 .- p_test))
@info "LogLoss - dtest" logloss_test
# Misclassification rate at a 0.5 decision threshold.
error_test = 1 - mean(round.(Int, p_test) .== dtest.y)
@info "ERROR - dtest" error_test
# Recorded benchmark results:
# ┌ Info: LogLoss - dtest
# └ logloss_test = 0.4716574579097044
# ┌ Info: ERROR - dtest
# └ error_test = 0.229522

# --- XGBoost: reference implementation for the same task, GPU histogram method.
@info "XGBoost"
@info "train"
using XGBoost
params_xgb = Dict(
:num_round => 4000,
:max_depth => 8,      # NOTE(review): 8 here vs 9 for EvoTrees — presumably to
                      # account for differing depth conventions between the two
                      # libraries; TODO confirm
:eta => 0.15,
:objective => "reg:logistic",
:print_every_n => 5,
:gamma => 0,
:lambda => 1,
:subsample => 0.8,
:colsample_bytree => 0.8,
:tree_method => "gpu_hist", # hist/gpu_hist
:max_bin => 128,            # matches EvoTrees nbins=128
)

# Wrap training data and eval watchlist in XGBoost's DMatrix format.
dtrain_xgb = DMatrix(select(dtrain, feature_names), dtrain.y)
watchlist = Dict("eval" => DMatrix(select(deval, feature_names), deval.y));
@time m_xgb = xgboost(dtrain_xgb; watchlist, nthread=Threads.nthreads(), verbosity=0, eval_metric="logloss", params_xgb...);

# --- XGBoost: evaluate predicted probabilities on the test split.
# FIX: the original predicted on `deval` while labeling and scoring the metrics
# against `dtest` — correct only by accident because the two splits are the same
# row range above. Predict on `dtest` so the reported "dtest" metrics always
# match the data they are computed against; results are unchanged here.
pred_xgb = XGBoost.predict(m_xgb, DMatrix(select(dtest, feature_names)));
@info extrema(pred_xgb)
# (1.9394008f-6, 0.9999975f0)
# Binary logloss, same formulation as the EvoTrees section above.
logloss_test = mean(-dtest.y .* log.(pred_xgb) .+ (dtest.y .- 1) .* log.(1 .- pred_xgb))
@info "LogLoss - dtest" logloss_test
# Misclassification rate at a 0.5 decision threshold.
error_test = 1 - mean(round.(Int, pred_xgb) .== dtest.y)
@info "ERROR - xgb test" error_test
# Recorded benchmark results:
# ┌ Info: LogLoss - dtest
# └ logloss_test = 0.4710665675338929
# ┌ Info: ERROR - xgb test
# └ error_test = 0.22987999999999997

0 comments on commit cc1466e

Please sign in to comment.