Skip to content

Commit

Permalink
Merge branch 'WenjieDu:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
LinglongQian committed Jun 17, 2024
2 parents ea5c72e + 89f0ce0 commit cad4cfd
Show file tree
Hide file tree
Showing 13 changed files with 1,531 additions and 64 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/greetings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
steps:
- uses: actions/first-interaction@v1
with:
repo-token: ${{ secrets.ACCESS_TOKEN }}
repo-token: ${{ secrets.GITHUB_TOKEN }}
issue-message: |
Hi there 👋,
Expand All @@ -34,7 +34,7 @@ jobs:
pr-message: |
Hi there 👋,
We really really appreciate that you have taken the time to make this PR on PyPOTS' Awesome Imputation project!
We really appreciate that you have taken the time to make this PR on PyPOTS' Awesome Imputation project!
If you are trying to fix a bug, please reference the issue number in the description or give your details about the bug.
If you are implementing a feature request, please check with the maintainers that the feature will be accepted first.
Expand Down
6 changes: 3 additions & 3 deletions benchmark_code/data/dataset_generating_block05.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# License: BSD-3-Clause


from benchpots.preprocessing import (
from benchpots.datasets import (
preprocess_beijing_air_quality,
preprocess_italy_air_quality,
preprocess_electricity_load_diagrams,
Expand All @@ -15,7 +15,7 @@
)
from pypots.utils.random import set_random_seed

from dataset_generating_point01 import organize_and_save
from utils import organize_and_save

if __name__ == "__main__":
set_random_seed(2024)
Expand Down Expand Up @@ -74,7 +74,7 @@
block_len = 6
block_width = 6
ett = preprocess_ett(
set_name="ETTh1",
subset="ETTh1",
rate=rate,
n_steps=step,
pattern=pattern,
Expand Down
47 changes: 15 additions & 32 deletions benchmark_code/data/dataset_generating_point01.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@
# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

from pypots.data.saving import save_dict_into_h5
from pypots.utils.random import set_random_seed

from benchpots.preprocessing import (
from benchpots.datasets import (
preprocess_physionet2012,
preprocess_physionet2019,
preprocess_beijing_air_quality,
Expand All @@ -18,39 +15,19 @@
preprocess_pems_traffic,
preprocess_ucr_uea_datasets,
)
from pypots.utils.random import set_random_seed


def organize_and_save(data_dict, saving_dir):
train = {
"X": data_dict["train_X"],
"X_ori": data_dict["train_X_ori"] if "train_X_ori" in data_dict.keys() else "",
"y": data_dict["train_y"] if "train_y" in data_dict.keys() else "",
}
val = {
"X": data_dict["val_X"],
"X_ori": data_dict["val_X_ori"],
"y": data_dict["val_y"] if "val_y" in data_dict.keys() else "",
}
test = {
"X": data_dict["test_X"],
"X_ori": data_dict["test_X_ori"],
"y": data_dict["test_y"] if "test_y" in data_dict.keys() else "",
}
save_dict_into_h5(train, saving_dir, "train.h5")
save_dict_into_h5(val, saving_dir, "val.h5")
save_dict_into_h5(test, saving_dir, "test.h5")
print("\n\n\n")

from utils import organize_and_save

if __name__ == "__main__":
set_random_seed(2024)
rate = 0.1
pattern = "point"

physionet_2012 = preprocess_physionet2012(
subset="set-a",
rate=rate,
pattern="point",
subset="set-a",
features=[
"DiasABP",
"HR",
Expand Down Expand Up @@ -93,7 +70,9 @@ def organize_and_save(data_dict, saving_dir):

step = 24
beijing_air_quality = preprocess_beijing_air_quality(
rate=rate, n_steps=step, pattern=pattern
rate=rate,
n_steps=step,
pattern=pattern,
)
organize_and_save(
beijing_air_quality,
Expand All @@ -102,7 +81,9 @@ def organize_and_save(data_dict, saving_dir):

step = 12
italy_air_quality = preprocess_italy_air_quality(
rate=rate, n_steps=step, pattern=pattern
rate=rate,
n_steps=step,
pattern=pattern,
)
organize_and_save(
italy_air_quality,
Expand All @@ -122,7 +103,7 @@ def organize_and_save(data_dict, saving_dir):

step = 48
ett = preprocess_ett(
set_name="ETTh1",
subset="ETTh1",
rate=rate,
n_steps=step,
pattern=pattern,
Expand All @@ -145,7 +126,7 @@ def organize_and_save(data_dict, saving_dir):

step = 24
melbourne_pedestrian = preprocess_ucr_uea_datasets(
"ucr_uea_MelbournePedestrian",
dataset_name="ucr_uea_MelbournePedestrian",
rate=rate,
)
organize_and_save(
Expand All @@ -154,7 +135,9 @@ def organize_and_save(data_dict, saving_dir):
)

physionet_2019 = preprocess_physionet2019(
rate=rate, pattern="point", subset="training_setA"
subset="training_setA",
rate=rate,
pattern="point",
)
organize_and_save(
physionet_2019, "generated_datasets/physionet_2019_rate01_step48_point"
Expand Down
12 changes: 6 additions & 6 deletions benchmark_code/data/dataset_generating_point05.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

from pypots.utils.random import set_random_seed

from benchpots.preprocessing import (
from benchpots.datasets import (
preprocess_beijing_air_quality,
preprocess_italy_air_quality,
preprocess_electricity_load_diagrams,
preprocess_ett,
preprocess_pems_traffic,
preprocess_ucr_uea_datasets,
)
from dataset_generating_point01 import organize_and_save
from pypots.utils.random import set_random_seed

from utils import organize_and_save

if __name__ == "__main__":
set_random_seed(2024)
Expand Down Expand Up @@ -53,7 +53,7 @@

step = 48
ett = preprocess_ett(
set_name="ETTh1",
subset="ETTh1",
rate=rate,
n_steps=step,
pattern=pattern,
Expand All @@ -76,7 +76,7 @@

step = 24
melbourne_pedestrian = preprocess_ucr_uea_datasets(
"ucr_uea_MelbournePedestrian",
dataset_name="ucr_uea_MelbournePedestrian",
rate=rate,
)
organize_and_save(
Expand Down
20 changes: 12 additions & 8 deletions benchmark_code/data/dataset_generating_point09.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

from pypots.utils.random import set_random_seed

from benchpots.preprocessing import (
from benchpots.datasets import (
preprocess_beijing_air_quality,
preprocess_italy_air_quality,
preprocess_electricity_load_diagrams,
preprocess_ett,
preprocess_pems_traffic,
preprocess_ucr_uea_datasets,
)
from dataset_generating_point01 import organize_and_save
from pypots.utils.random import set_random_seed

from utils import organize_and_save

if __name__ == "__main__":
set_random_seed(2024)
Expand All @@ -24,7 +24,9 @@

step = 24
beijing_air_quality = preprocess_beijing_air_quality(
rate=rate, n_steps=step, pattern=pattern
rate=rate,
n_steps=step,
pattern=pattern,
)
organize_and_save(
beijing_air_quality,
Expand All @@ -33,7 +35,9 @@

step = 12
italy_air_quality = preprocess_italy_air_quality(
rate=rate, n_steps=step, pattern=pattern
rate=rate,
n_steps=step,
pattern=pattern,
)
organize_and_save(
italy_air_quality,
Expand All @@ -53,7 +57,7 @@

step = 48
ett = preprocess_ett(
set_name="ETTh1",
subset="ETTh1",
rate=rate,
n_steps=step,
pattern=pattern,
Expand All @@ -76,7 +80,7 @@

step = 24
melbourne_pedestrian = preprocess_ucr_uea_datasets(
"ucr_uea_MelbournePedestrian",
dataset_name="ucr_uea_MelbournePedestrian",
rate=rate,
)
organize_and_save(
Expand Down
12 changes: 6 additions & 6 deletions benchmark_code/data/dataset_generating_subseq05.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,17 @@
# License: BSD-3-Clause


from pypots.utils.random import set_random_seed

from benchpots.preprocessing import (
from benchpots.datasets import (
preprocess_beijing_air_quality,
preprocess_italy_air_quality,
preprocess_electricity_load_diagrams,
preprocess_ett,
preprocess_pems_traffic,
preprocess_ucr_uea_datasets,
)
from dataset_generating_point01 import organize_and_save
from pypots.utils.random import set_random_seed

from utils import organize_and_save

if __name__ == "__main__":
set_random_seed(2024)
Expand Down Expand Up @@ -56,7 +56,7 @@
step = 48
seq_len = 36
ett = preprocess_ett(
set_name="ETTh1",
subset="ETTh1",
rate=rate,
n_steps=step,
pattern=pattern,
Expand Down Expand Up @@ -84,7 +84,7 @@
step = 24
seq_len = 18
melbourne_pedestrian = preprocess_ucr_uea_datasets(
"ucr_uea_MelbournePedestrian",
dataset_name="ucr_uea_MelbournePedestrian",
rate=rate,
pattern=pattern,
**{"seq_len": seq_len},
Expand Down
30 changes: 30 additions & 0 deletions benchmark_code/data/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
"""

# Created by Wenjie Du <[email protected]>
# License: BSD-3-Clause

from pypots.data.saving import save_dict_into_h5


def organize_and_save(data_dict, saving_dir):
    """Split a benchpots-style result dict into train/val/test dicts and save each as HDF5.

    Parameters
    ----------
    data_dict : dict
        Output of a benchpots ``preprocess_*`` function. Must contain
        "train_X", "val_X", "val_X_ori", "test_X", "test_X_ori";
        "train_X_ori" and the "*_y" label keys are optional.
    saving_dir : str
        Directory where ``train.h5`` / ``val.h5`` / ``test.h5`` are written.
    """
    # Optional keys fall back to "" so every saved file keeps the same
    # {"X", "X_ori", "y"} schema even for unlabeled datasets.
    train = {
        "X": data_dict["train_X"],
        "X_ori": data_dict.get("train_X_ori", ""),
        "y": data_dict.get("train_y", ""),
    }
    val = {
        "X": data_dict["val_X"],
        "X_ori": data_dict["val_X_ori"],  # required: val/test always carry X_ori
        "y": data_dict.get("val_y", ""),
    }
    test = {
        "X": data_dict["test_X"],
        "X_ori": data_dict["test_X_ori"],
        "y": data_dict.get("test_y", ""),
    }
    save_dict_into_h5(train, saving_dir, "train.h5")
    save_dict_into_h5(val, saving_dir, "val.h5")
    save_dict_into_h5(test, saving_dir, "test.h5")
    print("\n\n\n")  # blank lines to visually separate successive dataset logs
35 changes: 30 additions & 5 deletions benchmark_code/downstream_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,23 @@
from global_config import RANDOM_SEEDS


def calc_multiclass_classification_metrics(proba_predictions, y_true, n_classes):
    """Compute support-weighted one-vs-rest PR-AUC and ROC-AUC for multiclass predictions.

    Each class ``i`` is scored as a binary problem (class ``i`` vs the rest)
    and the per-class AUCs are averaged with weights equal to class support.

    Parameters
    ----------
    proba_predictions : ndarray of shape (n_samples, n_classes)
        Predicted class probabilities; column ``i`` scores class ``i``.
    y_true : ndarray of shape (n_samples,)
        Integer class labels in ``[0, n_classes)``.
    n_classes : int
        Number of classes to iterate over.

    Returns
    -------
    tuple of float
        ``(weighted_pr_auc, weighted_roc_auc)``.
    """
    weighted_roc_auc = 0
    weighted_pr_auc = 0
    for i in range(n_classes):
        # Hoist the one-vs-rest mask and its support count: the original
        # recomputed `y_true == i` three times per iteration.
        is_class_i = y_true == i
        support = is_class_i.sum()
        result_metrics = calc_binary_classification_metrics(
            proba_predictions[:, i], is_class_i
        )
        weighted_pr_auc += result_metrics["pr_auc"] * support
        weighted_roc_auc += result_metrics["roc_auc"] * support
    weighted_roc_auc /= len(y_true)
    weighted_pr_auc /= len(y_true)
    return weighted_pr_auc, weighted_roc_auc


class LoadImputedDataAndLabel(Dataset):
def __init__(self, imputed_data, labels):
self.imputed_data = imputed_data
Expand Down Expand Up @@ -269,11 +286,13 @@ def get_dataloaders(train_X, train_y, val_X, val_y, test_X, test_y, batch_size=1
classification_metrics["roc_auc"],
)
else:
pr_auc, roc_auc = None, None
pr_auc, roc_auc = calc_multiclass_classification_metrics(
proba_predictions, test_y, args.n_classes
)
xgb_wo_pr_auc_collector.append(pr_auc)
xgb_wo_roc_auc_collector.append(roc_auc)

# XGBoost model without imputation
# XGBoost model with imputation
xgb = XGBClassifier()
xgb.fit(
train_X.reshape(-1, n_flatten_features),
Expand All @@ -291,7 +310,9 @@ def get_dataloaders(train_X, train_y, val_X, val_y, test_X, test_y, batch_size=1
classification_metrics["roc_auc"],
)
else:
pr_auc, roc_auc = None, None
pr_auc, roc_auc = calc_multiclass_classification_metrics(
proba_predictions, test_y, args.n_classes
)
xgb_pr_auc_collector.append(pr_auc)
xgb_roc_auc_collector.append(roc_auc)

Expand All @@ -312,7 +333,9 @@ def get_dataloaders(train_X, train_y, val_X, val_y, test_X, test_y, batch_size=1
classification_metrics["roc_auc"],
)
else:
pr_auc, roc_auc = None, None
pr_auc, roc_auc = calc_multiclass_classification_metrics(
proba_predictions, test_y, args.n_classes
)
rnn_pr_auc_collector.append(pr_auc)
rnn_roc_auc_collector.append(roc_auc)

Expand All @@ -339,7 +362,9 @@ def get_dataloaders(train_X, train_y, val_X, val_y, test_X, test_y, batch_size=1
classification_metrics["roc_auc"],
)
else:
pr_auc, roc_auc = None, None
pr_auc, roc_auc = calc_multiclass_classification_metrics(
proba_predictions, test_y, args.n_classes
)
transformer_pr_auc_collector.append(pr_auc)
transformer_roc_auc_collector.append(roc_auc)

Expand Down
Loading

0 comments on commit cad4cfd

Please sign in to comment.