-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml_sdms_predict.py
114 lines (98 loc) · 4.52 KB
/
ml_sdms_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 15 11:26:05 2020
@author: danielfurman
"""
# Now that we have trained our ML classifiers, we are ready to deploy the .pkl
# files we saved from PyCaret's training and examine the model predictions
# on the validation set and examine the models' performance. We first print
# validation set accuracy, as well as the F statistic and the 2x2 confusion
# matrix. Finally, we visualize the AUC statistic with a ROC curve for each
# model. We also examine the results of a blended model constructed from the
# aforementioned five most predictive learners.
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn.metrics import roc_curve, auc, f1_score
from pycaret.classification import load_model
import warnings
import numpy as np
import pandas as pd
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.inspection import permutation_importance
warnings.filterwarnings("ignore")
# Registry of deployed classifiers: display name -> (ROC line style, estimator).
# The models were trained and saved to .pkl by ML_sdms_train.py.
_MODEL_DIR = 'classifier_models(pkl)'


def _deploy(stem):
    # load_model() returns the full saved PyCaret pipeline; element [23] is
    # extracted from it (presumably the fitted estimator at the end of the
    # pipeline -- TODO confirm against the saved pipeline's actual length).
    return load_model(_MODEL_DIR + '/' + stem)[23]


CLASS_MAP = {
    'Random Forest': ('-.', _deploy('xant_rf')),
    # 'Catboost': ('-.', _deploy('xant_cboost')),
    'LGBoost Machine': ('-.', _deploy('xant_lgbm')),
    'Extra Trees': ('-.', _deploy('xant_etrees')),
    # 'XGBoost': ('-.', _deploy('xant_xgb')),
    # 'Logistic Regression': ('-.', _deploy('xant_log')),
    'Blend (rf, et, lgbm)': ('-', _deploy('xant_blended')),
}
# Load the 80% training split and the 20% held-out validation split.
# Column 'pa' holds the presence/absence label; every other column is a
# bioclim predictor.
env_data = pd.read_csv('data_2.0/envtrain_xv_test.csv')
env_data_test = pd.read_csv('data_2.0/envtest_xv.csv')

training_class = env_data['pa']
training_data = env_data.drop(columns=['pa'])

validation_class = env_data_test['pa']
validation_data = env_data_test.drop(columns=['pa'])
# Shared state for the validation-set loop below.
# Bioclim predictor names, in the column order used for the importance table.
names_bclim = ['bclim%d' % n
               for n in (11, 12, 14, 15, 18, 2, 3, 4, 6, 7, 8, 9)]

f_score = np.zeros(len(CLASS_MAP))  # one F1 slot per model, filled in order
col_names = []                      # model display names, loop order
feature_importances = []            # eli5 permutation-weight tables
i = 0                               # running model index into f_score/colors

style.use('ggplot')
# One plotting colour per model (more entries than models is harmless).
colors = ('tab:blue', 'tab:orange', 'tab:red', 'tab:grey', 'lightgreen',
          'darkgoldenrod', 'black')
# One pass per model: refit, compute permutation importances, draw its ROC
# curve onto the shared matplotlib figure, and print validation accuracy,
# the confusion matrix, and F1.
for name, (line_fmt, model) in CLASS_MAP.items():
    col_names.append(name)
    # NOTE(review): .fit() re-trains the deployed estimator on the training
    # split before evaluation, overwriting the pickled fit -- confirm this is
    # intended, since the header says the saved models are being examined.
    # fit() returns the estimator itself.
    result = model.fit(training_data, training_class)
    # Feature importances via sklearn permutation shuffling (10 repeats,
    # fixed seed) on the validation split.
    perm_importance = permutation_importance(result, validation_data,
        validation_class, random_state=100, n_repeats=10)
    perm_importances = np.array(perm_importance.importances_mean)
    # NOTE(review): this 1 x n_features frame is rebuilt on every iteration
    # and never read afterwards -- apparently dead, kept only for inspection.
    perm_importances = pd.DataFrame(data=perm_importances.reshape(
        -1, len(perm_importances)), columns=names_bclim)
    # Second permutation pass via eli5; the rendered weight table for each
    # model is accumulated in feature_importances.
    perm = PermutationImportance(result, random_state=100).fit(
        validation_data, validation_class)
    feature_importances.append(eli5.show_weights(
        perm, feature_names = validation_data.columns.tolist()))
    # ROC curve from the predicted probability of the positive class
    # (column 1 of predict_proba).
    preds = model.predict_proba(validation_data)
    pred = pd.Series(preds[:, 1])
    fpr, tpr, thresholds = roc_curve(validation_class, pred)
    auc_score = auc(fpr, tpr)
    label = '%s: AUC: %.4f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt, linewidth=1.75, label=label,
        color=colors[i])
    # Hard-label metrics: fraction correct, confusion matrix, F1.
    predicted_class_type = model.predict(validation_data)
    print('\n\nFraction correct validation ' + name + ' :',
        np.sum(predicted_class_type == validation_class)
        / len(validation_class))
    cnf_matrix_test = confusion_matrix(validation_class, predicted_class_type)
    print(cnf_matrix_test)
    f_score[i] = f1_score(validation_class, predicted_class_type)
    i = (i + 1)
# Finish the shared ROC figure: legend, chance diagonal, y-limits, export.
plt.legend(loc="lower right", shadow=True)
# plt.title('Comparing Classifiers for *X. vigilis*: Validation AUC')
plt.plot([0, 1], [0, 1], 'k-', alpha=0.2)  # 45-degree chance line
plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
plt.savefig('auc.png', dpi=400)

# Wrap the per-model F1 scores into a single-row DataFrame, label the row,
# and order the model columns best-to-worst by F1.
f_score = pd.DataFrame(data=f_score.reshape(1, -1), columns=col_names)
f_score = (f_score
           .rename(index={0: 'F-statistic :'})
           .sort_values(by='F-statistic :', axis=1, ascending=False))