Skip to content

Commit

Permalink
Merge pull request #227 from winedarksea/dev
Browse files Browse the repository at this point in the history
0.6.9
  • Loading branch information
winedarksea committed Jan 22, 2024
2 parents c23c244 + c40093a commit 3e6baff
Show file tree
Hide file tree
Showing 22 changed files with 190 additions and 159 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2022 Colin Catlin
Copyright (c) 2024 Colin Catlin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
7 changes: 3 additions & 4 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
* The most recent data will generally be the most important
* Forecasts are desired for the future immediately following the most recent data.

# 0.6.8 🇺🇦 🇺🇦 🇺🇦
* bug fixes, robust for OpenBLAS nan handling kernel failures
* added BKBandpassFilter
* added expand_horizontal for scaling mosaics
# 0.6.9 🇺🇦 🇺🇦 🇺🇦
* expanded regressor options for MultivariateRegression, NeuralForecast (currently only available directly, not from AutoTS class)
* matse bug fix on all-zero history

### Unstable Upstream Packages (those that are frequently broken by maintainers)
* Pytorch-Forecasting
Expand Down
2 changes: 1 addition & 1 deletion autots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from autots.models.cassandra import Cassandra


__version__ = '0.6.8'
__version__ = '0.6.9'

TransformTS = GeneralTransformer

Expand Down
3 changes: 2 additions & 1 deletion autots/evaluator/anomaly_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def detect(self, df):
model = GeneralTransformer(
verbose=2, **self.transform_dict
) # DATEPART, LOG, SMOOTHING, DIFF, CLIP OUTLIERS with high z score
self.df_anomaly = model.fit_transform(self.df_anomaly)
# the post selecting by columns is for CenterSplit and any similar renames or expansions
self.df_anomaly = model.fit_transform(self.df_anomaly)[self.df.columns]

if self.forecast_params is not None:
backcast = back_forecast(
Expand Down
119 changes: 18 additions & 101 deletions autots/evaluator/auto_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,7 @@ def __init__(
per_series_uwmse=None,
per_series_smoothness=None,
per_series_mate=None,
per_series_matse=None,
per_series_wasserstein=None,
per_series_dwd=None,
model_count: int = 0,
Expand All @@ -1005,6 +1006,7 @@ def __init__(
self.per_series_uwmse = per_series_uwmse
self.per_series_smoothness = per_series_smoothness
self.per_series_mate = per_series_mate
self.per_series_matse = per_series_matse
self.per_series_wasserstein = per_series_wasserstein
self.per_series_dwd = per_series_dwd
self.full_mae_ids = []
Expand Down Expand Up @@ -1083,6 +1085,9 @@ def concat(self, another_eval):
self.per_series_mate = pd.concat(
[self.per_series_mate, another_eval.per_series_mate], axis=0, sort=False
)
self.per_series_matse = pd.concat(
[self.per_series_matse, another_eval.per_series_matse], axis=0, sort=False
)
self.per_series_wasserstein = pd.concat(
[self.per_series_wasserstein, another_eval.per_series_wasserstein],
axis=0,
Expand Down Expand Up @@ -1667,57 +1672,6 @@ def virtual_memory():
ps_metric.index = [model_id] * ps_metric.shape[0]
ps_metric.index.name = "ID"
template_result.per_series_metrics.append(ps_metric)

"""
template_result.per_series_mae.append(
_ps_metric(ps_metric, 'mae', model_id)
)
template_result.per_series_made.append(
_ps_metric(ps_metric, 'made', model_id)
)
template_result.per_series_contour.append(
_ps_metric(ps_metric, 'contour', model_id)
)
template_result.per_series_rmse.append(
_ps_metric(ps_metric, 'rmse', model_id)
)
template_result.per_series_spl.append(
_ps_metric(ps_metric, 'spl', model_id)
)
template_result.per_series_mle.append(
_ps_metric(ps_metric, 'mle', model_id)
)
template_result.per_series_imle.append(
_ps_metric(ps_metric, 'imle', model_id)
)
template_result.per_series_maxe.append(
_ps_metric(ps_metric, 'maxe', model_id)
)
template_result.per_series_oda.append(
_ps_metric(ps_metric, 'oda', model_id)
)
template_result.per_series_mqae.append(
_ps_metric(ps_metric, 'mqae', model_id)
)
template_result.per_series_dwae.append(
_ps_metric(ps_metric, 'dwae', model_id)
)
template_result.per_series_ewmae.append(
_ps_metric(ps_metric, 'ewmae', model_id)
)
template_result.per_series_uwmse.append(
_ps_metric(ps_metric, 'uwmse', model_id)
)
template_result.per_series_smoothness.append(
_ps_metric(ps_metric, 'smoothness', model_id)
)
template_result.per_series_mate.append(
_ps_metric(ps_metric, 'mate', model_id)
)
template_result.per_series_wasserstein.append(
_ps_metric(ps_metric, 'wasserstein', model_id)
)
"""
if 'distance' in ensemble:
cur_smape = model_error.per_timestamp.loc['weighted_smape']
cur_smape = pd.DataFrame(cur_smape).transpose()
Expand Down Expand Up @@ -1864,62 +1818,15 @@ def virtual_memory():
template_result.per_series_mate = ps[ps['autots_eval_metric'] == 'mate'].drop(
columns='autots_eval_metric'
)
template_result.per_series_matse = ps[ps['autots_eval_metric'] == 'matse'].drop(
columns='autots_eval_metric'
)
template_result.per_series_wasserstein = ps[
ps['autots_eval_metric'] == 'wasserstein'
].drop(columns='autots_eval_metric')
template_result.per_series_dwd = ps[ps['autots_eval_metric'] == 'dwd'].drop(
columns='autots_eval_metric'
)
"""
template_result.per_series_mae = pd.concat(
template_result.per_series_mae, axis=0
)
template_result.per_series_made = pd.concat(
template_result.per_series_made, axis=0
)
template_result.per_series_contour = pd.concat(
template_result.per_series_contour, axis=0
)
template_result.per_series_rmse = pd.concat(
template_result.per_series_rmse, axis=0
)
template_result.per_series_spl = pd.concat(
template_result.per_series_spl, axis=0
)
template_result.per_series_mle = pd.concat(
template_result.per_series_mle, axis=0
)
template_result.per_series_imle = pd.concat(
template_result.per_series_imle, axis=0
)
template_result.per_series_maxe = pd.concat(
template_result.per_series_maxe, axis=0
)
template_result.per_series_oda = pd.concat(
template_result.per_series_oda, axis=0
)
template_result.per_series_mqae = pd.concat(
template_result.per_series_mqae, axis=0
)
template_result.per_series_dwae = pd.concat(
template_result.per_series_dwae, axis=0
)
template_result.per_series_ewmae = pd.concat(
template_result.per_series_ewmae, axis=0
)
template_result.per_series_uwmse = pd.concat(
template_result.per_series_uwmse, axis=0
)
template_result.per_series_smoothness = pd.concat(
template_result.per_series_smoothness, axis=0
)
template_result.per_series_mate = pd.concat(
template_result.per_series_mate, axis=0
)
template_result.per_series_wasserstein = pd.concat(
template_result.per_series_wasserstein, axis=0
)
"""
else:
template_result.per_series_metrics = pd.DataFrame()
template_result.per_series_mae = pd.DataFrame()
Expand All @@ -1937,6 +1844,7 @@ def virtual_memory():
template_result.per_series_uwmse = pd.DataFrame()
template_result.per_series_smoothness = pd.DataFrame()
template_result.per_series_mate = pd.DataFrame()
template_result.per_series_matse = pd.DataFrame()
template_result.per_series_wasserstein = pd.DataFrame()
template_result.per_series_dwd = pd.DataFrame()
if verbose > 0 and not template.empty:
Expand Down Expand Up @@ -2731,6 +2639,7 @@ def generate_score_per_series(
uwmse_weighting = metric_weighting.get('uwmse_weighting', 0)
smoothness_weighting = metric_weighting.get('smoothness_weighting', 0)
mate_weighting = metric_weighting.get('mate_weighting', 0)
matse_weighting = metric_weighting.get('matse_weighting', 0)
wasserstein_weighting = metric_weighting.get('wasserstein_weighting', 0)
dwd_weighting = metric_weighting.get('dwd_weighting', 0)

Expand Down Expand Up @@ -2823,6 +2732,14 @@ def generate_score_per_series(
)
mate_score = results_object.per_series_mate / mate_scaler
overall_score = overall_score + (mate_score * mate_weighting)
if matse_weighting != 0:
matse_scaler = (
results_object.per_series_matse[results_object.per_series_matse != 0]
.min()
.fillna(1)
)
matse_score = results_object.per_series_matse / matse_scaler
overall_score = overall_score + (matse_score * matse_weighting)
if wasserstein_weighting != 0:
wasserstein_scaler = (
results_object.per_series_wasserstein[
Expand Down
19 changes: 13 additions & 6 deletions autots/evaluator/auto_ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -1393,7 +1393,7 @@ def fit(
validation_template = validation_template.drop_duplicates(
subset=['Model', 'ModelParameters', 'TransformationParameters']
)
self.validation_template = validation_template[self.template_cols]
self.validation_template = validation_template[self.template_cols_id]
if self.validate_import is not None:
self.validation_template = pd.concat(
[self.validation_template, self.validate_import]
Expand Down Expand Up @@ -1813,7 +1813,14 @@ def validation_agg(self):
)
return self

def _set_best_model(self, metric_weighting=None, allow_horizontal=True):
def _set_best_model(self, metric_weighting=None, allow_horizontal=True, n=1):
"""Sets best model based on validation results.
Args:
            metric_weighting (dict): if not None, overrides input metric weighting with this metric weighting
allow_horizontal (bool): if False, force no horizontal, if True, allows if ensemble param and runs occurred
            n (int): default 1 means choose best model, 2 = use 2nd best, and so on
"""
if metric_weighting is None:
metric_weighting = self.metric_weighting
hens_model_results = self.initial_results.model_results[
Expand All @@ -1828,7 +1835,7 @@ def _set_best_model(self, metric_weighting=None, allow_horizontal=True):
# horizontal ensembles can't be compared directly to others because they don't get run through all validations
# they are built themselves from cross validation so a full rerun of validations is unnecessary
self.best_model_non_horizontal = self._best_non_horizontal(
metric_weighting=metric_weighting
metric_weighting=metric_weighting, n=n
)
if not hens_model_results.empty and requested_H_ens:
hens_model_results.loc['Score'] = generate_score(
Expand All @@ -1838,7 +1845,7 @@ def _set_best_model(self, metric_weighting=None, allow_horizontal=True):
)
self.best_model = hens_model_results.sort_values(
by="Score", ascending=True, na_position='last'
).head(1)[self.template_cols_id]
).iloc[(n - 1) : n][self.template_cols_id]
self.ensemble_check = 1
# print a warning if requested but unable to produce a horz ensemble
elif requested_H_ens:
Expand All @@ -1859,7 +1866,7 @@ def _set_best_model(self, metric_weighting=None, allow_horizontal=True):
self.parse_best_model()
return self

def _best_non_horizontal(self, metric_weighting=None, series=None):
def _best_non_horizontal(self, metric_weighting=None, series=None, n=1):
if self.validation_results is None:
if not self.initial_results.model_results.empty:
self = self.validation_agg()
Expand Down Expand Up @@ -1908,7 +1915,7 @@ def _best_non_horizontal(self, metric_weighting=None, series=None):
by="Score", ascending=True, na_position='last'
)
.drop_duplicates(subset=self.template_cols)
.head(1)[self.template_cols_id]
.iloc[(n - 1) : n][self.template_cols_id]
)
except IndexError:
raise ValueError(
Expand Down
4 changes: 3 additions & 1 deletion autots/evaluator/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,9 @@ def full_metric_evaluation(
else:
mate = np.abs(np.sum(full_errors, axis=0))
# possibly temporary
matse = mate / np.sum(A, axis=0)
matse_scale = np.sum(A, axis=0)
matse_scale[matse_scale == 0] = 1
matse = mate / matse_scale

direc_sign = np.sign(F - last_of_array) == np.sign(A - last_of_array)
weights = np.geomspace(1, 10, full_mae_errors.shape[0])[:, np.newaxis]
Expand Down
4 changes: 2 additions & 2 deletions autots/models/cassandra.py
Original file line number Diff line number Diff line change
Expand Up @@ -1138,7 +1138,7 @@ def process_components(self, to_origin_space=True):
pd.DataFrame(
self.components[:, comp, :],
index=t_indx,
columns=self.column_names,
columns=self.df.columns,
),
components=True,
bounds=True,
Expand Down Expand Up @@ -2537,7 +2537,7 @@ def lstsq_minimize(X, y, maxiter=15000, cost_function="l1", method=None):
elif cost_function == "quantile":
cost_func = cost_function_quantile
elif cost_function == "l1_positive":
bounds = [(0, 10) for x in x0]
bounds = [(0, 14) for x in x0]
cost_func = cost_function_l1
else:
cost_func = cost_function_l1
Expand Down
12 changes: 12 additions & 0 deletions autots/models/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,8 @@ def horizontal_classifier(
if classifier_params is None:
# found using FLAML
classifier_params = {"model": 'KNN', "model_params": {'n_neighbors': 5}}
# newer, but don't like as much
# RandomForest {'n_estimators': 69, 'max_features': 0.5418860350847585, 'max_leaves': 439, 'criterion': 'gini'}

# known = {'EXUSEU': 'xx1', 'MCOILWTICO': 'xx2', 'CSUSHPISA': 'xx3'}
Xt, Y, Xf = horizontal_xy(df_train, known)
Expand Down Expand Up @@ -473,6 +475,16 @@ def mosaic_classifier(df_train, known, classifier_params=None):
'criterion': 'gini',
},
}
# slightly newer, on a mosaic-weighted-0-40
classifier_params = {
"model": 'ExtraTrees',
"model_params": {
'n_estimators': 62,
'max_features': 0.181116,
'max_leaves': 261,
'criterion': 'entropy',
},
}

X, Xf, Y, to_predict = mosaic_xy(df_train, known)

Expand Down
Loading

0 comments on commit 3e6baff

Please sign in to comment.