Skip to content

Commit

Permalink
Merge pull request #222 from winedarksea/dev
Browse files Browse the repository at this point in the history
0.6.7
  • Loading branch information
winedarksea committed Jan 3, 2024
2 parents 1c28035 + a5c4746 commit 4e4f6bd
Show file tree
Hide file tree
Showing 41 changed files with 545 additions and 110 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ pip install autots
```
This includes dependencies for basic models, but [additional packages](https://github.com/winedarksea/AutoTS/blob/master/extended_tutorial.md#installation-and-dependency-versioning) are required for some models and methods.

Be advised there are several other projects that have chosen similar names, so make sure you are on the right AutoTS code, papers, and documentation.

## Basic Use

Input data for AutoTS is expected to come in either a *long* or a *wide* format:
Expand Down
7 changes: 5 additions & 2 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
* The most recent data will generally be the most important
* Forecasts are desired for the future immediately following the most recent data.

# 0.6.6 🐌🐌🐌
* bug fixes, particularly compatibility for the archaic pandas 1.0.3 still used at a certain big tech company
# 0.6.7 🇺🇦 🇺🇦 🇺🇦
* Cassandra bug fix
* isolated_only to anomaly methods
* matse metric is possibly temporary and not added to per series weighting options
* added HistoricValues transformer

### Unstable Upstream Packages (those that are frequently broken by maintainers)
* Pytorch-Forecasting
Expand Down
2 changes: 1 addition & 1 deletion autots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from autots.models.cassandra import Cassandra


__version__ = '0.6.6'
__version__ = '0.6.7'

TransformTS = GeneralTransformer

Expand Down
1 change: 1 addition & 0 deletions autots/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ def load_live_daily(
caiso_query: str = "ENE_SLRS",
timeout: float = 300.05,
sleep_seconds: int = 2,
**kwargs,
):
"""Generates a dataframe of data up to the present day. Requires active internet connection.
Try to be respectful of these free data sources by not calling too much too heavily.
Expand Down
22 changes: 18 additions & 4 deletions autots/evaluator/anomaly_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@ def __init__(
forecast_params=None,
method_params={},
eval_period=None,
isolated_only=False,
n_jobs=1,
):
"""Detect anomalies on a historic dataset.
Note anomaly score patterns vary by method.
Anomaly flag is standard -1 = anomaly; 1 = regular
Anomaly flag is standard -1 = anomaly; 1 = regular as per sklearn
Args:
output (str): 'multivariate' (each series unique outliers), or 'univariate' (all series together for one outlier flag per timestamp)
Expand All @@ -53,6 +54,7 @@ def __init__(
forecast_params (dict): used to backcast and identify 'unforecastable' values, required only for predict_interval method
method_params (dict): parameters specific to the method, use `.get_new_params()` to see potential models
eval_period (int): only use this length tail of data, currently only implemented for forecast_params forecasting if used
isolated_only (bool): if True, only standalone anomalies reported
n_jobs (int): multiprocessing jobs, used by some methods
Methods:
Expand All @@ -71,6 +73,7 @@ def __init__(
self.forecast_params = forecast_params
self.method_params = method_params
self.eval_period = eval_period
self.isolated_only = isolated_only
self.n_jobs = n_jobs
self.anomaly_classifier = None

Expand All @@ -86,7 +89,7 @@ def detect(self, df):
self.df_anomaly = df.copy()
if self.transform_dict is not None:
model = GeneralTransformer(
**self.transform_dict
verbose=2, **self.transform_dict
) # DATEPART, LOG, SMOOTHING, DIFF, CLIP OUTLIERS with high z score
self.df_anomaly = model.fit_transform(self.df_anomaly)

Expand All @@ -109,6 +112,10 @@ def detect(self, df):
else:
self.df_anomaly = self.df_anomaly - backcast.forecast

if len(self.df_anomaly.columns) != len(df.columns):
raise ValueError(
f"anomaly returned a column mismatch from params {self.method_params} and {self.transform_dict}"
)
if not all(self.df_anomaly.columns == df.columns):
self.df_anomaly.columns = df.columns

Expand All @@ -130,6 +137,13 @@ def detect(self, df):
eval_period=self.eval_period,
n_jobs=self.n_jobs,
)
if self.isolated_only:
# replace all anomalies (-1) except those which are isolated (1 before and after)
mask_minus_one = self.anomalies == -1
mask_prev_one = self.anomalies.shift(1) == 1
mask_next_one = self.anomalies.shift(-1) == 1
mask_replace = mask_minus_one & ~(mask_prev_one & mask_next_one)
self.anomalies[mask_replace] = 1
return self.anomalies, self.scores

def plot(self, series_name=None, title=None, plot_kwargs={}):
Expand Down Expand Up @@ -286,6 +300,8 @@ def __init__(
def detect(self, df):
"""Run holiday detection. Input wide-style pandas time series."""
self.anomaly_model.detect(df)
self.df = df
self.df_cols = df.columns
if np.min(self.anomaly_model.anomalies.values) != -1:
print("No anomalies detected.")
(
Expand All @@ -312,8 +328,6 @@ def detect(self, df):
use_islamic_holidays=self.use_islamic_holidays,
use_hebrew_holidays=self.use_hebrew_holidays,
)
self.df = df
self.df_cols = df.columns

def plot_anomaly(self, kwargs={}):
self.anomaly_model.plot(**kwargs)
Expand Down
9 changes: 9 additions & 0 deletions autots/evaluator/auto_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2432,6 +2432,7 @@ def validation_aggregation(
'mate': 'mean',
'wasserstein': 'mean',
'dwd': 'mean',
'matse': 'mean',
'smape_weighted': 'mean',
'mae_weighted': 'mean',
'rmse_weighted': 'mean',
Expand All @@ -2451,6 +2452,7 @@ def validation_aggregation(
'mate_weighted': 'mean',
'wasserstein_weighted': 'mean',
'dwd_weighted': 'mean',
'matse_weighted': 'mean',
'containment_weighted': 'mean',
'contour_weighted': 'mean',
'TotalRuntimeSeconds': 'mean',
Expand Down Expand Up @@ -2535,6 +2537,7 @@ def generate_score(
mate_weighting = metric_weighting.get('mate_weighting', 0)
wasserstein_weighting = metric_weighting.get('wasserstein_weighting', 0)
dwd_weighting = metric_weighting.get('dwd_weighting', 0)
matse_weighting = metric_weighting.get('matse_weighting', 0)
# handle various runtime information records
if 'TotalRuntimeSeconds' in model_results.columns:
model_results['TotalRuntimeSeconds'] = np.where(
Expand Down Expand Up @@ -2652,6 +2655,12 @@ def generate_score(
].min()
dwd_score = model_results['dwd_weighted'] / dwd_scaler
overall_score = overall_score + (dwd_score * dwd_weighting)
if matse_weighting != 0:
matse_scaler = model_results['matse_weighted'][
model_results['matse_weighted'] != 0
].min()
matse_score = model_results['matse_weighted'] / matse_scaler
overall_score = overall_score + (matse_score * matse_weighting)
if smoothness_weighting != 0:
smoothness_scaler = model_results['smoothness_weighted'][
model_results['smoothness_weighted'] != 0
Expand Down
36 changes: 34 additions & 2 deletions autots/evaluator/auto_ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,8 @@ def _run_template(
)
else:
# trying to catch a rare and sneaky bug (perhaps some variety of beetle?)
print(f"TotalRuntime missing in {current_generation}!")
if verbose >= 0:
print(f"TotalRuntime missing in {current_generation}!")
self.template_result_error = template_result.model_results.copy()
self.template_error = template.copy()
# gather results of template run
Expand Down Expand Up @@ -2665,7 +2666,6 @@ def _generate_mosaic_template(self, df_subset=None, models_to_use=None):
for mos in mosaic_ensembles:
try:
mosaic_config = parse_mosaic(mos)
print(mosaic_config)
# choose metric to optimize on
met = mosaic_config.get("metric", "mae")
if met in ["spl", "pl"]:
Expand Down Expand Up @@ -3662,6 +3662,38 @@ def plot_metric_corr(self, cols=None, percent_best=0.1):
plt.title("Correlogram of Metric Correlations from Optimized Forecasts")
return ax

def plot_transformer_failure_rate(self):
    """Failure Rate per Transformer type (ignoring ensembles), failure may be due to other model or transformer.

    Every transformer listed under the 'transformations' key of a row's
    'TransformationParameters' JSON is counted once per occurrence: as a failure
    when that row's 'Exceptions' value is non-null, otherwise as a success.
    The failure rate is failures / (failures + successes) per transformer.

    Returns:
        the object returned by pandas `.plot(kind='bar')` for the (at most) 20
        transformers with the highest failure rate
    """
    initial_results = self.results()
    failures = []
    successes = []
    # only two columns are needed, so walk them directly instead of
    # materializing a Series for every row with iterrows()
    for exception, transformation_params in zip(
        initial_results['Exceptions'],
        initial_results['TransformationParameters'],
    ):
        transforms = (
            json.loads(transformation_params).get('transformations', {}).values()
        )
        # extend in place: rebuilding the list with `+` on every row is quadratic
        if pd.isnull(exception):
            successes.extend(transforms)
        else:
            failures.extend(transforms)
    # align both counts on transformer name; a transformer seen on only one side gets 0
    total = pd.concat(
        [
            pd.Series(failures).value_counts().rename("failures").to_frame(),
            pd.Series(successes).value_counts().rename("successes"),
        ],
        axis=1,
    ).fillna(0)
    total['failure_rate'] = total['failures'] / (
        total['successes'] + total['failures']
    )
    return (
        total.sort_values("failure_rate", ascending=False)['failure_rate']
        .iloc[0:20]
        .plot(kind='bar', title='Transformers by Failure Rate', color='forestgreen')
    )

def diagnose_params(self, target='runtime', waterfall_plots=True):
"""Attempt to explain params causing measured outcomes using shap and linear regression coefficients.
Expand Down
3 changes: 3 additions & 0 deletions autots/evaluator/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,8 @@ def full_metric_evaluation(
mate = np.abs(np.nansum(full_errors, axis=0))
else:
mate = np.abs(np.sum(full_errors, axis=0))
# possibly temporary
matse = mate / np.sum(A, axis=0)

direc_sign = np.sign(F - last_of_array) == np.sign(A - last_of_array)
weights = np.geomspace(1, 10, full_mae_errors.shape[0])[:, np.newaxis]
Expand All @@ -707,6 +709,7 @@ def full_metric_evaluation(
# aggregate error
'mage': mage, # Gandalf approved
'mate': mate, # the British version, of course
'matse': matse, # pronounced like the painter 'Matisse'
'underestimate': np.nansum(np.where(~ovm, full_errors, 0), axis=0),
'mle': msle(full_errors, full_mae_errors, log_errors, nan_flag=nan_flag),
'overestimate': np.nansum(np.where(ovm, full_errors, 0), axis=0),
Expand Down
10 changes: 8 additions & 2 deletions autots/models/cassandra.py
Original file line number Diff line number Diff line change
Expand Up @@ -798,19 +798,25 @@ def rolling_trend(self, trend_residuals, t):
axis=1,
)
wind = 30 if self.trend_window is None else self.trend_window
# the uneven fraction of the window goes at the end
# and minus one is because there will always be at least one real point
w_1 = wind - 1
steps_ahd = int(w_1 / 2)
y0 = np.repeat(np.array(trend_residuals[0:1]), steps_ahd, axis=0)
# d0 = -1 * dates_2d[1 : y0.shape[0] + 1][::-1]
start_pt = dates_2d[0, 0]
step = dates_2d[1, 0] - start_pt
extra_step = y0.shape[0] + 1
# there's some weird float thing that can happen here I still don't understand
# when it produces one more step than expected
d0 = np_2d_arange(
start_pt,
stop=start_pt - ((y0.shape[0] + 1) * step),
stop=start_pt - (extra_step * step),
step=-step,
num_columns=dates_2d.shape[1],
)[1:][::-1]
)[1:extra_step][::-1]
shape2 = (w_1 - steps_ahd, y0.shape[1])
# these combine a fake first half and fake last half window with real data in between
y2 = np.concatenate(
[
y0,
Expand Down
37 changes: 26 additions & 11 deletions autots/models/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,13 +398,17 @@ def retrieve_regressor(
elif model_class in ['xgboost', 'XGBRegressor']:
import xgboost as xgb

smaller_n_jobs = int(n_jobs / 2) if n_jobs > 3 else n_jobs

if False: # this is no longer necessary in 1.6 and beyond
regr = MultiOutputRegressor(
xgb.XGBRegressor(verbosity=0, **model_param_dict, n_jobs=1),
n_jobs=n_jobs,
n_jobs=smaller_n_jobs,
)
else:
regr = xgb.XGBRegressor(verbosity=0, **model_param_dict, n_jobs=n_jobs)
regr = xgb.XGBRegressor(
verbosity=0, **model_param_dict, n_jobs=smaller_n_jobs
)
return regr
elif model_class == 'SVM':
from sklearn.svm import LinearSVR
Expand Down Expand Up @@ -672,16 +676,16 @@ def retrieve_classifier(
# these are models that are relatively fast with large multioutput Y, small n obs
datepart_model_dict: dict = {
# 'RandomForest': 0.05, # crashes sometimes at scale for unclear reasons
'ElasticNet': 0.05,
'xgboost': 0.01,
'ElasticNet': 0.1,
'xgboost': 0.001, # excess memory at scale
'MLP': 0.05,
'DecisionTree': 0.02,
'Adaboost': 0.05,
'SVM': 0.01,
'KerasRNN': 0.02,
'Transformer': 0.02, # slow
'ExtraTrees': 0.00001, # some params cause RAM crash?
'RadiusNeighbors': 0.05,
'RadiusNeighbors': 0.1,
'MultioutputGPR': 0.00001,
}
gpu = ['Transformer', 'KerasRNN', 'MLP'] # or more accurately, no dnn
Expand Down Expand Up @@ -888,15 +892,21 @@ def generate_regressor_params(
param_dict = {
"model": 'xgboost',
"model_params": {
"booster": random.choices(['gbtree', 'gblinear'], [0.7, 0.3])[
0
],
"objective": objective,
"max_depth": random.choices(
[6, 3, 2, 8], [0.6, 0.4, 0.2, 0.01]
)[0],
"eta": random.choices(
[1.0, 0.3, 0.01, 0.03, 0.05, 0.003],
[0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
)[
0
], # aka learning_rate
"min_child_weight": random.choices(
[0.05, 0.5, 1, 2, 5], [0.1, 0.2, 0.8, 0.1, 0.1]
[0.05, 0.5, 1, 2, 5, 10], [0.01, 0.05, 0.8, 0.1, 0.1, 0.1]
)[0],
"subsample": random.choices(
[1, 0.9, 0.7, 0.5], [0.9, 0.05, 0.05, 0.05]
Expand Down Expand Up @@ -2317,11 +2327,16 @@ def predict(
)
self.X_pred.columns = [str(xc) for xc in self.X_pred.columns]

forecast = pd.DataFrame(
self.model.predict(self.X_pred.astype(float)),
index=index,
columns=self.column_names,
)
try:
forecast = pd.DataFrame(
self.model.predict(self.X_pred.astype(float)),
index=index,
columns=self.column_names,
)
except Exception as e:
raise ValueError(
f"Datepart prediction with params {self.get_params()} failed"
) from e

if just_point_forecast:
return forecast
Expand Down
Loading

0 comments on commit 4e4f6bd

Please sign in to comment.