Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prediction error for non-fe model #573

Open
wants to merge 45 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
88a791a
feols.vcov() returns covariance matrix
greenguy33 Jun 28, 2024
7372f2b
changed self._vcov to self.vcov in feols.py
greenguy33 Jul 1, 2024
f4e281a
Update quickstart.ipynb with vcov attribute
greenguy33 Jul 1, 2024
c05e12c
Update lpdid.py
s3alfisc Jul 1, 2024
ab24f36
Update quickstart.ipynb
s3alfisc Jul 1, 2024
61e41bb
Merge branch 'master' of https://github.com/py-econometrics/pyfixest
greenguy33 Jul 5, 2024
81be661
Update feols_.py
greenguy33 Jul 5, 2024
2228131
Update quickstart.ipynb
greenguy33 Jul 6, 2024
6f1fb75
.vcov to ._vcov
greenguy33 Jul 6, 2024
417a5f3
Merge remote-tracking branch 'upstream/master'
greenguy33 Jul 11, 2024
25c3886
Merge branch 'master' of https://github.com/greenguy33/pyfixest
greenguy33 Jul 11, 2024
a407c92
lsqr method for fixed effect solver
greenguy33 Jul 11, 2024
c3b088d
Take first element from last()
greenguy33 Jul 12, 2024
ba59808
Update feols_.py
s3alfisc Jul 12, 2024
0d9d4cf
Merge remote-tracking branch 'upstream/master'
greenguy33 Jul 19, 2024
4e73f58
atol and btol argument to lsqr
greenguy33 Jul 19, 2024
e67d45a
Merge branch 'master' of https://github.com/greenguy33/pyfixest
greenguy33 Jul 19, 2024
a5428fe
float data type fixed
greenguy33 Jul 19, 2024
34d0f08
changed assert_allclose tolerance level in test
greenguy33 Jul 19, 2024
9327308
adding tolerance level to test_did assert_allclose()
greenguy33 Jul 19, 2024
62a96b5
Merge remote-tracking branch 'upstream/master'
greenguy33 Aug 1, 2024
253c6f4
first pass at stdp
greenguy33 Aug 1, 2024
71bd23c
argument to vcov
greenguy33 Aug 1, 2024
02cdd0b
stdp no fe
greenguy33 Aug 1, 2024
445166a
moved polars to pandas newdata line up
greenguy33 Aug 1, 2024
c84bfcc
added .yhat to predict calls for tests
greenguy33 Aug 17, 2024
f3efd52
Merge remote-tracking branch 'upstream/master'
greenguy33 Aug 17, 2024
e1a7718
tests updated for stdp
greenguy33 Aug 30, 2024
0332250
poetry lock
greenguy33 Aug 30, 2024
ceb7112
Merge remote-tracking branch 'upstream/master'
greenguy33 Aug 30, 2024
c766ccc
poetry lock updated
greenguy33 Aug 30, 2024
fbbc8cf
fixed missing .yhat
greenguy33 Aug 30, 2024
484fdaf
uncommented rpy2
greenguy33 Aug 30, 2024
9dd5c87
added docstring for new methods
greenguy33 Aug 30, 2024
4bd4bab
fixed docstrings in fepois class
greenguy33 Aug 30, 2024
ac7fa15
cleaning up for pre-commit checks
greenguy33 Aug 30, 2024
d1a5bf1
fepois predict method returns dataframe
greenguy33 Aug 30, 2024
1dfa80c
fepois predict signature matches feols predict signature
greenguy33 Aug 30, 2024
72374ca
fixed missing yhat in test_predict_nas
greenguy33 Aug 31, 2024
54a0806
precommit fix
greenguy33 Sep 5, 2024
0dea3b7
updated docs with predict
greenguy33 Sep 12, 2024
5b7b7c3
merge from upstream
greenguy33 Sep 12, 2024
90e7062
fepois signature updated
greenguy33 Sep 12, 2024
f2a2dbd
fepois.predict returns dataframe
greenguy33 Sep 12, 2024
711278d
update fepois sig
greenguy33 Sep 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
338 changes: 241 additions & 97 deletions docs/quickstart.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# uncomment for windows
set shell := ["powershell.exe", "-c"]
# set shell := ["powershell.exe", "-c"]

# docs: justfile docs: https://just.systems/man/en/
default:
Expand Down
853 changes: 430 additions & 423 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyfixest/did/did2s.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def _did2s_estimate(
fit1.fixef()

# demean data
Y_hat = fit1.predict(newdata=data)
Y_hat = fit1.predict(newdata=data).yhat
_first_u = data[f"{yname}"].to_numpy().flatten() - Y_hat
data[f"{yname}_hat"] = _first_u

Expand Down
76 changes: 70 additions & 6 deletions pyfixest/estimation/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -1426,7 +1426,8 @@
atol: float = 1e-6,
btol: float = 1e-6,
type: str = "link",
) -> np.ndarray:
compute_stdp: bool = False
) -> pd.DataFrame:
"""
Predict values of the model on new data.

Expand All @@ -1451,23 +1452,29 @@
Another stopping tolerance for scipy.sparse.linalg.lsqr().
See https://docs.scipy.org/doc/
scipy/reference/generated/scipy.sparse.linalg.lsqr.html
link:
type:
The type of prediction to be made. Can be either 'link' or 'response'.
Defaults to 'link'. 'link' and 'response' lead
to identical results for linear models.
compute_stdp: boolean
Whether to compute standard error of the predictions

Returns
-------
y_hat : np.ndarray
A flat np.array with predicted values of the regression model.
pred_results : pd.DataFrame
Dataframe with columns "y_hat" and optionally "stdp" (if compute_stdp is True)
"""
if self._is_iv:
raise NotImplementedError(
"The predict() method is currently not supported for IV models."
)

prediction_df = pd.DataFrame()
if newdata is None:
return self._Y_untransformed.to_numpy().flatten() - self.resid()
prediction_df["yhat"] = self._Y_untransformed.to_numpy().flatten() - self._u_hat.flatten()
if compute_stdp:
prediction_df["stdp"] = self.get_newdata_stdp(self._X)
return prediction_df

newdata = _polars_to_pandas(newdata).reset_index(drop=False)

Expand Down Expand Up @@ -1501,7 +1508,64 @@
_fixef_mat = _apply_fixef_numpy(df_fe.values, fixef_dicts)
y_hat += np.sum(_fixef_mat, axis=1)

return y_hat.flatten()
prediction_df["yhat"] = y_hat.flatten()
if compute_stdp:
stdp_df = np.full(newdata.shape[0], np.nan)
stdp_df[X_index] = self.get_newdata_stdp(X)
prediction_df["stdp"] = stdp_df

Check warning on line 1515 in pyfixest/estimation/feols_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/feols_.py#L1513-L1515

Added lines #L1513 - L1515 were not covered by tests

return prediction_df


def get_newdata_stdp(self, X):
"""
Get standard error of predictions for new data.

Parameters
----------
X : np.ndarray
Covariates for new data points.

Returns
-------
list
Standard errors for each prediction
"""
# for now only compute prediction error if model has no fixed effects
# TODO: implement for fixed effects
if not self._has_fixef:
if not self._X_is_empty:
return list(map(self.get_single_row_stdp, X))
else:
warnings.warn(

Check warning on line 1540 in pyfixest/estimation/feols_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/feols_.py#L1540

Added line #L1540 was not covered by tests
"""
Standard error of the prediction cannot be computed if X is empty.
Prediction dataframe stdp column will be None.
"""
)
else:
warnings.warn(

Check warning on line 1547 in pyfixest/estimation/feols_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/feols_.py#L1547

Added line #L1547 was not covered by tests
"""
Standard error of the prediction is not implemented for fixed effects models.
Prediction dataframe stdp column will be None.
"""
)

def get_single_row_stdp(self, row):
"""
Get standard error of predictions for a single row.

Parameters
----------
row : np.ndarray
Single row of new covariate data

Returns
-------
np.ndarray
Standard error of prediction for single row
"""
return np.linalg.multi_dot([row, self._vcov, np.transpose(row)])

def get_nobs(self):
"""
Expand Down
18 changes: 11 additions & 7 deletions pyfixest/estimation/fepois_.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,8 @@
atol: float = 1e-6,
btol: float = 1e-6,
type: str = "link",
) -> np.ndarray:
compute_stdp: bool = False
) -> pd.DataFrame:
"""
Return predicted values from regression model.

Expand Down Expand Up @@ -334,13 +335,13 @@
btol : Float, default 1e-6
Another stopping tolerance for scipy.sparse.linalg.lsqr().
See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.lsqr.html


compute_stdp: boolean
Ignored, included to conform with method signature of feols.predict()

Returns
-------
np.ndarray
A flat array with the predicted values of the regression model.
pred_results : pd.DataFrame
Dataframe with columns "y_hat" with predicted values from regression model
"""
_Xbeta = self._Xbeta.flatten()
_has_fixef = self._has_fixef
Expand All @@ -355,10 +356,13 @@
)

# y_hat = super().predict(newdata=newdata, type=type, atol=atol, btol=btol)
prediction_df = pd.DataFrame()

Check warning on line 359 in pyfixest/estimation/fepois_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/fepois_.py#L359

Added line #L359 was not covered by tests
if type == "link":
return np.exp(_Xbeta)
prediction_df["yhat"] = np.exp(_Xbeta)
return prediction_df

Check warning on line 362 in pyfixest/estimation/fepois_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/fepois_.py#L361-L362

Added lines #L361 - L362 were not covered by tests
elif type == "response":
return _Xbeta
prediction_df["yhat"] = _Xbeta
return prediction_df

Check warning on line 365 in pyfixest/estimation/fepois_.py

View check run for this annotation

Codecov / codecov/patch

pyfixest/estimation/fepois_.py#L364-L365

Added lines #L364 - L365 were not covered by tests
else:
raise ValueError("type must be one of 'response' or 'link'.")

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ wildboottest = ">=0.2.0"
pre-commit = "^3.6.0"
doubleml = "^0.7.1"
marginaleffects = "^0.0.10"
statsmodels = "^0.14.1"

[tool.poetry.group.docs.dependencies]
causaldata = "*"
Expand Down
Binary file modified tests/.coverage
Binary file not shown.
30 changes: 15 additions & 15 deletions tests/test_predict_resid_fixef.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def test_internally(data):
# predict via feols, without fixed effect
fit = feols(fml="Y~csw(X1, X2)", data=data, vcov="iid")
mod = fit.fetch_model(0)
original_prediction = mod.predict()
updated_prediction = mod.predict(newdata=mod._data)
original_prediction = mod.predict().yhat
updated_prediction = mod.predict(newdata=mod._data).yhat
np.allclose(original_prediction, updated_prediction)
assert mod._data.shape[0] == original_prediction.shape[0]
assert mod._data.shape[0] == updated_prediction.shape[0]
Expand All @@ -47,15 +47,15 @@ def test_internally(data):
fit = feols(fml="Y~csw(X1, X2) | f1", data=data, vcov="iid")
mod = fit.fetch_model(0)
mod.fixef()
original_prediction = mod.predict()
updated_prediction = mod.predict(newdata=mod._data)
original_prediction = mod.predict().yhat
updated_prediction = mod.predict(newdata=mod._data).yhat
np.allclose(original_prediction, updated_prediction)
assert mod._data.shape[0] == original_prediction.shape[0]
assert mod._data.shape[0] == updated_prediction.shape[0]

# now expect error with updated predicted being a subset of data
with pytest.raises(ValueError):
updated_prediction = mod.predict(newdata=data.iloc[0:100, :])
updated_prediction = mod.predict(newdata=data.iloc[0:100, :]).yhat
np.allclose(original_prediction, updated_prediction)

# fepois NotImplementedError(s)
Expand Down Expand Up @@ -123,18 +123,18 @@ def test_vs_fixest(data, fml):
# raise ValueError("sumFE for Poisson are not equal")

# test predict for OLS
if not np.allclose(feols_mod.predict(), r_fixest_ols.rx2("fitted.values")):
if not np.allclose(feols_mod.predict().yhat, r_fixest_ols.rx2("fitted.values")):
raise ValueError("Predictions for OLS are not equal")

if not np.allclose(len(feols_mod.predict()), len(stats.predict(r_fixest_ols))):
if not np.allclose(len(feols_mod.predict().yhat), len(stats.predict(r_fixest_ols))):
raise ValueError("Predictions for OLS are not the same length")
# test predict for Poisson
# if not np.allclose(fepois_mod.predict(), r_fixest_pois.rx2("fitted.values")):
# if not np.allclose(fepois_mod.predict().yhat, r_fixest_pois.rx2("fitted.values")):
# raise ValueError("Predictions for Poisson are not equal")

# test on new data - OLS.
if not np.allclose(
feols_mod.predict(newdata=data2), stats.predict(r_fixest_ols, newdata=data2)
feols_mod.predict(newdata=data2).yhat, stats.predict(r_fixest_ols, newdata=data2)
):
raise ValueError("Predictions for OLS are not equal")

Expand Down Expand Up @@ -171,7 +171,7 @@ def test_predict_nas():
# test 1
fml = "Y ~ X1 + X2 | f1"
fit = feols(fml, data=data)
res = fit.predict(newdata=data)
res = fit.predict(newdata=data).yhat
fit_r = fixest.feols(ro.Formula(fml), data=data)
res_r = stats.predict(fit_r, newdata=data)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
Expand All @@ -184,15 +184,15 @@ def test_predict_nas():

fml = "Y ~ X1 + X2 | f1"
fit = feols(fml, data=data)
res = fit.predict(newdata=newdata)
res = fit.predict(newdata=newdata).yhat
fit_r = fixest.feols(ro.Formula(fml), data=data)
res_r = stats.predict(fit_r, newdata=newdata)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
assert newdata.shape[0] == len(res)
assert len(res) == len(res_r)

newdata.loc[198, "Y"] = np.nan
res = fit.predict(newdata=newdata)
res = fit.predict(newdata=newdata).yhat
res_r = stats.predict(fit_r, newdata=newdata)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
assert newdata.shape[0] == len(res)
Expand All @@ -201,7 +201,7 @@ def test_predict_nas():
# test 3
fml = "Y ~ X1 + X2 + f1| f1 "
fit = feols(fml, data=data)
res = fit.predict(newdata=data)
res = fit.predict(newdata=data).yhat
fit_r = fixest.feols(ro.Formula(fml), data=data)
res_r = stats.predict(fit_r, newdata=data)
np.testing.assert_allclose(res, res_r, atol=1e-05, rtol=1e-05)
Expand Down Expand Up @@ -229,7 +229,7 @@ def test_new_fixef_level(data, fml):
se="hetero",
)

updated_prediction_py = feols_mod.predict(newdata=data2)
updated_prediction_py = feols_mod.predict(newdata=data2).yhat
updated_prediction_r = stats.predict(r_fixest_ols, newdata=data2)

if not np.allclose(updated_prediction_py, updated_prediction_r):
Expand All @@ -249,7 +249,7 @@ def test_categorical_covariate_predict():
df_sub = df.query("x == 1 or x == 2 or x == 3").copy()

py_fit = feols("y ~ C(x, contr.treatment(base=1))", df)
py_predict = py_fit.predict(df_sub)
py_predict = py_fit.predict(df_sub).yhat

r_predict = np.array(
[
Expand Down
22 changes: 22 additions & 0 deletions tests/test_stdp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import numpy as np
import statsmodels.api as sm

from pyfixest.estimation.estimation import feols
from pyfixest.utils.utils import get_data


def test_stdp():
"""Compare the standard error of the prediction to statsmodels.get_prediction()."""
data = get_data().dropna()

fit = feols("Y ~ X1 + X2", data=data)
stdp = fit.predict(compute_stdp=True).stdp

X = data[["X1","X2"]]
Y = data["Y"]
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
predictions = model.get_prediction()
stdp_sm = predictions.var_pred_mean

np.testing.assert_allclose(stdp, stdp_sm)
Loading