Merge pull request #107 from theislab/singular_fix
Singular fix
davidsebfischer committed Mar 2, 2020
2 parents ed2f5ba + d3287b3 commit 8842f6b
Showing 10 changed files with 262 additions and 366 deletions.
2 changes: 1 addition & 1 deletion batchglm/api/data.py
@@ -1,4 +1,4 @@
from batchglm.data import design_matrix
from batchglm.data import constraint_matrix_from_dict, constraint_matrix_from_string, string_constraints_from_dict, \
constraint_system_from_star
from batchglm.data import view_coef_names, preview_coef_names
from batchglm.data import view_coef_names, preview_coef_names, bin_continuous_covariate
28 changes: 28 additions & 0 deletions batchglm/data.py
@@ -453,3 +453,31 @@ def constraint_matrix_from_string(
)

return constraint_mat


def bin_continuous_covariate(
sample_description: pd.DataFrame,
factor_to_bin: str,
bins: Union[int, list, np.ndarray, Tuple]
):
r"""
Bin a continuous covariate.
Adds the binned covariate to the table. Binning is performed on quantiles of the distribution.
:param sample_description: Sample description table.
:param factor_to_bin: Name of the column of the factor to bin.
:param bins: Number of bins or iterable of bin borders. If given as an integer, the bins are defined on the
quantiles of the covariate, i.e. the bottom 20% of observations fall into the first bin if bins==5.
:return: Sample description table with binned covariate added.
"""
if isinstance(bins, list) or isinstance(bins, np.ndarray) or isinstance(bins, Tuple):
bins = np.asarray(bins)
else:
bins = np.arange(0, 1, 1 / bins)

sample_description[factor_to_bin + "_binned"] = np.digitize(
np.argsort(np.argsort(sample_description[factor_to_bin].values)) / sample_description.shape[0],
bins
)
return sample_description
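A minimal usage sketch of the new helper (editor's illustration, not part of this diff): with bins=5, each observation is assigned to one of five quantile bins of the covariate, so the bottom 20% land in the first bin. The column name and toy data are assumptions for the example.

import numpy as np
import pandas as pd

from batchglm.data import bin_continuous_covariate

# Toy sample description with one continuous covariate (illustrative data).
sample_description = pd.DataFrame({
    "pseudotime": np.random.uniform(0., 10., size=100)
})

# Five quantile bins; the result is stored in a new column "pseudotime_binned".
sample_description = bin_continuous_covariate(
    sample_description=sample_description,
    factor_to_bin="pseudotime",
    bins=5
)
print(sample_description["pseudotime_binned"].value_counts())  # 20 observations per bin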
19 changes: 19 additions & 0 deletions batchglm/models/base/estimator.py
@@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import pprint
import sys

try:
import anndata
@@ -112,6 +113,14 @@ def train_sequence(
logger.debug("training strategy:\n%s", pprint.pformat(training_strategy))
for idx, d in enumerate(training_strategy):
logger.debug("Beginning with training sequence #%d", idx + 1)
# Override duplicate arguments with user choice:
if np.any([x in list(d.keys()) for x in list(kwargs.keys())]):
d = dict([(x, y) for x, y in d.items() if x not in list(kwargs.keys())])
for x in [xx for xx in list(d.keys()) if xx in list(kwargs.keys())]:
sys.stdout.write(
"overriding %s from training strategy (value %s) with new value %s\n" %
(x, str(d[x]), str(kwargs[x]))
)
self.train(**d, **kwargs)
logger.debug("Training sequence #%d complete", idx + 1)
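The added block is meant to give keyword arguments passed to train_sequence precedence over identically named entries of a predefined training strategy. A self-contained sketch of that merging behaviour (editor's illustration, not part of this diff; the parameter names are made up):

import sys

def merge_with_override(strategy_step: dict, user_kwargs: dict) -> dict:
    # Report overlapping keys, then let the user-supplied values win.
    for key in sorted(strategy_step.keys() & user_kwargs.keys()):
        sys.stdout.write(
            "overriding %s from training strategy (value %s) with new value %s\n"
            % (key, str(strategy_step[key]), str(user_kwargs[key]))
        )
    kept = {k: v for k, v in strategy_step.items() if k not in user_kwargs}
    return {**kept, **user_kwargs}

# The user-supplied learning rate replaces the strategy default:
print(merge_with_override({"max_steps": 100, "learning_rate": 0.1}, {"learning_rate": 0.5}))
# -> {'max_steps': 100, 'learning_rate': 0.5}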

@@ -165,6 +174,11 @@ def _plot_coef_vs_ref(
from matplotlib import gridspec
from matplotlib import rcParams

if isinstance(true_values, dask.array.core.Array):
true_values = true_values.compute()
if isinstance(estim_values, dask.array.core.Array):
estim_values = estim_values.compute()

plt.ioff()

n_par = true_values.shape[0]
@@ -258,6 +272,11 @@ def _plot_deviation(
import seaborn as sns
import matplotlib.pyplot as plt

if isinstance(true_values, dask.array.core.Array):
true_values = true_values.compute()
if isinstance(estim_values, dask.array.core.Array):
estim_values = estim_values.compute()

plt.ioff()

n_par = true_values.shape[0]
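Both plotting helpers now materialise dask arrays before handing them to matplotlib. A minimal sketch of that conversion pattern (editor's illustration, not part of this diff; the array contents are made up):

import dask.array
import numpy as np

def to_numpy(values):
    # Dask arrays are lazy; .compute() pulls them into memory as numpy arrays.
    if isinstance(values, dask.array.core.Array):
        values = values.compute()
    return np.asarray(values)

true_values = dask.array.from_array(np.random.rand(3, 10), chunks=(3, 10))
true_values = to_numpy(true_values)  # plain numpy array, safe to plot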
144 changes: 144 additions & 0 deletions batchglm/models/glm_nb/utils.py
@@ -1,3 +1,5 @@
import dask
import logging
import numpy as np
import scipy.sparse
from typing import Union
@@ -71,3 +73,145 @@ def compute_scales_fun(variance, mean):
inv_link_fn=invlink_fn,
compute_scales_fun=compute_scales_fun
)


def init_par(
input_data,
init_a,
init_b,
init_model
):
r"""
standard:
Only initialise intercept and keep other coefficients as zero.
closed-form:
Initialize with Maximum Likelihood / Maximum of Momentum estimators
Idea:
$$
\begin{aligned}
\theta &= f(x) \\
\Rightarrow f^{-1}(\theta) &= x \\
&= (D \cdot D^{+}) \cdot x \\
&= D \cdot (D^{+} \cdot x) \\
&= D \cdot x' = f^{-1}(\theta)
\end{aligned}
$$
"""
train_loc = True
train_scale = True

if init_model is None:
groupwise_means = None
init_a_str = None
if isinstance(init_a, str):
init_a_str = init_a.lower()
# Choose the concrete initialization if "auto" was chosen
if init_a.lower() == "auto":
if isinstance(input_data.design_loc, dask.array.core.Array):
dloc = input_data.design_loc.compute()
else:
dloc = input_data.design_loc
one_hot = len(np.unique(dloc)) == 2 and \
np.abs(np.min(dloc) - 0.) == 0. and \
np.abs(np.max(dloc) - 1.) == 0.
init_a = "standard" if not one_hot else "closed_form"

if init_a.lower() == "closed_form":
groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu(
x=input_data.x,
design_loc=input_data.design_loc,
constraints_loc=input_data.constraints_loc,
size_factors=input_data.size_factors,
link_fn=lambda mu: np.log(mu+np.nextafter(0, 1, dtype=mu.dtype))
)

# train mu, if the closed-form solution is inaccurate
train_loc = not (np.all(np.abs(rmsd_a) < 1e-20) or rmsd_a.size == 0)

if input_data.size_factors is not None:
if np.any(input_data.size_factors != 1):
train_loc = True
elif init_a.lower() == "standard":
overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean
init_a = np.zeros([input_data.num_loc_params, input_data.num_features])
init_a[0, :] = np.log(overall_means)
train_loc = True
elif init_a.lower() == "all_zero":
init_a = np.zeros([input_data.num_loc_params, input_data.num_features])
train_loc = True
else:
raise ValueError("init_a string %s not recognized" % init_a)

if isinstance(init_b, str):
if init_b.lower() == "auto":
init_b = "standard"

if init_b.lower() == "standard":
groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi(
x=input_data.x,
design_scale=input_data.design_scale[:, [0]],
constraints=input_data.constraints_scale[[0], :][:, [0]],
size_factors=input_data.size_factors,
groupwise_means=None,
link_fn=lambda r: np.log(r+np.nextafter(0, 1, dtype=r.dtype))
)
init_b = np.zeros([input_data.num_scale_params, input_data.num_features])
init_b[0, :] = init_b_intercept
elif init_b.lower() == "closed_form":
dmats_unequal = False
if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]:
if np.any(input_data.design_loc != input_data.design_scale):
dmats_unequal = True

inits_unequal = False
if init_a_str is not None:
if init_a_str != init_b:
inits_unequal = True

if inits_unequal or dmats_unequal:
raise ValueError("cannot use closed_form init for scale model " +
"if scale model differs from loc model")

groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi(
x=input_data.x,
design_scale=input_data.design_scale,
constraints=input_data.constraints_scale,
size_factors=input_data.size_factors,
groupwise_means=groupwise_means,
link_fn=lambda r: np.log(r)
)
elif init_b.lower() == "all_zero":
init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]])
else:
raise ValueError("init_b string %s not recognized" % init_b)
else:
# Locations model:
if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"):
my_loc_names = set(input_data.loc_names)
my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names))

init_loc = np.zeros([input_data.num_loc_params, input_data.num_features])
for parm in my_loc_names:
init_idx = np.where(init_model.input_data.loc_names == parm)[0]
my_idx = np.where(input_data.loc_names == parm)[0]
init_loc[my_idx] = init_model.a_var[init_idx]

init_a = init_loc
logging.getLogger("batchglm").debug("Using initialization based on input model for mean")

# Scale model:
if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"):
my_scale_names = set(input_data.scale_names)
my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names)

init_scale = np.zeros([input_data.num_scale_params, input_data.num_features])
for parm in my_scale_names:
init_idx = np.where(init_model.input_data.scale_names == parm)[0]
my_idx = np.where(input_data.scale_names == parm)[0]
init_scale[my_idx] = init_model.b_var[init_idx]

init_b = init_scale
logging.getLogger("batchglm").debug("Using initialization based on input model for dispersion")

return init_a, init_b, train_loc, train_scale
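The closed-form idea from the docstring can be made concrete on toy data: project the link-transformed groupwise means back onto the coefficient space with the Moore-Penrose pseudo-inverse of the design matrix. This is an editor's sketch only, not the batchglm implementation (which uses closedform_nb_glm_logmu and closedform_nb_glm_logphi as shown above):

import numpy as np

# Toy one-hot design: 6 observations, intercept plus one group indicator.
design_loc = np.array([
    [1., 0.],
    [1., 0.],
    [1., 0.],
    [1., 1.],
    [1., 1.],
    [1., 1.],
])
# Toy counts of a single feature (gene).
x = np.array([3., 5., 4., 20., 22., 18.])

# (D . D+) . x projects the observations onto the design space, which for a
# one-hot design equals the groupwise means (here 4 for group 0, 20 for group 1).
group_means = design_loc @ np.linalg.pinv(design_loc) @ x

# theta = D+ . log(groupwise means): map the link-transformed means back to coefficients.
eps = np.nextafter(0, 1, dtype=x.dtype)
theta = np.linalg.pinv(design_loc) @ np.log(group_means + eps)

print(theta)  # approx. [log(4), log(20) - log(4)]: intercept and log fold change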

98 changes: 53 additions & 45 deletions batchglm/train/numpy/base_glm/estimator.py
Expand Up @@ -82,6 +82,11 @@ def train(
"""
# Iterate until conditions are fulfilled.
train_step = 0
if self._train_scale:
if not self._train_loc:
update_b_freq = 1
else:
update_b_freq = np.inf
epochs_until_b_update = update_b_freq
fully_converged = np.tile(False, self.model.model_vars.n_features)

@@ -97,25 +102,29 @@
if epochs_until_b_update == 0:
# Compute update.
idx_update = np.where(np.logical_not(fully_converged))[0]
b_step = self.b_step(
idx_update=idx_update,
method=method_b,
ftol=ftol_b,
lr=lr_b,
max_iter=max_iter_b,
nproc=nproc
)
# Perform trial update.
self.model.b_var = self.model.b_var + b_step
# Reverse update by feature if update leads to worse loss:
ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute()
idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]]
if isinstance(self.model.b_var, dask.array.core.Array):
b_var_new = self.model.b_var.compute()
if self._train_scale:
b_step = self.b_step(
idx_update=idx_update,
method=method_b,
ftol=ftol_b,
lr=lr_b,
max_iter=max_iter_b,
nproc=nproc
)
# Perform trial update.
self.model.b_var = self.model.b_var + b_step
# Reverse update by feature if update leads to worse loss:
ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute()
idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]]
if isinstance(self.model.b_var, dask.array.core.Array):
b_var_new = self.model.b_var.compute()
else:
b_var_new = self.model.b_var.copy()
b_var_new[:, idx_bad_step] = b_var_new[:, idx_bad_step] - b_step[:, idx_bad_step]
self.model.b_var = b_var_new
else:
b_var_new = self.model.b_var.copy()
b_var_new[:, idx_bad_step] = b_var_new[:, idx_bad_step] - b_step[:, idx_bad_step]
self.model.b_var = b_var_new
ll_proposal = ll_current[idx_update]
idx_bad_step = np.array([], dtype=np.int32)
# Update likelihood vector with updated genes based on already evaluated proposal likelihood.
ll_new = ll_current.copy()
ll_new[idx_update] = ll_proposal
@@ -126,18 +135,22 @@
# IWLS step for location model:
# Compute update.
idx_update = self.model.idx_not_converged
a_step = self.iwls_step(idx_update=idx_update)
# Perform trial update.
self.model.a_var = self.model.a_var + a_step
# Reverse update by feature if update leads to worse loss:
ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute()
idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]]
if isinstance(self.model.b_var, dask.array.core.Array):
a_var_new = self.model.a_var.compute()
if self._train_loc:
a_step = self.iwls_step(idx_update=idx_update)
# Perform trial update.
self.model.a_var = self.model.a_var + a_step
# Reverse update by feature if update leads to worse loss:
ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute()
idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]]
if isinstance(self.model.b_var, dask.array.core.Array):
a_var_new = self.model.a_var.compute()
else:
a_var_new = self.model.a_var.copy()
a_var_new[:, idx_bad_step] = a_var_new[:, idx_bad_step] - a_step[:, idx_bad_step]
self.model.a_var = a_var_new
else:
a_var_new = self.model.a_var.copy()
a_var_new[:, idx_bad_step] = a_var_new[:, idx_bad_step] - a_step[:, idx_bad_step]
self.model.a_var = a_var_new
ll_proposal = ll_current[idx_update]
idx_bad_step = np.array([], dtype=np.int32)
# Update likelihood vector with updated genes based on already evaluated proposal likelihood.
ll_new = ll_current.copy()
ll_new[idx_update] = ll_proposal
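Both update blocks above follow the same trial-update-with-rollback pattern: apply the step to all selected features, evaluate the per-feature objective, and revert the step only for features whose objective got worse. A standalone sketch of the pattern (editor's illustration, not part of this diff; the quadratic objective and names are made up):

import numpy as np

def trial_update(params, step, objective, idx_update):
    # Apply `step` for the selected feature columns, then roll back the
    # columns whose per-feature objective (to be minimised) became worse.
    obj_current = objective(params)
    params_new = params.copy()
    params_new[:, idx_update] = params_new[:, idx_update] + step[:, idx_update]
    obj_proposal = objective(params_new)
    idx_bad = idx_update[np.where(obj_proposal[idx_update] > obj_current[idx_update])[0]]
    params_new[:, idx_bad] = params_new[:, idx_bad] - step[:, idx_bad]
    return params_new

# Feature 0 gets a good step, feature 1 a deliberately bad one that is reverted.
objective = lambda p: np.sum(p ** 2, axis=0)  # per-feature objective, shape (n_features,)
params = np.array([[1.0, 1.0]])               # shape (n_params, n_features)
step = np.array([[-0.5, 2.0]])
print(trial_update(params, step, objective, np.array([0, 1])))  # [[0.5 1. ]]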
@@ -273,10 +286,16 @@ def iwls_step(
invertible = np.where(dask.array.map_blocks(
get_cond_number, a, chunks=a.shape
).squeeze().compute() < 1 / sys.float_info.epsilon)[0]
delta_theta[:, idx_update[invertible]] = dask.array.map_blocks(
np.linalg.solve, a[invertible], b[invertible, :, None],
chunks=b[invertible, :, None].shape
).squeeze().T.compute()
if len(idx_update[invertible]) > 1:
delta_theta[:, idx_update[invertible]] = dask.array.map_blocks(
np.linalg.solve, a[invertible], b[invertible, :, None],
chunks=b[invertible, :, None].shape
).squeeze().T.compute()
elif len(idx_update[invertible]) == 1:
delta_theta[:, idx_update[invertible]] = np.expand_dims(
np.linalg.solve(a[invertible[0]], b[invertible[0]]).compute(),
axis=-1
)
else:
if np.linalg.cond(a.compute(), p=None) < 1 / sys.float_info.epsilon:
delta_theta[:, idx_update] = np.expand_dims(
@@ -290,7 +309,7 @@
invertible = np.where(np.linalg.cond(a, p=None) < 1 / sys.float_info.epsilon)[0]
delta_theta[:, idx_update[invertible]] = np.linalg.solve(a[invertible], b[invertible]).T
if invertible.shape[0] < len(idx_update):
print("caught %i linalg singular matrix errors" % (len(idx_update) - invertible.shape[0]))
sys.stdout.write("caught %i linalg singular matrix errors\n" % (len(idx_update) - invertible.shape[0]))
# Via np.linalg.lstsq:
#delta_theta[:, idx_update] = np.concatenate([
# np.expand_dims(np.linalg.lstsq(a[i, :, :], b[i, :])[0], axis=-1)
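The guard added above only solves the per-feature systems whose coefficient matrix is well conditioned, instead of letting numpy raise LinAlgError on singular matrices. A minimal numpy sketch of that guard (editor's illustration, not part of this diff; the toy matrices are made up and the second one is singular on purpose):

import sys
import numpy as np

# Stack of per-feature linear systems a[i] @ delta[i] = b[i].
a = np.stack([
    np.array([[2., 0.], [0., 1.]]),  # well conditioned
    np.array([[1., 1.], [1., 1.]]),  # singular
])
b = np.array([
    [1., 2.],
    [1., 1.],
])

delta_theta = np.zeros((a.shape[1], a.shape[0]))  # (n_params, n_features)
with np.errstate(divide="ignore"):
    cond = np.linalg.cond(a, p=None)              # condition number per feature
invertible = np.where(cond < 1. / sys.float_info.epsilon)[0]
delta_theta[:, invertible] = np.linalg.solve(
    a[invertible], b[invertible][:, :, None]
)[:, :, 0].T
if invertible.shape[0] < a.shape[0]:
    sys.stdout.write("caught %i linalg singular matrix errors\n"
                     % (a.shape[0] - invertible.shape[0]))
print(delta_theta)  # the singular feature's column stays zero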
@@ -537,14 +556,3 @@ def get_model_container(
input_data
):
pass

@abc.abstractmethod
def init_par(
self,
input_data,
init_a,
init_b,
init_model
):
pass
