From 0139db13c7157b4c7b92e3918492a693b9e9a436 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Tue, 25 Feb 2020 13:51:54 +0100 Subject: [PATCH 1/8] fixed dimension bug when single gene was invertible --- batchglm/train/numpy/base_glm/estimator.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/batchglm/train/numpy/base_glm/estimator.py b/batchglm/train/numpy/base_glm/estimator.py index 116a4b20..0aa2de66 100644 --- a/batchglm/train/numpy/base_glm/estimator.py +++ b/batchglm/train/numpy/base_glm/estimator.py @@ -273,10 +273,16 @@ def iwls_step( invertible = np.where(dask.array.map_blocks( get_cond_number, a, chunks=a.shape ).squeeze().compute() < 1 / sys.float_info.epsilon)[0] - delta_theta[:, idx_update[invertible]] = dask.array.map_blocks( - np.linalg.solve, a[invertible], b[invertible, :, None], - chunks=b[invertible, :, None].shape - ).squeeze().T.compute() + if len(idx_update[invertible]) > 1: + delta_theta[:, idx_update[invertible]] = dask.array.map_blocks( + np.linalg.solve, a[invertible], b[invertible, :, None], + chunks=b[invertible, :, None].shape + ).squeeze().T.compute() + elif len(idx_update[invertible]) == 1: + delta_theta[:, idx_update[invertible]] = np.expand_dims( + np.linalg.solve(a[invertible], b[invertible]).compute(), + axis=-1 + ) else: if np.linalg.cond(a.compute(), p=None) < 1 / sys.float_info.epsilon: delta_theta[:, idx_update] = np.expand_dims( From 6d848b22fb2f58e6956a1ad141b7c5aa0e181d3d Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Tue, 25 Feb 2020 17:29:14 +0100 Subject: [PATCH 2/8] fixed bug in last dimensionality fix --- batchglm/train/numpy/base_glm/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/numpy/base_glm/estimator.py b/batchglm/train/numpy/base_glm/estimator.py index 0aa2de66..33b3dff4 100644 --- a/batchglm/train/numpy/base_glm/estimator.py +++ b/batchglm/train/numpy/base_glm/estimator.py @@ -280,7 +280,7 @@ def iwls_step( ).squeeze().T.compute() elif len(idx_update[invertible]) == 1: delta_theta[:, idx_update[invertible]] = np.expand_dims( - np.linalg.solve(a[invertible], b[invertible]).compute(), + np.linalg.solve(a[invertible[0]], b[invertible[0]]).compute(), axis=-1 ) else: From 7fc8597eef4853ba07f6ba1847d5a701e3072e02 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Tue, 25 Feb 2020 22:54:36 +0100 Subject: [PATCH 3/8] moved init par into util of model and out of train specific estimator code --- batchglm/models/base/estimator.py | 10 ++ batchglm/models/glm_nb/utils.py | 144 ++++++++++++++++++ batchglm/train/numpy/base_glm/estimator.py | 84 ++++++----- batchglm/train/numpy/glm_nb/estimator.py | 168 ++------------------- batchglm/train/numpy/glm_nb/external.py | 2 +- batchglm/train/tf1/glm_nb/estimator.py | 163 +------------------- batchglm/train/tf1/glm_nb/external.py | 2 +- 7 files changed, 213 insertions(+), 360 deletions(-) diff --git a/batchglm/models/base/estimator.py b/batchglm/models/base/estimator.py index fd2f1f4a..091cee3a 100644 --- a/batchglm/models/base/estimator.py +++ b/batchglm/models/base/estimator.py @@ -165,6 +165,11 @@ def _plot_coef_vs_ref( from matplotlib import gridspec from matplotlib import rcParams + if isinstance(true_values, dask.array.core.Array): + true_values = true_values.compute() + if isinstance(estim_values, dask.array.core.Array): + estim_values = estim_values.compute() + plt.ioff() n_par = true_values.shape[0] @@ -258,6 +263,11 @@ def _plot_deviation( import seaborn as sns import matplotlib.pyplot as plt + if isinstance(true_values, dask.array.core.Array): + true_values = true_values.compute() + if isinstance(estim_values, dask.array.core.Array): + estim_values = estim_values.compute() + plt.ioff() n_par = true_values.shape[0] diff --git a/batchglm/models/glm_nb/utils.py b/batchglm/models/glm_nb/utils.py index b422ff65..929e50d3 100644 --- a/batchglm/models/glm_nb/utils.py +++ b/batchglm/models/glm_nb/utils.py @@ -1,3 +1,5 @@ +import dask +import logging import numpy as np import scipy.sparse from typing import Union @@ -71,3 +73,145 @@ def compute_scales_fun(variance, mean): inv_link_fn=invlink_fn, compute_scales_fun=compute_scales_fun ) + + +def init_par( + input_data, + init_a, + init_b, + init_model +): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. + + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + + Idea: + $$ + \theta &= f(x) \\ + \Rightarrow f^{-1}(\theta) &= x \\ + &= (D \cdot D^{+}) \cdot x \\ + &= D \cdot (D^{+} \cdot x) \\ + &= D \cdot x' = f^{-1}(\theta) + $$ + """ + train_loc = True + train_scale = True + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + if isinstance(input_data.design_loc, dask.array.core.Array): + dloc = input_data.design_loc.compute() + else: + dloc = input_data.design_loc + one_hot = len(np.unique(dloc)) == 2 and \ + np.abs(np.min(dloc) - 0.) == 0. and \ + np.abs(np.max(dloc) - 1.) == 0. + init_a = "standard" if not one_hot else "closed_form" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=input_data.size_factors, + link_fn=lambda mu: np.log(mu+np.nextafter(0, 1, dtype=mu.dtype)) + ) + + # train mu, if the closed-form solution is inaccurate + train_loc = not (np.all(np.abs(rmsd_a) < 1e-20) or rmsd_a.size == 0) + + if input_data.size_factors is not None: + if np.any(input_data.size_factors != 1): + train_loc = True + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means) + train_loc = True + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + train_loc = True + else: + raise ValueError("init_a string %s not recognized" % init_a) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=input_data.size_factors, + groupwise_means=None, + link_fn=lambda r: np.log(r+np.nextafter(0, 1, dtype=r.dtype)) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError("cannot use closed_form init for scale model " + + "if scale model differs from loc model") + + groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=input_data.size_factors, + groupwise_means=groupwise_means, + link_fn=lambda r: np.log(r) + ) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]]) + else: + raise ValueError("init_b string %s not recognized" % init_b) + else: + # Locations model: + if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): + my_loc_names = set(input_data.loc_names) + my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) + + init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) + for parm in my_loc_names: + init_idx = np.where(init_model.input_data.loc_names == parm)[0] + my_idx = np.where(input_data.loc_names == parm)[0] + init_loc[my_idx] = init_model.a_var[init_idx] + + init_a = init_loc + logging.getLogger("batchglm").debug("Using initialization based on input model for mean") + + # Scale model: + if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): + my_scale_names = set(input_data.scale_names) + my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) + + init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) + for parm in my_scale_names: + init_idx = np.where(init_model.input_data.scale_names == parm)[0] + my_idx = np.where(input_data.scale_names == parm)[0] + init_scale[my_idx] = init_model.b_var[init_idx] + + init_b = init_scale + logging.getLogger("batchglm").debug("Using initialization based on input model for dispersion") + + return init_a, init_b, train_loc, train_scale + diff --git a/batchglm/train/numpy/base_glm/estimator.py b/batchglm/train/numpy/base_glm/estimator.py index 33b3dff4..e9f978d5 100644 --- a/batchglm/train/numpy/base_glm/estimator.py +++ b/batchglm/train/numpy/base_glm/estimator.py @@ -82,6 +82,11 @@ def train( """ # Iterate until conditions are fulfilled. train_step = 0 + if self._train_scale: + if not self._train_loc: + update_b_freq = 1 + else: + update_b_freq = np.inf epochs_until_b_update = update_b_freq fully_converged = np.tile(False, self.model.model_vars.n_features) @@ -97,25 +102,29 @@ def train( if epochs_until_b_update == 0: # Compute update. idx_update = np.where(np.logical_not(fully_converged))[0] - b_step = self.b_step( - idx_update=idx_update, - method=method_b, - ftol=ftol_b, - lr=lr_b, - max_iter=max_iter_b, - nproc=nproc - ) - # Perform trial update. - self.model.b_var = self.model.b_var + b_step - # Reverse update by feature if update leads to worse loss: - ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute() - idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]] - if isinstance(self.model.b_var, dask.array.core.Array): - b_var_new = self.model.b_var.compute() + if self._train_scale: + b_step = self.b_step( + idx_update=idx_update, + method=method_b, + ftol=ftol_b, + lr=lr_b, + max_iter=max_iter_b, + nproc=nproc + ) + # Perform trial update. + self.model.b_var = self.model.b_var + b_step + # Reverse update by feature if update leads to worse loss: + ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute() + idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]] + if isinstance(self.model.b_var, dask.array.core.Array): + b_var_new = self.model.b_var.compute() + else: + b_var_new = self.model.b_var.copy() + b_var_new[:, idx_bad_step] = b_var_new[:, idx_bad_step] - b_step[:, idx_bad_step] + self.model.b_var = b_var_new else: - b_var_new = self.model.b_var.copy() - b_var_new[:, idx_bad_step] = b_var_new[:, idx_bad_step] - b_step[:, idx_bad_step] - self.model.b_var = b_var_new + ll_proposal = ll_current + idx_bad_step = np.array([], dtype=np.int32) # Update likelihood vector with updated genes based on already evaluated proposal likelihood. ll_new = ll_current.copy() ll_new[idx_update] = ll_proposal @@ -126,18 +135,22 @@ def train( # IWLS step for location model: # Compute update. idx_update = self.model.idx_not_converged - a_step = self.iwls_step(idx_update=idx_update) - # Perform trial update. - self.model.a_var = self.model.a_var + a_step - # Reverse update by feature if update leads to worse loss: - ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute() - idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]] - if isinstance(self.model.b_var, dask.array.core.Array): - a_var_new = self.model.a_var.compute() + if self._train_loc: + a_step = self.iwls_step(idx_update=idx_update) + # Perform trial update. + self.model.a_var = self.model.a_var + a_step + # Reverse update by feature if update leads to worse loss: + ll_proposal = - self.model.ll_byfeature_j(j=idx_update).compute() + idx_bad_step = idx_update[np.where(ll_proposal > ll_current[idx_update])[0]] + if isinstance(self.model.b_var, dask.array.core.Array): + a_var_new = self.model.a_var.compute() + else: + a_var_new = self.model.a_var.copy() + a_var_new[:, idx_bad_step] = a_var_new[:, idx_bad_step] - a_step[:, idx_bad_step] + self.model.a_var = a_var_new else: - a_var_new = self.model.a_var.copy() - a_var_new[:, idx_bad_step] = a_var_new[:, idx_bad_step] - a_step[:, idx_bad_step] - self.model.a_var = a_var_new + ll_proposal = ll_current + idx_bad_step = np.array([], dtype=np.int32) # Update likelihood vector with updated genes based on already evaluated proposal likelihood. ll_new = ll_current.copy() ll_new[idx_update] = ll_proposal @@ -296,7 +309,7 @@ def iwls_step( invertible = np.where(np.linalg.cond(a, p=None) < 1 / sys.float_info.epsilon)[0] delta_theta[:, idx_update[invertible]] = np.linalg.solve(a[invertible], b[invertible]).T if invertible.shape[0] < len(idx_update): - print("caught %i linalg singular matrix errors" % (len(idx_update) - invertible.shape[0])) + sys.stdout.write("caught %i linalg singular matrix errors" % (len(idx_update) - invertible.shape[0])) # Via np.linalg.lsts: #delta_theta[:, idx_update] = np.concatenate([ # np.expand_dims(np.linalg.lstsq(a[i, :, :], b[i, :])[0], axis=-1) @@ -543,14 +556,3 @@ def get_model_container( input_data ): pass - - @abc.abstractmethod - def init_par( - self, - input_data, - init_a, - init_b, - init_model - ): - pass - diff --git a/batchglm/train/numpy/glm_nb/estimator.py b/batchglm/train/numpy/glm_nb/estimator.py index 03c95196..b3a345de 100644 --- a/batchglm/train/numpy/glm_nb/estimator.py +++ b/batchglm/train/numpy/glm_nb/estimator.py @@ -1,9 +1,9 @@ -import logging from typing import Tuple, Union import numpy as np +import sys from .external import InputDataGLM, Model, EstimatorGlm -from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi +from .external import init_par from .vars import ModelVars from .model import ModelIwlsNb @@ -56,20 +56,20 @@ def __init__( Useful in scenarios where fitting the exact `scale` is not absolutely necessary. :param dtype: Numerical precision. """ - - self._train_loc = True - self._train_scale = True - - (init_a, init_b) = self.init_par( + init_a, init_b, train_loc, train_scale = init_par( input_data=input_data, init_a=init_a, init_b=init_b, init_model=None ) - init_a = init_a.astype(dtype) - init_b = init_b.astype(dtype) + self._train_loc = train_loc + self._train_scale = train_scale if quick_scale: self._train_scale = False + sys.stdout.write("training location model: %s\n" % str(self._train_loc)) + sys.stdout.write("training scale model: %s\n" % str(self._train_scale)) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) self.model_vars = ModelVars( init_a=init_a, @@ -97,153 +97,3 @@ def get_model_container( input_data ): return Model(input_data=input_data) - - def init_par( - self, - input_data, - init_a, - init_b, - init_model - ): - r""" - standard: - Only initialise intercept and keep other coefficients as zero. - - closed-form: - Initialize with Maximum Likelihood / Maximum of Momentum estimators - - Idea: - $$ - \theta &= f(x) \\ - \Rightarrow f^{-1}(\theta) &= x \\ - &= (D \cdot D^{+}) \cdot x \\ - &= D \cdot (D^{+} \cdot x) \\ - &= D \cdot x' = f^{-1}(\theta) - $$ - """ - if init_model is None: - groupwise_means = None - init_a_str = None - if isinstance(init_a, str): - init_a_str = init_a.lower() - # Chose option if auto was chosen - if init_a.lower() == "auto": - init_a = "standard" - - if init_a.lower() == "closed_form": - groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu( - x=input_data.x, - design_loc=input_data.design_loc, - constraints_loc=input_data.constraints_loc, - size_factors=input_data.size_factors, - link_fn=lambda mu: np.log(mu+np.nextafter(0, 1, dtype=mu.dtype)) - ) - - # train mu, if the closed-form solution is inaccurate - self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) - - if input_data.size_factors is not None: - if np.any(input_data.size_factors != 1): - self._train_loc = True - - logging.getLogger("batchglm").debug("Using closed-form MLE initialization for mean") - logging.getLogger("batchglm").debug("Should train mu: %s", self._train_loc) - elif init_a.lower() == "standard": - overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean - - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - init_a[0, :] = np.log(overall_means) - self._train_loc = True - - logging.getLogger("batchglm").debug("Using standard initialization for mean") - logging.getLogger("batchglm").debug("Should train mu: %s", self._train_loc) - elif init_a.lower() == "all_zero": - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - self._train_loc = True - - logging.getLogger("batchglm").debug("Using all_zero initialization for mean") - logging.getLogger("batchglm").debug("Should train mu: %s", self._train_loc) - else: - raise ValueError("init_a string %s not recognized" % init_a) - - if isinstance(init_b, str): - if init_b.lower() == "auto": - init_b = "standard" - - if init_b.lower() == "standard": - groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi( - x=input_data.x, - design_scale=input_data.design_scale[:, [0]], - constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=input_data.size_factors, - groupwise_means=None, - link_fn=lambda r: np.log(r+np.nextafter(0, 1, dtype=r.dtype)) - ) - init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) - init_b[0, :] = init_b_intercept - - logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - elif init_b.lower() == "closed_form": - dmats_unequal = False - if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: - if np.any(input_data.design_loc != input_data.design_scale): - dmats_unequal = True - - inits_unequal = False - if init_a_str is not None: - if init_a_str != init_b: - inits_unequal = True - - if inits_unequal or dmats_unequal: - raise ValueError("cannot use closed_form init for scale model " + - "if scale model differs from loc model") - - groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi( - x=input_data.x, - design_scale=input_data.design_scale, - constraints=input_data.constraints_scale, - size_factors=input_data.size_factors, - groupwise_means=groupwise_means, - link_fn=lambda r: np.log(r) - ) - - logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - elif init_b.lower() == "all_zero": - init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]]) - - logging.getLogger("batchglm").debug("Using standard initialization for dispersion") - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - else: - raise ValueError("init_b string %s not recognized" % init_b) - else: - # Locations model: - if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): - my_loc_names = set(input_data.loc_names) - my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) - - init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) - for parm in my_loc_names: - init_idx = np.where(init_model.input_data.loc_names == parm)[0] - my_idx = np.where(input_data.loc_names == parm)[0] - init_loc[my_idx] = init_model.a_var[init_idx] - - init_a = init_loc - logging.getLogger("batchglm").debug("Using initialization based on input model for mean") - - # Scale model: - if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): - my_scale_names = set(input_data.scale_names) - my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) - - init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) - for parm in my_scale_names: - init_idx = np.where(init_model.input_data.scale_names == parm)[0] - my_idx = np.where(input_data.scale_names == parm)[0] - init_scale[my_idx] = init_model.b_var[init_idx] - - init_b = init_scale - logging.getLogger("batchglm").debug("Using initialization based on input model for dispersion") - - return init_a, init_b diff --git a/batchglm/train/numpy/glm_nb/external.py b/batchglm/train/numpy/glm_nb/external.py index d9b43d55..f868fe0a 100644 --- a/batchglm/train/numpy/glm_nb/external.py +++ b/batchglm/train/numpy/glm_nb/external.py @@ -2,7 +2,7 @@ from batchglm.models.glm_nb import _EstimatorGLM, InputDataGLM, Model from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale -from batchglm.models.glm_nb.utils import closedform_nb_glm_logmu, closedform_nb_glm_logphi +from batchglm.models.glm_nb.utils import init_par from batchglm.utils.linalg import groupwise_solve_lm from batchglm import pkg_constants diff --git a/batchglm/train/tf1/glm_nb/estimator.py b/batchglm/train/tf1/glm_nb/estimator.py index 61d9d713..65f0c592 100644 --- a/batchglm/train/tf1/glm_nb/estimator.py +++ b/batchglm/train/tf1/glm_nb/estimator.py @@ -8,7 +8,7 @@ tf = None from .external import TFEstimatorGLM, InputDataGLM, Model -from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi +from .external import init_par from .estimator_graph import EstimatorGraph from .model import ProcessModel from .training_strategies import TrainingStrategies @@ -109,15 +109,14 @@ def __init__( self.TrainingStrategies = TrainingStrategies self._input_data = input_data - self._train_loc = True - self._train_scale = True - - (init_a, init_b) = self.init_par( + init_a, init_b, train_loc, train_scale = init_par( input_data=input_data, init_a=init_a, init_b=init_b, - init_model=init_model + init_model=None ) + self._train_loc = train_loc + self._train_scale = train_scale init_a = init_a.astype(dtype) init_b = init_b.astype(dtype) if quick_scale: @@ -151,155 +150,3 @@ def get_model_container( input_data ): return Model(input_data=input_data) - - def init_par( - self, - input_data, - init_a, - init_b, - init_model - ): - r""" - standard: - Only initialise intercept and keep other coefficients as zero. - - closed-form: - Initialize with Maximum Likelihood / Maximum of Momentum estimators - - Idea: - $$ - \theta &= f(x) \\ - \Rightarrow f^{-1}(\theta) &= x \\ - &= (D \cdot D^{+}) \cdot x \\ - &= D \cdot (D^{+} \cdot x) \\ - &= D \cdot x' = f^{-1}(\theta) - $$ - """ - - if init_model is None: - groupwise_means = None - init_a_str = None - if isinstance(init_a, str): - init_a_str = init_a.lower() - # Chose option if auto was chosen - if init_a.lower() == "auto": - init_a = "standard" - - if init_a.lower() == "closed_form": - groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu( - x=input_data.x, - design_loc=input_data.design_loc, - constraints_loc=input_data.constraints_loc, - size_factors=input_data.size_factors, - link_fn=lambda mu: np.log(self.np_clip_param(mu, "mu")) - ) - - # train mu, if the closed-form solution is inaccurate - self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) - - if input_data.size_factors is not None: - if np.any(input_data.size_factors != 1): - self._train_loc = True - - logging.getLogger("batchglm").debug("Using closed-form MLE initialization for mean") - logging.getLogger("batchglm").debug("Should train mu: %s", self._train_loc) - elif init_a.lower() == "standard": - overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean - overall_means = self.np_clip_param(overall_means, "mu") - - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - init_a[0, :] = np.log(overall_means) - self._train_loc = True - - logging.getLogger("batchglm").debug("Using standard initialization for mean") - logging.getLogger("batchglm").debug("Should train mu: %s", self._train_loc) - elif init_a.lower() == "all_zero": - init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) - self._train_loc = True - - logging.getLogger("batchglm").debug("Using all_zero initialization for mean") - logging.getLogger("batchglm").debug("Should train mu: %s", self._train_loc) - else: - raise ValueError("init_a string %s not recognized" % init_a) - - if isinstance(init_b, str): - if init_b.lower() == "auto": - init_b = "standard" - - if init_b.lower() == "standard": - groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi( - x=input_data.x, - design_scale=input_data.design_scale[:, [0]], - constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=input_data.size_factors, - groupwise_means=None, - link_fn=lambda r: np.log(self.np_clip_param(r, "r")) - ) - init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) - init_b[0, :] = init_b_intercept - - logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - elif init_b.lower() == "closed_form": - dmats_unequal = False - if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: - if np.any(input_data.design_loc != input_data.design_scale): - dmats_unequal = True - - inits_unequal = False - if init_a_str is not None: - if init_a_str != init_b: - inits_unequal = True - - if inits_unequal or dmats_unequal: - raise ValueError("cannot use closed_form init for scale model " + - "if scale model differs from loc model") - - groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi( - x=input_data.x, - design_scale=input_data.design_scale, - constraints=input_data.constraints_scale, - size_factors=input_data.size_factors, - groupwise_means=groupwise_means, - link_fn=lambda r: np.log(self.np_clip_param(r, "r")) - ) - - logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - elif init_b.lower() == "all_zero": - init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]]) - - logging.getLogger("batchglm").debug("Using standard initialization for dispersion") - logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) - else: - raise ValueError("init_b string %s not recognized" % init_b) - else: - # Locations model: - if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): - my_loc_names = set(input_data.loc_names) - my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) - - init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) - for parm in my_loc_names: - init_idx = np.where(init_model.input_data.loc_names == parm)[0] - my_idx = np.where(input_data.loc_names == parm)[0] - init_loc[my_idx] = init_model.a_var[init_idx] - - init_a = init_loc - logging.getLogger("batchglm").debug("Using initialization based on input model for mean") - - # Scale model: - if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): - my_scale_names = set(input_data.scale_names) - my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) - - init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) - for parm in my_scale_names: - init_idx = np.where(init_model.input_data.scale_names == parm)[0] - my_idx = np.where(input_data.scale_names == parm)[0] - init_scale[my_idx] = init_model.b_var[init_idx] - - init_b = init_scale - logging.getLogger("batchglm").debug("Using initialization based on input model for dispersion") - - return init_a, init_b diff --git a/batchglm/train/tf1/glm_nb/external.py b/batchglm/train/tf1/glm_nb/external.py index 5c34fd77..5f04c9cf 100644 --- a/batchglm/train/tf1/glm_nb/external.py +++ b/batchglm/train/tf1/glm_nb/external.py @@ -2,7 +2,7 @@ from batchglm.models.glm_nb import _EstimatorGLM, InputDataGLM, Model from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale -from batchglm.models.glm_nb.utils import closedform_nb_glm_logmu, closedform_nb_glm_logphi +from batchglm.models.glm_nb.utils import init_par import batchglm.train.tf1.train as train_utils from batchglm.train.tf1.base import TFEstimatorGraph From 1100e1a39da463817d769364871332da9c080a14 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Tue, 25 Feb 2020 22:56:07 +0100 Subject: [PATCH 4/8] added missing return statement --- batchglm/train/numpy/base_glm/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/numpy/base_glm/estimator.py b/batchglm/train/numpy/base_glm/estimator.py index e9f978d5..c8dc96cf 100644 --- a/batchglm/train/numpy/base_glm/estimator.py +++ b/batchglm/train/numpy/base_glm/estimator.py @@ -309,7 +309,7 @@ def iwls_step( invertible = np.where(np.linalg.cond(a, p=None) < 1 / sys.float_info.epsilon)[0] delta_theta[:, idx_update[invertible]] = np.linalg.solve(a[invertible], b[invertible]).T if invertible.shape[0] < len(idx_update): - sys.stdout.write("caught %i linalg singular matrix errors" % (len(idx_update) - invertible.shape[0])) + sys.stdout.write("caught %i linalg singular matrix errors\n" % (len(idx_update) - invertible.shape[0])) # Via np.linalg.lsts: #delta_theta[:, idx_update] = np.concatenate([ # np.expand_dims(np.linalg.lstsq(a[i, :, :], b[i, :])[0], axis=-1) From eae5c985ef8acc072bf0f392d1d4e0ee4a072903 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 26 Feb 2020 12:04:12 +0100 Subject: [PATCH 5/8] allowed overriding train args set in trianing strategy via train_kwargs --- batchglm/models/base/estimator.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/batchglm/models/base/estimator.py b/batchglm/models/base/estimator.py index 091cee3a..dc1e3bf1 100644 --- a/batchglm/models/base/estimator.py +++ b/batchglm/models/base/estimator.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pprint +import sys try: import anndata @@ -112,6 +113,14 @@ def train_sequence( logger.debug("training strategy:\n%s", pprint.pformat(training_strategy)) for idx, d in enumerate(training_strategy): logger.debug("Beginning with training sequence #%d", idx + 1) + # Override duplicate arguments with user choice: + if np.any([x in list(d.keys()) for x in list(kwargs.keys())]): + d = dict([(x, y) for x, y in d.items() if x not in list(kwargs.keys())]) + for x in [xx for xx in list(d.keys()) if xx in list(kwargs.keys())]: + sys.stdout.write( + "overrding %s from training strategy with value %s with new value %s\n" % + (x, str(d[x]), str(kwargs[x])) + ) self.train(**d, **kwargs) logger.debug("Training sequence #%d complete", idx + 1) From a646d950d93a5b8090cdf1a80a2f9298c1b1b5e4 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 26 Feb 2020 12:04:53 +0100 Subject: [PATCH 6/8] added function to bin continuous covariates --- batchglm/api/data.py | 2 +- batchglm/data.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/batchglm/api/data.py b/batchglm/api/data.py index ae022f34..65a5f3d2 100644 --- a/batchglm/api/data.py +++ b/batchglm/api/data.py @@ -1,4 +1,4 @@ from batchglm.data import design_matrix from batchglm.data import constraint_matrix_from_dict, constraint_matrix_from_string, string_constraints_from_dict, \ constraint_system_from_star -from batchglm.data import view_coef_names, preview_coef_names +from batchglm.data import view_coef_names, preview_coef_names, bin_continuous_covariate diff --git a/batchglm/data.py b/batchglm/data.py index f4711721..d76a7f7d 100644 --- a/batchglm/data.py +++ b/batchglm/data.py @@ -453,3 +453,31 @@ def constraint_matrix_from_string( ) return constraint_mat + + +def bin_continuous_covariate( + sample_description: pd.DataFrame, + factor_to_bin: str, + bins: Union[int, list, np.ndarray, Tuple] +): + r""" + Bin a continuous covariate. + + Adds the binned covariate to the table. Binning is performed on quantiles of the distribution. + + :param sample_description: Sample description table. + :param factor_to_bin: Name of columns of factor to bin. + :param bins: Number of bins or iteratable with bin borders. If given as integer, the bins are defined on the + quantiles of the covariate, ie the bottom 20% of observations are in the first bin if bins==5. + :return: Sample description table with binned covariate added. + """ + if isinstance(bins, list) or isinstance(bins, np.ndarray) or isinstance(bins, Tuple): + bins = np.asarray(bins) + else: + bins = np.arange(0, 1, 1 / bins) + + sample_description[factor_to_bin + "_binned"] = np.digitize( + np.argsort(np.argsort(sample_description[factor_to_bin].values)) / sample_description.shape[0], + bins + ) + return sample_description From 4f19e21b48e60f8fa3b3ed809e869fff93b80233 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 26 Feb 2020 12:05:12 +0100 Subject: [PATCH 7/8] increased ocndition nbumbver allowed before warning is thrown in init --- batchglm/utils/linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/utils/linalg.py b/batchglm/utils/linalg.py index 54b6eafa..0560ea21 100644 --- a/batchglm/utils/linalg.py +++ b/batchglm/utils/linalg.py @@ -77,7 +77,7 @@ def apply_fun(grouping): unique_design = dask.array.from_array(unique_design, chunks=unique_design.shape) else: unique_design, inverse_idx = np.unique(dmat, axis=0, return_inverse=True) - if unique_design.shape[0] > 100: + if unique_design.shape[0] > 500: raise ValueError("large least-square problem in init, likely defined a numeric predictor as categorical") full_rank = constraints.shape[1] From d3287b37cad8a8aed75966f39636c3f6eca17b51 Mon Sep 17 00:00:00 2001 From: "david.seb.fischer" Date: Wed, 26 Feb 2020 12:05:40 +0100 Subject: [PATCH 8/8] fixed bug in training of only loc or scale model in numpy --- batchglm/train/numpy/base_glm/estimator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/train/numpy/base_glm/estimator.py b/batchglm/train/numpy/base_glm/estimator.py index c8dc96cf..1ad26326 100644 --- a/batchglm/train/numpy/base_glm/estimator.py +++ b/batchglm/train/numpy/base_glm/estimator.py @@ -123,7 +123,7 @@ def train( b_var_new[:, idx_bad_step] = b_var_new[:, idx_bad_step] - b_step[:, idx_bad_step] self.model.b_var = b_var_new else: - ll_proposal = ll_current + ll_proposal = ll_current[idx_update] idx_bad_step = np.array([], dtype=np.int32) # Update likelihood vector with updated genes based on already evaluated proposal likelihood. ll_new = ll_current.copy() @@ -149,7 +149,7 @@ def train( a_var_new[:, idx_bad_step] = a_var_new[:, idx_bad_step] - a_step[:, idx_bad_step] self.model.a_var = a_var_new else: - ll_proposal = ll_current + ll_proposal = ll_current[idx_update] idx_bad_step = np.array([], dtype=np.int32) # Update likelihood vector with updated genes based on already evaluated proposal likelihood. ll_new = ll_current.copy()