From 3b9777652a48795423ec565d080a64b82d66eda2 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 19:03:13 -0700 Subject: [PATCH 01/44] using family information as linearopertator --- glmnet/base.py | 23 +++-------------- glmnet/cox.py | 47 +++++++++++++++++++++++++++++++++-- glmnet/glm.py | 67 +++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 104 insertions(+), 33 deletions(-) diff --git a/glmnet/base.py b/glmnet/base.py index 950ca0d..e9f7e94 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -126,27 +126,10 @@ def quadratic_form(self, G_sum = n else: - G = np.asarray(G) - if G.ndim == 2: - if np.linalg.norm(G-G.T)/np.linalg.norm(G) > 1e-3: - warnings.warn('G should be symmetric, using (G+G.T)/2') - G = (G + G.T) / 2 - GX = G @ X_R - X1_block = self.X.T @ G.sum(0) - elif G.ndim == 1: - if not np.all(G >= 0): - raise ValueError('weights should be non-negative') - if not scipy.sparse.issparse(self.X): - GX = G[:,None] * X_R - else: - GX = scipy.sparse.diags(G) @ X_R - X1_block = self.X.T @ G - else: - raise ValueError("G should be 1-dim (treated as diagonal) or 2-dim") - if scipy.sparse.issparse(GX): - GX = GX.toarray() + GX = G @ X_R + G1 = G @ np.ones(G.shape[0]) XX_block = self.X.T @ GX - G_sum = G.sum() + G_sum = G1.sum() if scipy.sparse.issparse(XX_block): XX_block = XX_block.toarray() diff --git a/glmnet/cox.py b/glmnet/cox.py index 7d55ee0..643c1c0 100644 --- a/glmnet/cox.py +++ b/glmnet/cox.py @@ -5,6 +5,8 @@ import numpy as np import pandas as pd +from scipy.stats import norm as normal_dbn + from sklearn.utils import check_X_y from sklearn.base import BaseEstimator @@ -41,7 +43,8 @@ def update(self, self.link_parameter = self.linear_predictor else: self.link_parameter = self.linear_predictor + offset - + self.mean_parameter = self.link_parameter + # shorthand self.mu = self.link_parameter self.eta = self.linear_predictor @@ -121,6 +124,7 @@ def null_fit(self, def get_null_deviance(self, y, sample_weight, + offset, # ignored for Cox fit_intercept): mu0 = self.null_fit(y, sample_weight, fit_intercept) return mu0, self.deviance(y, mu0, sample_weight) @@ -153,6 +157,11 @@ def get_response_and_weights(self, return pseudo_response, newton_weights + def information(self, + state, + sample_weight): + return self._coxdev.information(state.link_parameter, + sample_weight) @dataclass class CoxLM(GLM): @@ -181,6 +190,41 @@ def _check(self, check=check, multi_output=True) + def _summarize(self, + exclude, + dispersion, + sample_weight, + X_shape): + + # IRLS used normalized weights, + # this unnormalizes them... + + unscaled_precision_ = self.design_.quadratic_form(self._information) + + keep = np.ones(unscaled_precision_.shape[0]-1, bool) + if exclude is not []: + keep[exclude] = 0 + keep = np.hstack([self.fit_intercept, keep]).astype(bool) + covariance_ = dispersion * np.linalg.inv(unscaled_precision_[keep][:,keep]) + + SE = np.sqrt(np.diag(covariance_)) + index = self.feature_names_in_ + if self.fit_intercept: + coef = np.hstack([self.intercept_, self.coef_]) + T = np.hstack([self.intercept_ / SE[0], self.coef_ / SE[1:]]) + index = ['intercept'] + index + else: + coef = self.coef_ + T = self.coef_ / SE + + summary_ = pd.DataFrame({'coef':coef, + 'std err': SE, + 'z': T, + 'P>|z|': 2 * normal_dbn.sf(np.fabs(T))}, + index=index) + return covariance_, summary_ + + @dataclass class RegCoxLM(RegGLM): @@ -236,7 +280,6 @@ def _get_family_spec(self, def _get_initial_state(self, X, y, - sample_weight, exclude): n, p = X.shape diff --git a/glmnet/glm.py b/glmnet/glm.py index 5be19c6..278e308 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -10,6 +10,7 @@ import pandas as pd import scipy.sparse +from scipy.sparse.linalg import LinearOperator from scipy.stats import norm as normal_dbn from scipy.stats import t as t_dbn @@ -174,6 +175,46 @@ def _accuracy_score(y, yhat, sample_weight): # for binary data classifying at p= return scorers_ + def information(self, + state, + sample_weight=None): + + family = self.base + + # some checks for NAs/zeros + varmu = family.variance(state.mu) + if np.any(np.isnan(varmu)): raise ValueError("NAs in V(mu)") + + if np.any(varmu == 0): raise ValueError("0s in V(mu)") + + dmu_deta = family.link.inverse_deriv(state.link_parameter) + if np.any(np.isnan(dmu_deta)): raise ValueError("NAs in d(mu)/d(eta)") + + W = dmu_deta**2 / varmu + if sample_weight is not None: + W *= sample_weight + + n = W.shape[0] + W = W.reshape(-1) + return DiagonalOperator(W) + +@dataclass +class DiagonalOperator(LinearOperator): + + weights: np.ndarray + + def __post_init__(self): + self.weights = np.asarray(self.weights).reshape(-1) + n = self.weights.shape[0] + self.shape = (n, n) + + def _matvec(self, arg): + return self.weights * arg.reshape(-1) + + def _adjoint(self, arg): + return self._matvec(arg) + + @add_dataclass_docstring @dataclass class GLMControl(object): @@ -452,17 +493,20 @@ def obj_function(response, (converged, boundary, state, - self._final_weights) = IRLS(regularizer, - self._family, - design, - response, - offset, - normed_sample_weight, - state, - obj_function, - self.control) - + _) = IRLS(regularizer, + self._family, + design, + response, + offset, + normed_sample_weight, + state, + obj_function, + self.control) + if self.summarize: + self._information = self._family.information(state, + sample_weight) + # checks on convergence and fitted values if not converged: if self.control.logging: logging.debug("Fitting IRLS: algorithm did not converge") @@ -622,7 +666,8 @@ def _summarize(self, # IRLS used normalized weights, # this unnormalizes them... - unscaled_precision_ = self.design_.quadratic_form(self._final_weights * sample_weight.sum()) + + unscaled_precision_ = self.design_.quadratic_form(self._information) keep = np.ones(unscaled_precision_.shape[0]-1, bool) if exclude is not []: From c3feb6c974f7afb0f288cbb1938c3c6feb424ed5 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 19:27:41 -0700 Subject: [PATCH 02/44] initial commit of inference --- glmnet/inference.py | 1088 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1088 insertions(+) create mode 100644 glmnet/inference.py diff --git a/glmnet/inference.py b/glmnet/inference.py new file mode 100644 index 0000000..b56312d --- /dev/null +++ b/glmnet/inference.py @@ -0,0 +1,1088 @@ +""" +This module contains the core code needed for post selection +inference based on affine selection procedures as +described in the papers `Kac Rice`_, `Spacings`_, `covTest`_ +and `post selection LASSO`_. + +.. _covTest: http://arxiv.org/abs/1301.7161 +.. _Kac Rice: http://arxiv.org/abs/1308.3020 +.. _Spacings: http://arxiv.org/abs/1401.3889 +.. _post selection LASSO: http://arxiv.org/abs/1311.6238 +.. _sample carving: http://arxiv.org/abs/1410.2597 + +""" + +from warnings import warn +from copy import copy + +import numpy as np + +from .discrete_family import discrete_family +from mpmath import mp + +WARNINGS = False + +class constraints(object): + + r""" + This class is the core object for affine selection procedures. + It is meant to describe sets of the form $C$ + where + + .. math:: + + C = \left\{z: Az\leq b \right \} + + Its main purpose is to consider slices through $C$ + and the conditional distribution of a Gaussian $N(\mu,\Sigma)$ + restricted to such slices. + + Notes + ----- + + In this parameterization, the parameter `self.mean` corresponds + to the *reference measure* that is being truncated. It is not the + mean of the truncated Gaussian. + + >>> positive = constraints(-np.identity(2), np.zeros(2)) + >>> Y = np.array([3, 4.4]) + >>> eta = np.array([1, 1], np.float) + >>> list(positive.interval(eta, Y)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + [4.62..., 10.17...] + >>> positive.pivot(eta, Y) # doctest: +ELLIPSIS + 5.187...-07 + >>> list(positive.bounds(eta, Y)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + [1.399..., 7.4..., inf, 1.414...] + >>> + + """ + + def __init__(self, + linear_part, + offset, + covariance=None, + mean=None, + rank=None): + r""" + Create a new inequality. + + Parameters + ---------- + + linear_part : np.float((q,p)) + The linear part, $A$ of the affine constraint + $\{z:Az \leq b\}$. + + offset: np.float(q) + The offset part, $b$ of the affine constraint + $\{z:Az \leq b\}$. + + covariance : np.float((p,p)) + Covariance matrix of Gaussian distribution to be + truncated. Defaults to `np.identity(self.dim)`. + + mean : np.float(p) + Mean vector of Gaussian distribution to be + truncated. Defaults to `np.zeros(self.dim)`. + + rank : int + If not None, this should specify + the rank of the covariance matrix. Defaults + to self.dim. + + """ + + self.linear_part, self.offset = \ + linear_part, np.asarray(offset) + + if self.linear_part.ndim == 2: + self.dim = self.linear_part.shape[1] + else: + self.dim = self.linear_part.shape[0] + + if rank is None: + self.rank = self.dim + else: + self.rank = rank + + if covariance is None: + covariance = np.identity(self.dim) + self.covariance = covariance + + if mean is None: + mean = np.zeros(self.dim) + self.mean = mean + + def __copy__(self): + r""" + A copy of the constraints. + + Also copies _sqrt_cov, _sqrt_inv if attributes are present. + """ + con = constraints(copy(self.linear_part), + copy(self.offset), + mean=copy(self.mean), + covariance=copy(self.covariance), + rank=self.rank) + if hasattr(self, "_sqrt_cov"): + con._sqrt_cov = self._sqrt_cov.copy() + con._sqrt_inv = self._sqrt_inv.copy() + con._rowspace = self._rowspace.copy() + return con + + def __call__(self, Y, tol=1.e-3): + r""" + Check whether Y satisfies the linear + inequality constraints. + >>> A = np.array([[1., -1.], [1., -1.]]) + >>> B = np.array([1., 1.]) + >>> con = constraints(A, B) + >>> Y = np.array([-1., 1.]) + >>> con(Y) + True + """ + V1 = self.value(Y) + return np.all(V1 < tol * np.fabs(V1).max(0)) + + def value(self, Y): + r""" + Compute $\max(Ay-b)$. + """ + return (self.linear_part.dot(Y) - self.offset).max() + + def conditional(self, + linear_part, + value, + rank=None): + """ + Return an equivalent constraint + after having conditioned on a linear equality. + + Let the inequality constraints be specified by + `(A,b)` and the equality constraints be specified + by `(C,d)`. We form equivalent inequality constraints by + considering the residual + + .. math:: + + AY - E(AY|CY=d) + + Parameters + ---------- + + linear_part : np.float((k,q)) + Linear part of equality constraint, `C` above. + + value : np.float(k) + Value of equality constraint, `b` above. + + rank : int + If not None, this should specify + the rank of `linear_part`. Defaults + to `min(k,q)`. + + Returns + ------- + + conditional_con : `constraints` + Affine constraints having applied equality constraint. + + """ + + S = self.covariance + C, d = linear_part, value + + M1 = S.dot(C.T) + M2 = C.dot(M1) + + if M2.shape: + M2i = np.linalg.pinv(M2) + delta_cov = M1.dot(M2i.dot(M1.T)) + delta_mean = M1.dot(M2i.dot(C.dot(self.mean) - d)) + else: + delta_cov = np.multiply.outer(M1, M1) / M2 + delta_mean = M1 * (C.dot(self.mean) - d) / M2 + + if rank is None: + if len(linear_part.shape) == 2: + rank = min(linear_part.shape) + else: + rank = 1 + + return constraints(self.linear_part, + self.offset, + covariance=self.covariance - delta_cov, + mean=self.mean - delta_mean, + rank=self.rank - rank) + + def bounds(self, direction_of_interest, Y): + r""" + For a realization $Y$ of the random variable $N(\mu,\Sigma)$ + truncated to $C$ specified by `self.constraints` compute + the slice of the inequality constraints in a + given direction $\eta$. + + Parameters + ---------- + + direction_of_interest: np.float + A direction $\eta$ for which we may want to form + selection intervals or a test. + + Y : np.float + A realization of $N(\mu,\Sigma)$ where + $\Sigma$ is `self.covariance`. + + Returns + ------- + + L : np.float + Lower truncation bound. + + Z : np.float + The observed $\eta^TY$ + + U : np.float + Upper truncation bound. + + S : np.float + Standard deviation of $\eta^TY$. + + + """ + return interval_constraints(self.linear_part, + self.offset, + self.covariance, + Y, + direction_of_interest) + + def pivot(self, + direction_of_interest, + Y, + null_value=None, + alternative='greater'): + r""" + For a realization $Y$ of the random variable $N(\mu,\Sigma)$ + truncated to $C$ specified by `self.constraints` compute + the slice of the inequality constraints in a + given direction $\eta$ and test whether + $\eta^T\mu$ is greater then 0, less than 0 or equal to 0. + + Parameters + ---------- + + direction_of_interest: np.float + A direction $\eta$ for which we may want to form + selection intervals or a test. + + Y : np.float + A realization of $N(0,\Sigma)$ where + $\Sigma$ is `self.covariance`. + + alternative : ['greater', 'less', 'twosided'] + What alternative to use. + + Returns + ------- + + P : np.float + $p$-value of corresponding test. + + Notes + ----- + + All of the tests are based on the exact pivot $F$ given + by the truncated Gaussian distribution for the + given direction $\eta$. If the alternative is 'greater' + then we return $1-F$; if it is 'less' we return $F$ + and if it is 'twosided' we return $2 \min(F,1-F)$. + + """ + + if alternative not in ['greater', 'less', 'twosided']: + raise ValueError("alternative should be one of ['greater', 'less', 'twosided']") + L, Z, U, S = self.bounds(direction_of_interest, Y) + + if null_value is None: + meanZ = (direction_of_interest * self.mean).sum() + else: + meanZ = null_value + + P = truncnorm_cdf((Z-meanZ)/S, (L-meanZ)/S, (U-meanZ)/S) + + if alternative == 'greater': + return 1 - P + elif alternative == 'less': + return P + else: + return max(2 * min(P, 1-P), 0) + + def interval(self, direction_of_interest, Y, + alpha=0.05, UMAU=False): + r""" + For a realization $Y$ of the random variable $N(\mu,\Sigma)$ + truncated to $C$ specified by `self.constraints` compute + the slice of the inequality constraints in a + given direction $\eta$ and test whether + $\eta^T\mu$ is greater then 0, less than 0 or equal to 0. + + Parameters + ---------- + + direction_of_interest: np.float + + A direction $\eta$ for which we may want to form + selection intervals or a test. + + Y : np.float + + A realization of $N(0,\Sigma)$ where + $\Sigma$ is `self.covariance`. + + alpha : float + + What level of confidence? + + UMAU : bool + + Use the UMAU intervals? + + Returns + ------- + + [U,L] : selection interval + + + """ + ## THE DOCUMENTATION IS NOT GOOD ! HAS TO BE CHANGED ! + + return selection_interval( \ + self.linear_part, + self.offset, + self.covariance, + Y, + direction_of_interest, + alpha=alpha, + UMAU=UMAU) + + def covariance_factors(self, force=True): + """ + Factor `self.covariance`, + finding a possibly non-square square-root. + + Parameters + ---------- + + force : bool + If True, force a recomputation of + the covariance. If not, assumes that + covariance has not changed. + + """ + if not hasattr(self, "_sqrt_cov") or force: + + # original matrix is np.dot(U, (D**2 * U).T) + + U, D = np.linalg.svd(self.covariance)[:2] + D = np.sqrt(D[:self.rank]) + U = U[:,:self.rank] + + self._sqrt_cov = U * D[None,:] + self._sqrt_inv = (U / D[None,:]).T + self._rowspace = U + + return self._sqrt_cov, self._sqrt_inv, self._rowspace + + def whiten(self): + """ + + Return a whitened version of constraints in a different + basis, and a change of basis matrix. + + If `self.covariance` is rank deficient, the change-of + basis matrix will not be square. + + Returns + ------- + + inverse_map : callable + + forward_map : callable + + white_con : `constraints` + + """ + sqrt_cov, sqrt_inv = self.covariance_factors()[:2] + + new_A = self.linear_part.dot(sqrt_cov) + den = np.sqrt((new_A**2).sum(1)) + new_b = self.offset - self.linear_part.dot(self.mean) + new_con = constraints(new_A / den[:,None], new_b / den) + + mu = self.mean.copy() + + def inverse_map(Z): + if Z.ndim == 2: + return sqrt_cov.dot(Z) + mu[:,None] + else: + return sqrt_cov.dot(Z) + mu + + forward_map = lambda W: sqrt_inv.dot(W - mu) + + return inverse_map, forward_map, new_con + + def project_rowspace(self, direction): + """ + Project a vector onto rowspace + of the covariance. + """ + rowspace = self.covariance_factors()[-1] + return rowspace.dot(rowspace.T.dot(direction)) + + def solve(self, direction): + """ + Compute the inverse of the covariance + times a direction vector. + """ + sqrt_inv = self.covariance_factors()[1] + return sqrt_inv.T.dot(sqrt_inv.dot(direction)) + +def stack(*cons): + """ + Combine constraints into a large constaint + by intersection. + + Parameters + ---------- + + cons : [`selection.affine.constraints`_] + A sequence of constraints. + + Returns + ------- + + intersection : `selection.affine.constraints`_ + + Notes + ----- + + Resulting constraint will have mean 0 and covariance $I$. + + """ + ineq, ineq_off = [], [] + for con in cons: + ineq.append(con.linear_part) + ineq_off.append(con.offset) + + intersection = constraints(np.vstack(ineq), + np.hstack(ineq_off)) + return intersection + +def interval_constraints(support_directions, + support_offsets, + covariance, + observed_data, + direction_of_interest, + tol = 1.e-4): + r""" + Given an affine constraint $\{z:Az \leq b \leq \}$ (elementwise) + specified with $A$ as `support_directions` and $b$ as + `support_offset`, a new direction of interest $\eta$, and + an `observed_data` is Gaussian vector $Z \sim N(\mu,\Sigma)$ + with `covariance` matrix $\Sigma$, this + function returns $\eta^TZ$ as well as an interval + bounding this value. + + The interval constructed is such that the endpoints are + independent of $\eta^TZ$, hence the $p$-value + of `Kac Rice`_ + can be used to form an exact pivot. + + Parameters + ---------- + + support_directions : np.float + Matrix specifying constraint, $A$. + + support_offsets : np.float + Offset in constraint, $b$. + + covariance : np.float + Covariance matrix of `observed_data`. + + observed_data : np.float + Observations. + + direction_of_interest : np.float + Direction in which we're interested for the + contrast. + + tol : float + Relative tolerance parameter for deciding + sign of $Az-b$. + + Returns + ------- + + lower_bound : float + + observed : float + + upper_bound : float + + sigma : float + + """ + + # shorthand + A, b, S, X, w = (support_directions, + support_offsets, + covariance, + observed_data, + direction_of_interest) + + U = A.dot(X) - b + if not np.all(U < tol * np.fabs(U).max()) and WARNINGS: + warn('constraints not satisfied: %s' % repr(U)) + + Sw = S.dot(w) + sigma = np.sqrt((w*Sw).sum()) + alpha = A.dot(Sw) / sigma**2 + V = (w*X).sum() # \eta^TZ + + # adding the zero_coords in the denominator ensures that + # there are no divide-by-zero errors in RHS + # these coords are never used in upper_bound or lower_bound + + zero_coords = alpha == 0 + RHS = (-U + V * alpha) / (alpha + zero_coords) + RHS[zero_coords] = np.nan + + pos_coords = alpha > tol * np.fabs(alpha).max() + if np.any(pos_coords): + upper_bound = RHS[pos_coords].min() + else: + upper_bound = np.inf + neg_coords = alpha < -tol * np.fabs(alpha).max() + if np.any(neg_coords): + lower_bound = RHS[neg_coords].max() + else: + lower_bound = -np.inf + + return lower_bound, V, upper_bound, sigma + +def selection_interval(support_directions, + support_offsets, + covariance, + noisy_observation, + observation, + direction_of_interest, + percent_smoothing, + tol = 1.e-4, + level = 0.90, + UMAU=True): + """ + Given an affine in cone constraint $\{z:Az+b \leq 0\}$ (elementwise) + specified with $A$ as `support_directions` and $b$ as + `support_offset`, a new direction of interest $\eta$, and + `noisy_observation` is Gaussian vector $Z \sim N(\mu,\Sigma)$ + with `covariance` matrix $\Sigma$, this + function returns a confidence interval + for $\eta^T\mu$. + + Parameters + ---------- + + support_directions : np.float + Matrix specifying constraint, $A$. + + support_offset : np.float + Offset in constraint, $b$. + + covariance : np.float + Covariance matrix of `observed_data`. + + noisy_observation : np.float + Observations. + + observation : np.float + Observations. + + direction_of_interest : np.float + Direction in which we're interested for the + contrast. + + tol : float + Relative tolerance parameter for deciding + sign of $Az-b$. + + Returns + ------- + + confidence_interval : (float, float) + + """ + + (lower_bound, + noisy_estimate, + upper_bound, + sigma) = interval_constraints(support_directions, + support_offsets, + covariance, + noisy_observation, + direction_of_interest, + tol=tol) + + grid = np.linspace(lower_bound - 4 * sigma, upper_bound + 4 * sigma, 801) + weight = normal_dbn.cdf((upper_bound - grid) / sigma) - normal_dbn.cdf((lower_bound - grid) / sigma) + estimate = (direction_of_interest * observation).sum() + return discrete_family(grid, weight).equal_tailed_interval(estimate, + alpha=1-level) + +def find_root(f, y, lb, ub, tol=1e-6): + """ + searches for solution to f(x) = y in (lb, ub), where + f is a monotone decreasing function + """ + + # make sure solution is in range + a, b = lb, ub + fa, fb = f(a), f(b) + + # assume a < b + if fa > y and fb > y: + while fb > y : + b, fb = b + (b-a), f(b + (b-a)) + elif fa < y and fb < y: + while fa < y : + a, fa = a - (b-a), f(a - (b-a)) + + + # determine the necessary number of iterations + try: + max_iter = int( np.ceil( ( np.log(tol) - np.log(b-a) ) / np.log(0.5) ) ) + except OverflowError: + warnings.warn('root finding failed, returning np.nan') + return np.nan + + + # bisect (slow but sure) until solution is obtained + for _ in range(max_iter): + try: + c, fc = (a+b)/2, f((a+b)/2) + if fc > y: a = c + elif fc < y: b = c + except OverflowError: + warnings.warn('root finding failed, returning np.nan') + return np.nan + + return c + +class discrete_family(object): + + def __init__(self, sufficient_stat, weights, theta=0.): + r""" + A discrete 1-dimensional + exponential family with reference measure $\sum_j w_j \delta_{X_j}$ + and sufficient statistic `sufficient_stat`. For any $\theta$, the distribution + is + + .. math:: + + P_{\theta} = \sum_{j} e^{\theta X_j - \Lambda(\theta)} w_j \delta_{X_j} + + where + + .. math:: + + \Lambda(\theta) = \log \left(\sum_j w_j e^{\theta X_j} \right). + + Parameters + ---------- + + sufficient_stat : `np.float((n))` + + weights : `np.float(n)` + + Notes + ----- + + The weights are normalized to sum to 1. + """ + xw = np.array(sorted(zip(sufficient_stat, weights)), np.float) + self._x = xw[:,0] + self._w = xw[:,1] + self._lw = np.log(xw[:,1]) + self._w /= self._w.sum() # make sure they are a pmf + self.n = len(xw) + self._theta = np.nan + self.theta = theta + + @property + def theta(self): + """ + The natural parameter of the family. + """ + return self._theta + + @theta.setter + def theta(self, _theta): + if _theta != self._theta: + _thetaX = _theta * self.sufficient_stat + self._lw + _largest = _thetaX.max() - 5 # try to avoid over/under flow, 5 seems arbitrary + _exp_thetaX = np.exp(_thetaX - _largest) + _prod = _exp_thetaX + self._partition = np.sum(_prod) + self._pdf = _prod / self._partition + self._partition *= np.exp(_largest) + self._theta = _theta + + @property + def partition(self): + r""" + Partition function at `self.theta`: + + .. math:: + + \sum_j e^{\theta X_j} w_j + """ + if hasattr(self, "_partition"): + return self._partition + + @property + def sufficient_stat(self): + """ + Sufficient statistics of the exponential family. + """ + return self._x + + @property + def weights(self): + """ + Weights of the exponential family. + """ + return self._w + + def pdf(self, theta): + r""" + Density of $P_{\theta}$ with respect to $P_0$. + + Parameters + ---------- + + theta : float + Natural parameter. + + Returns + ------- + + pdf : np.float + + """ + self.theta = theta # compute partition if necessary + return self._pdf + + def cdf(self, theta, x=None, gamma=1): + r""" + The cumulative distribution function of $P_{\theta}$ with + weight `gamma` at `x` + + .. math:: + + P_{\theta}(X < x) + \gamma * P_{\theta}(X = x) + + Parameters + ---------- + + theta : float + Natural parameter. + + x : float (optional) + Where to evaluate CDF. + + gamma : float(optional) + Weight given at `x`. + + Returns + ------- + + cdf : np.float + + """ + pdf = self.pdf(theta) + if x is None: + return np.cumsum(pdf) - pdf * (1 - gamma) + else: + tr = np.sum(pdf * (self.sufficient_stat < x)) + if x in self.sufficient_stat: + tr += gamma * np.sum(pdf[np.where(self.sufficient_stat == x)]) + return tr + + def sf(self, theta, x=None, gamma=0, return_unnorm=False): + r""" + The complementary cumulative distribution function + (i.e. survival function) of $P_{\theta}$ with + weight `gamma` at `x` + + .. math:: + + P_{\theta}(X > x) + \gamma * P_{\theta}(X = x) + + Parameters + ---------- + + theta : float + Natural parameter. + + x : float (optional) + Where to evaluate SF. + + gamma : float(optional) + Weight given at `x`. + + Returns + ------- + + sf : np.float + + """ + pdf = self.pdf(theta) + if x is None: + return np.cumsum(pdf[::-1])[::-1] - pdf * (1 - gamma) + else: + tr = np.sum(pdf * (self.sufficient_stat > x)) + if x in self.sufficient_stat: + tr += gamma * np.sum(pdf[np.where(self.sufficient_stat == x)]) + return tr + + def E(self, theta, func): + r""" + Expectation of `func` under $P_{\theta}$ + + Parameters + ---------- + + theta : float + Natural parameter. + + func : callable + Assumed to be vectorized. + + gamma : float(optional) + Weight given at `x`. + + Returns + ------- + + E : np.float + + """ + T = np.asarray(func(self.sufficient_stat)) + pdf_ = self.pdf(theta) + + if T.ndim == 1: + return (T * pdf_).sum() + else: + val = (T * pdf_[:,None]).sum(0) + return val + + + def Var(self, theta, func): + r""" + Variance of `func` under $P_{\theta}$ + + Parameters + ---------- + + theta : float + Natural parameter. + + func : callable + Assumed to be vectorized. + + Returns + ------- + + var : np.float + + """ + + mu = self.E(theta, func) + return self.E(theta, lambda x: (func(x)-mu)**2) + + def Cov(self, theta, func1, func2): + r""" + Covariance of `func1` and `func2` under $P_{\theta}$ + + Parameters + ---------- + + theta : float + Natural parameter. + + func1, func2 : callable + Assumed to be vectorized. + + Returns + ------- + + cov : np.float + + """ + + mu1 = self.E(theta, func1) + mu2 = self.E(theta, func2) + return self.E(theta, lambda x: (func1(x)-mu1)*(func2(x)-mu2)) + + def equal_tailed_interval(self, + observed, + alpha=0.05, + randomize=True, + auxVar=None, + tol=1e-6): + """ + Form interval by inverting + equal-tailed test with $\alpha/2$ in each tail. + + Parameters + ---------- + + observed : float + Observed sufficient statistic. + + alpha : float (optional) + Size of two-sided test. + + randomize : bool + Perform the randomized test (or conservative test). + + auxVar : [None, float] + If randomizing and not None, use this + as the random uniform variate. + + Returns + ------- + + lower, upper : float + Limits of confidence interval. + + """ + + mu = self.E(self.theta, lambda x: x) + sigma = np.sqrt(self.Var(self.theta, lambda x: x)) + lb = mu - 20 * sigma + ub = mu + 20 * sigma + F = lambda th : self.cdf(th, observed) + L = find_root(F, 1.0 - 0.5 * alpha, lb, ub) + U = find_root(F, 0.5 * alpha, lb, ub) + return L, U + + def equal_tailed_test(self, + theta0, + observed, + alpha=0.05): + r""" + Equal tailed test of H_0:theta=theta_0 + + Parameters + ---------- + + theta0 : float + Natural parameter under null hypothesis. + + observed : float + Observed sufficient statistic. + + alpha : float (optional) + Size of two-sided test. + + randomize : bool + Perform the randomized test (or conservative test). + + Returns + ------- + + decision : np.bool + Is the null hypothesis $H_0:\theta=\theta_0$ rejected? + + Notes + ----- + + We need an auxiliary uniform variable to carry out the randomized test. + Larger auxVar corresponds to x being slightly "larger." It can be passed in, + or chosen at random. If randomize=False, we get a conservative test. + """ + + pval = self.cdf(theta0, observed, gamma=0.5) + return min(pval, 1-pval) < alpha + + def MLE(self, + observed, + initial=0, + max_iter=20, + tol=1.e-4): + + r""" + Compute the maximum likelihood estimator + based on observed sufficient statistic `observed`. + + Parameters + ---------- + + observed : float + Observed value of sufficient statistic + + initial : float + Starting point for Newton-Raphson + + max_iter : int (optional) + Maximum number of Newton-Raphson iterations + + tol : float (optional) + Tolerance parameter for stopping, based + on relative change in parameter estimate. + Iteration stops when the change is smaller + than `tol * max(1, np.fabs(cur_estimate))`. + + Returns + ------- + + theta_hat : float + Maximum likelihood estimator. + + std_err : float + Estimated variance of `theta_hat` based + on inverse of variance of sufficient + statistic at `theta_hat`, i.e. the + observed Fisher information. + + """ + + cur_est = initial + + def first_two_moments(x): + return np.array([x, x**2]).T + + for i in range(max_iter): + cur_moments = self.E(cur_est, first_two_moments) # gradient and + # Hessian of CGF + # (almost) + grad, hessian = (cur_moments[0] - observed, + cur_moments[1] - cur_moments[0]**2) + next_est = cur_est - grad / hessian # newton step + + if np.fabs(next_est - cur_est) < tol * max(1, np.fabs(cur_est)): + break + cur_est = next_est + + if i == max_iter - 1: + warnings.warn('Newton-Raphson failed to converge after %d iterations' % max_iter) + + cur_moments = self.E(cur_est, first_two_moments) # gradient and + # Hessian of CGF + # (almost) + grad, hessian = (cur_moments[0] - observed, + cur_moments[1] - cur_moments[0]**2) + + return cur_est, 1. / hessian, grad + From 6da6b65633374d7eeb98c9221cbe8f95b7f9fc86 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 20:38:14 -0700 Subject: [PATCH 03/44] retain a reference to state --- glmnet/glm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/glmnet/glm.py b/glmnet/glm.py index 5be19c6..4f3214f 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -473,9 +473,7 @@ def obj_function(response, state.mean_parameter, sample_weight) # not the normalized weights! - if offset is None: - offset = np.zeros(y.shape[0]) - + self.state_ = state self._set_coef_intercept(state) if (hasattr(self._family, "base") and From b34b385700555f7323a27bccba5b8b331f944573 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 22:19:24 -0700 Subject: [PATCH 04/44] WIP: exact smoothed polyhedral --- glmnet/inference.py | 54 ++++++++++++++++++++++++++++++++++++--- glmnet/regularized_glm.py | 2 +- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index b56312d..a9d5bc7 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -14,13 +14,59 @@ from warnings import warn from copy import copy - import numpy as np -from .discrete_family import discrete_family -from mpmath import mp +def lasso_inference(glmnet, + lambda_val, + selection_data, + full_data): + + fixed_lambda = glmnet.fixed_lambda_estimator(lambda_val) + X_sel, Y_sel, weight_sel = selection_data + fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) + FL = fixed_lambda # shorthand + + if (not np.all(FL.upper_limits == FL.control.big) or + not np.all(FL.lower_limits == -FL.control.big)): + raise NotImplementedError('upper/lower limits coming soon') + + active_set = np.nonzero(FL.coef != 0)[0] + state = FL.state_ + information = FL._family.information(state, + sample_weight) -WARNINGS = False + Q_E = FL._design.quadratic_form(information, + columns=active_set) + + C_E = np.linalg.inv(Q_E) + keep = np.zeros(Q_E.shape[0]) + penfac = FL.penalty_factors[active_set] + sings = np.sign(FL.coef_[active_set]) + if FL.fit_intercept: + penfac = np.hstack([0, penfac]) + signs = np.hstack([0, signs]) + keep[0] = 1 + keep[1 + active_set] = 1 + Q_E = Q_E[keep] + stacked = np.hstack([state.intercept_, + state.coef_[active_set]]) + else: + keep[active_set] = 1 + Q_E = Q_E[keep] + stacked = state.coef_[active_set] + + C_E = np.linalg.inv(Q_E) + delta = np.zeros(Q_E.shape[0]) + delta[1:] = lambda_val * penfac * signs + delta = C_E @ delta + noisy_mle = stacked + C_E @ neg_score + + penalized = penfac > 0 + sel_P = -signs[penalized][:,None] * np.eye(C_E.shape[0])[penalized] + con = constraints(sel_P, + -signs[penalized] * delta[penalized], + covariance=C_E) # adjust + return con class constraints(object): diff --git a/glmnet/regularized_glm.py b/glmnet/regularized_glm.py index 9d1e294..2285e68 100644 --- a/glmnet/regularized_glm.py +++ b/glmnet/regularized_glm.py @@ -113,7 +113,7 @@ def newton_step(self, warm = (cur_state.coef, cur_state.intercept, - cur_state.eta) # just X\beta -- doesn't include offset + cur_state.linear_parameter) # just X\beta -- doesn't include offset elnet_fit = self.elnet_estimator.fit(design, z, From 8294e7274f4ed7cd18e703dc2b983aaa2003c85f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 22:23:00 -0700 Subject: [PATCH 05/44] WIP: put fixed lambda in inference module --- glmnet/glmnet.py | 2 -- glmnet/inference.py | 27 ++++++++++++++++----------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index 49621e8..b767ab7 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -463,5 +463,3 @@ def _get_initial_state(self, intercept_ = 0 return GLMState(coef_, intercept_), keep.astype(float) - - diff --git a/glmnet/inference.py b/glmnet/inference.py index a9d5bc7..121c1bf 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -1,14 +1,6 @@ """ This module contains the core code needed for post selection -inference based on affine selection procedures as -described in the papers `Kac Rice`_, `Spacings`_, `covTest`_ -and `post selection LASSO`_. - -.. _covTest: http://arxiv.org/abs/1301.7161 -.. _Kac Rice: http://arxiv.org/abs/1308.3020 -.. _Spacings: http://arxiv.org/abs/1401.3889 -.. _post selection LASSO: http://arxiv.org/abs/1311.6238 -.. _sample carving: http://arxiv.org/abs/1410.2597 +after LASSO. """ @@ -16,12 +8,25 @@ from copy import copy import numpy as np -def lasso_inference(glmnet, +def fixed_lambda_estimator(glmnet_obj, + lambda_val): + check_is_fitted(glmnet_obj, ["coefs_", "feature_names_in_"]) + estimator = clone(glmnet_obj.reg_glm_est_) + estimator.lambda_val = lambda_val + coefs, intercepts = glmnet_obj.interpolate_coefs([lambda_val]) + cls = glmnet_obj.state_.__class__ + state = cls(coefs[0], intercepts[0]) + estimator.regularizer_ = glmnet_obj.reg_glm_est_.regularizer_ + estimator.regularizer_.warm_state = state + + return estimator + +def lasso_inference(glmnet_obj, lambda_val, selection_data, full_data): - fixed_lambda = glmnet.fixed_lambda_estimator(lambda_val) + fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) X_sel, Y_sel, weight_sel = selection_data fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) FL = fixed_lambda # shorthand From c1971a85c195b7306581a5d9dd5fd9d1859ff831 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 23:05:40 -0700 Subject: [PATCH 06/44] fitting on selection, refit on GLM; added a method to get an LM for a given GLMNet object -- need to do for fastpaths --- glmnet/base.py | 1 + glmnet/cox.py | 11 +++++---- glmnet/docstrings.py | 5 +++++ glmnet/glm.py | 3 ++- glmnet/glmnet.py | 9 ++++++++ glmnet/inference.py | 47 ++++++++++++++++++++++++++------------- glmnet/paths/fastnet.py | 4 ++++ glmnet/regularized_glm.py | 9 +++++++- 8 files changed, 68 insertions(+), 21 deletions(-) diff --git a/glmnet/base.py b/glmnet/base.py index e9f7e94..534bc5a 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -129,6 +129,7 @@ def quadratic_form(self, GX = G @ X_R G1 = G @ np.ones(G.shape[0]) XX_block = self.X.T @ GX + X1_block = self.X.T @ G1 G_sum = G1.sum() if scipy.sparse.issparse(XX_block): diff --git a/glmnet/cox.py b/glmnet/cox.py index 643c1c0..5333d72 100644 --- a/glmnet/cox.py +++ b/glmnet/cox.py @@ -277,6 +277,12 @@ def _get_family_spec(self, status_id=self.family.status_id, start_id=self.family.start_id) + def get_LM(self): + return CoxLM(family=self.family, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id) + def _get_initial_state(self, X, y, @@ -292,10 +298,7 @@ def _get_initial_state(self, if keep.sum() > 0: X_keep = X[:,keep] - coxlm = CoxLM(family=self.family, - offset_id=self.offset_id, - weight_id=self.weight_id, - response_id=self.response_id) + coxlm = self.get_LM() coxlm.fit(X_keep, y) coef_[keep] = coxlm.coef_ diff --git a/glmnet/docstrings.py b/glmnet/docstrings.py index 8baf97d..d7e08c5 100644 --- a/glmnet/docstrings.py +++ b/glmnet/docstrings.py @@ -12,6 +12,11 @@ # Data class containing GLMNet control parameters. # ''', + 'intercept':''' +intercept: + For a Design, is there an intercept? + ''', + 'nlambda':''' nlambda: int Number of values on data-dependent grid of lambda values. diff --git a/glmnet/glm.py b/glmnet/glm.py index dda3fd7..620e629 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -503,6 +503,8 @@ def obj_function(response, obj_function, self.control) + self.state_ = state + if self.summarize: self._information = self._family.information(state, sample_weight) @@ -517,7 +519,6 @@ def obj_function(response, state.mean_parameter, sample_weight) # not the normalized weights! - self.state_ = state self._set_coef_intercept(state) if (hasattr(self._family, "base") and diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index b767ab7..099b186 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -184,6 +184,8 @@ def fit(self, check=False, fit_null=False) + self.state_ = self.reg_glm_est_.state_ + coefs_.append(self.reg_glm_est_.coef_.copy()) intercepts_.append(self.reg_glm_est_.intercept_) dev_ratios_.append(1 - self.reg_glm_est_.deviance_ / self.null_deviance_) @@ -463,3 +465,10 @@ def _get_initial_state(self, intercept_ = 0 return GLMState(coef_, intercept_), keep.astype(float) + def get_LM(self): + return GLM(family=self.family, + fit_intercept=self.fit_intercept, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id) + diff --git a/glmnet/inference.py b/glmnet/inference.py index 121c1bf..1ba3e5e 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -7,6 +7,10 @@ from warnings import warn from copy import copy import numpy as np +import pandas as pd + +from sklearn.utils.validation import check_is_fitted +from sklearn.base import clone def fixed_lambda_estimator(glmnet_obj, lambda_val): @@ -31,46 +35,59 @@ def lasso_inference(glmnet_obj, fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) FL = fixed_lambda # shorthand - if (not np.all(FL.upper_limits == FL.control.big) or - not np.all(FL.lower_limits == -FL.control.big)): + if (not np.all(FL.upper_limits == np.inf) or + not np.all(FL.lower_limits == -np.inf)): raise NotImplementedError('upper/lower limits coming soon') - active_set = np.nonzero(FL.coef != 0)[0] + active_set = np.nonzero(FL.coef_ != 0)[0] state = FL.state_ information = FL._family.information(state, - sample_weight) + weight_sel) - Q_E = FL._design.quadratic_form(information, + Q_E = FL.design_.quadratic_form(information, columns=active_set) - - C_E = np.linalg.inv(Q_E) - keep = np.zeros(Q_E.shape[0]) - penfac = FL.penalty_factors[active_set] - sings = np.sign(FL.coef_[active_set]) + keep = np.zeros(Q_E.shape[0], bool) + if hasattr(FL, 'penalty_factors'): + penfac = FL.penalty_factors[active_set] + else: + penfac = np.ones(active_set.shape[0]) + signs = np.sign(FL.coef_[active_set]) if FL.fit_intercept: penfac = np.hstack([0, penfac]) signs = np.hstack([0, signs]) keep[0] = 1 keep[1 + active_set] = 1 Q_E = Q_E[keep] - stacked = np.hstack([state.intercept_, - state.coef_[active_set]]) + stacked = np.hstack([state.intercept, + state.coef[active_set]]) else: keep[active_set] = 1 Q_E = Q_E[keep] - stacked = state.coef_[active_set] + stacked = state.coef[active_set] C_E = np.linalg.inv(Q_E) delta = np.zeros(Q_E.shape[0]) - delta[1:] = lambda_val * penfac * signs + delta[1:] = lambda_val * penfac[1:] * signs[1:] delta = C_E @ delta - noisy_mle = stacked + C_E @ neg_score + noisy_mle = stacked + C_E @ delta penalized = penfac > 0 sel_P = -signs[penalized][:,None] * np.eye(C_E.shape[0])[penalized] con = constraints(sel_P, -signs[penalized] * delta[penalized], covariance=C_E) # adjust + + # fit full model + + X_full, Y_full, weight_full = full_data + + unreg_LM = glmnet_obj.get_LM() + unreg_LM.summarize = True + unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) + C_full = unreg_LM.covariance_ + + selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_E).sum(), 0, 1) + print(selection_proportion) return con class constraints(object): diff --git a/glmnet/paths/fastnet.py b/glmnet/paths/fastnet.py index 5246565..e3f8c0f 100644 --- a/glmnet/paths/fastnet.py +++ b/glmnet/paths/fastnet.py @@ -11,6 +11,7 @@ from ..base import _get_design from ..glmnet import GLMNet +from ..glm import GLMState from ..elnet import (_check_and_set_limits, _check_and_set_vp, _design_wrapper_args) @@ -128,6 +129,9 @@ def fit(self, self.coefs_ = result['coefs'] self.intercepts_ = result['intercepts'] + self.state_ = GLMState(self.coefs_[-1], + self.intercepts_[-1]) + self.lambda_values_ = result['lambda_values'] nfits = self.lambda_values_.shape[0] dev_ratios_ = self._fit['dev'][:nfits] diff --git a/glmnet/regularized_glm.py b/glmnet/regularized_glm.py index 2285e68..f21bcb6 100644 --- a/glmnet/regularized_glm.py +++ b/glmnet/regularized_glm.py @@ -113,7 +113,7 @@ def newton_step(self, warm = (cur_state.coef, cur_state.intercept, - cur_state.linear_parameter) # just X\beta -- doesn't include offset + cur_state.linear_predictor) # just X\beta -- doesn't include offset elnet_fit = self.elnet_estimator.fit(design, z, @@ -141,6 +141,13 @@ class RegGLM(GLM, control: RegGLMControl = field(default_factory=RegGLMControl) + def get_LM(self): + return GLM(family=self.family, + fit_intercept=self.fit_intercept, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id) + def _get_regularizer(self, X): From a60a88b8d92cb4a5425438f1b6a49df92ccaba46 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 26 Oct 2023 23:32:50 -0700 Subject: [PATCH 07/44] refit on selected to get better estimate of variance at mle, not lasso --- glmnet/inference.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 1ba3e5e..13ae587 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -86,7 +86,11 @@ def lasso_inference(glmnet_obj, unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) C_full = unreg_LM.covariance_ - selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_E).sum(), 0, 1) + unreg_sel_LM = glmnet_obj.get_LM() + unreg_sel_LM.summarize = True + unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) + C_sel = unreg_sel_LM.covariance_ + selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) print(selection_proportion) return con From 2d5c165699187dcd6f0f3397983a1cadfb219c38 Mon Sep 17 00:00:00 2001 From: kevinbfry <15952367+kevinbfry@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:42:30 -0700 Subject: [PATCH 08/44] lasso_inference runs, but intervals wrong --- glmnet/inference.py | 94 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 21 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 13ae587..28a2218 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -4,10 +4,12 @@ """ +import warnings from warnings import warn from copy import copy import numpy as np import pandas as pd +from scipy.stats import norm as normal_dbn from sklearn.utils.validation import check_is_fitted from sklearn.base import clone @@ -28,7 +30,8 @@ def fixed_lambda_estimator(glmnet_obj, def lasso_inference(glmnet_obj, lambda_val, selection_data, - full_data): + full_data, + level=.9): fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) X_sel, Y_sel, weight_sel = selection_data @@ -43,7 +46,7 @@ def lasso_inference(glmnet_obj, state = FL.state_ information = FL._family.information(state, weight_sel) - + Q_E = FL.design_.quadratic_form(information, columns=active_set) keep = np.zeros(Q_E.shape[0], bool) @@ -57,16 +60,19 @@ def lasso_inference(glmnet_obj, signs = np.hstack([0, signs]) keep[0] = 1 keep[1 + active_set] = 1 + print("fitting intercept") + # C_E_active = C_E[keep] Q_E = Q_E[keep] stacked = np.hstack([state.intercept, state.coef[active_set]]) else: keep[active_set] = 1 - Q_E = Q_E[keep] + # C_E_active = C_E[keep] + Q_E = Q_E[keep][:,1:] stacked = state.coef[active_set] C_E = np.linalg.inv(Q_E) - delta = np.zeros(Q_E.shape[0]) + delta = np.zeros(C_E.shape[0]) delta[1:] = lambda_val * penfac[1:] * signs[1:] delta = C_E @ delta noisy_mle = stacked + C_E @ delta @@ -89,10 +95,40 @@ def lasso_inference(glmnet_obj, unreg_sel_LM = glmnet_obj.get_LM() unreg_sel_LM.summarize = True unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) + print(unreg_sel_LM.summary_) C_sel = unreg_sel_LM.covariance_ - selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) - print(selection_proportion) - return con + # selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) + + ## TODO: will this handle fit_intercept? + if FL.fit_intercept: + stacked = np.hstack([unreg_LM.intercept_, + unreg_LM.coef_]) + else: + full_mle = unreg_LM.coef_ + + ## iterate over coordinates + Ls = np.zeros_like(noisy_mle) + Us = np.zeros_like(noisy_mle) + for i in range(len(noisy_mle)): + e_i = np.zeros_like(noisy_mle) + e_i[i] = 1. + ## call selection_interval and return + # print(selection_proportion) + L, U = selection_interval( + support_directions=con.linear_part, + support_offsets=con.offset, + covariance_noisy=C_sel, + covariance_full=C_full, + noisy_observation=noisy_mle, + observation=full_mle, + direction_of_interest=e_i, + # tol=, + level=level, + # UMAU=, + ) + Ls[i] = L + Us[i] = U + return (Ls, Us) class constraints(object): @@ -422,7 +458,7 @@ def interval(self, direction_of_interest, Y, Returns ------- - [U,L] : selection interval + [L,U] : selection interval """ @@ -434,7 +470,7 @@ def interval(self, direction_of_interest, Y, self.covariance, Y, direction_of_interest, - alpha=alpha, + level=1. - alpha, UMAU=UMAU) def covariance_factors(self, force=True): @@ -519,6 +555,10 @@ def solve(self, direction): sqrt_inv = self.covariance_factors()[1] return sqrt_inv.T.dot(sqrt_inv.dot(direction)) + + + + def stack(*cons): """ Combine constraints into a large constaint @@ -645,11 +685,12 @@ def interval_constraints(support_directions, def selection_interval(support_directions, support_offsets, - covariance, + covariance_noisy, + covariance_full, noisy_observation, observation, direction_of_interest, - percent_smoothing, + # percent_smoothing, ## sqrt(alpha) tol = 1.e-4, level = 0.90, UMAU=True): @@ -696,18 +737,29 @@ def selection_interval(support_directions, """ (lower_bound, - noisy_estimate, + _, upper_bound, - sigma) = interval_constraints(support_directions, - support_offsets, - covariance, - noisy_observation, - direction_of_interest, - tol=tol) - + _) = interval_constraints(support_directions, + support_offsets, + covariance_noisy, + noisy_observation, + direction_of_interest, + tol=tol) + ## lars path lockhart tibs^2 taylor paper + sigma = np.sqrt(direction_of_interest.T @ covariance_full @ direction_of_interest) + ## sqrt(alpha) is just sigma_noisy - sigma_full + smoothing_sigma = np.sqrt(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2) grid = np.linspace(lower_bound - 4 * sigma, upper_bound + 4 * sigma, 801) - weight = normal_dbn.cdf((upper_bound - grid) / sigma) - normal_dbn.cdf((lower_bound - grid) / sigma) + weight = ( + normal_dbn.cdf( + (upper_bound - grid) / (smoothing_sigma) + ) - normal_dbn.cdf( + (lower_bound - grid) / (smoothing_sigma) + ) + ) + weight *= normal_dbn.pdf(grid / sigma) estimate = (direction_of_interest * observation).sum() + # assert(0==1) return discrete_family(grid, weight).equal_tailed_interval(estimate, alpha=1-level) @@ -781,7 +833,7 @@ def __init__(self, sufficient_stat, weights, theta=0.): The weights are normalized to sum to 1. """ - xw = np.array(sorted(zip(sufficient_stat, weights)), np.float) + xw = np.array(sorted(zip(sufficient_stat, weights)), float) self._x = xw[:,0] self._w = xw[:,1] self._lw = np.log(xw[:,1]) From d67a1e7f0a4c0744354ddba602db5ecc446a7d3c Mon Sep 17 00:00:00 2001 From: kevinbfry <15952367+kevinbfry@users.noreply.github.com> Date: Tue, 31 Oct 2023 14:56:35 -0700 Subject: [PATCH 09/44] lasso_inference intervals look plausible --- glmnet/inference.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 28a2218..16be87d 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -90,12 +90,12 @@ def lasso_inference(glmnet_obj, unreg_LM = glmnet_obj.get_LM() unreg_LM.summarize = True unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) + print(unreg_LM.summary_) C_full = unreg_LM.covariance_ unreg_sel_LM = glmnet_obj.get_LM() unreg_sel_LM.summarize = True unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) - print(unreg_sel_LM.summary_) C_sel = unreg_sel_LM.covariance_ # selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) @@ -109,12 +109,14 @@ def lasso_inference(glmnet_obj, ## iterate over coordinates Ls = np.zeros_like(noisy_mle) Us = np.zeros_like(noisy_mle) + mles = np.zeros_like(noisy_mle) + pvals = np.zeros_like(noisy_mle) for i in range(len(noisy_mle)): e_i = np.zeros_like(noisy_mle) e_i[i] = 1. ## call selection_interval and return # print(selection_proportion) - L, U = selection_interval( + L, U, mle, p = selection_interval( support_directions=con.linear_part, support_offsets=con.offset, covariance_noisy=C_sel, @@ -128,7 +130,9 @@ def lasso_inference(glmnet_obj, ) Ls[i] = L Us[i] = U - return (Ls, Us) + mles[i] = mle + pvals[i] = p + return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}) class constraints(object): @@ -760,8 +764,17 @@ def selection_interval(support_directions, weight *= normal_dbn.pdf(grid / sigma) estimate = (direction_of_interest * observation).sum() # assert(0==1) - return discrete_family(grid, weight).equal_tailed_interval(estimate, - alpha=1-level) + sel_distr = discrete_family(grid, weight) + L, U = sel_distr.equal_tailed_interval(estimate, + alpha=1-level) + mle, _, _ = sel_distr.MLE(estimate) + mle *= sigma**2 + pval = sel_distr.cdf(0, estimate) + pval = 2 * min(pval, 1-pval) + L *= sigma**2 + U *= sigma**2 + + return L, U, mle, pval def find_root(f, y, lb, ub, tol=1e-6): """ From bd04add0d0c9187437fa75ad6adb8e1014951dbb Mon Sep 17 00:00:00 2001 From: kevinbfry <15952367+kevinbfry@users.noreply.github.com> Date: Tue, 31 Oct 2023 15:11:24 -0700 Subject: [PATCH 10/44] dispersion=None signature in GLMBase.fit() --- glmnet/glm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/glmnet/glm.py b/glmnet/glm.py index 620e629..3878ba5 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -407,7 +407,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 4 options non sklearn API - dispersion=1, + dispersion=None, check=True, fit_null=True): @@ -521,12 +521,14 @@ def obj_function(response, self._set_coef_intercept(state) - if (hasattr(self._family, "base") and + if dispersion is not None: + self.dispersion_ = dispersion + elif (hasattr(self._family, "base") and isinstance(self._family.base, sm_family.Gaussian)): # GLM specific # usual estimate of sigma^2 self.dispersion_ = self.deviance_ / (nobs-nvar-self.fit_intercept) else: - self.dispersion_ = dispersion + self.dispersion_ = 1 return self fit.__doc__ = ''' From edfd5e169e2cdedbd41ca3a221033534c47018fb Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 14 Nov 2023 12:54:21 -0800 Subject: [PATCH 11/44] BF: delta not quite right, fix grid in case of infinite limits --- glmnet/inference.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 16be87d..543c817 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -55,30 +55,30 @@ def lasso_inference(glmnet_obj, else: penfac = np.ones(active_set.shape[0]) signs = np.sign(FL.coef_[active_set]) + if FL.fit_intercept: penfac = np.hstack([0, penfac]) signs = np.hstack([0, signs]) keep[0] = 1 keep[1 + active_set] = 1 - print("fitting intercept") - # C_E_active = C_E[keep] Q_E = Q_E[keep] stacked = np.hstack([state.intercept, state.coef[active_set]]) + delta = np.zeros(Q_E.shape[0]) + delta[1:] = lambda_val * penfac[1:] * signs[1:] else: keep[active_set] = 1 # C_E_active = C_E[keep] Q_E = Q_E[keep][:,1:] stacked = state.coef[active_set] + delta = lambda_val * penfac * signs C_E = np.linalg.inv(Q_E) - delta = np.zeros(C_E.shape[0]) - delta[1:] = lambda_val * penfac[1:] * signs[1:] delta = C_E @ delta - noisy_mle = stacked + C_E @ delta + noisy_mle = stacked + delta penalized = penfac > 0 - sel_P = -signs[penalized][:,None] * np.eye(C_E.shape[0])[penalized] + sel_P = -np.diag(signs[penalized]) @ np.eye(C_E.shape[0])[penalized] con = constraints(sel_P, -signs[penalized] * delta[penalized], covariance=C_E) # adjust @@ -90,7 +90,6 @@ def lasso_inference(glmnet_obj, unreg_LM = glmnet_obj.get_LM() unreg_LM.summarize = True unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) - print(unreg_LM.summary_) C_full = unreg_LM.covariance_ unreg_sel_LM = glmnet_obj.get_LM() @@ -101,8 +100,8 @@ def lasso_inference(glmnet_obj, ## TODO: will this handle fit_intercept? if FL.fit_intercept: - stacked = np.hstack([unreg_LM.intercept_, - unreg_LM.coef_]) + full_mle = np.hstack([unreg_LM.intercept_, + unreg_LM.coef_]) else: full_mle = unreg_LM.coef_ @@ -115,7 +114,6 @@ def lasso_inference(glmnet_obj, e_i = np.zeros_like(noisy_mle) e_i[i] = 1. ## call selection_interval and return - # print(selection_proportion) L, U, mle, p = selection_interval( support_directions=con.linear_part, support_offsets=con.offset, @@ -658,7 +656,7 @@ def interval_constraints(support_directions, direction_of_interest) U = A.dot(X) - b - if not np.all(U < tol * np.fabs(U).max()) and WARNINGS: + if not np.all(U < tol * np.fabs(U).max()): warn('constraints not satisfied: %s' % repr(U)) Sw = S.dot(w) @@ -753,7 +751,8 @@ def selection_interval(support_directions, sigma = np.sqrt(direction_of_interest.T @ covariance_full @ direction_of_interest) ## sqrt(alpha) is just sigma_noisy - sigma_full smoothing_sigma = np.sqrt(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2) - grid = np.linspace(lower_bound - 4 * sigma, upper_bound + 4 * sigma, 801) + estimate = (direction_of_interest * observation).sum() + grid = np.linspace(estimate - 10 * sigma, estimate + 10 * sigma, 801) weight = ( normal_dbn.cdf( (upper_bound - grid) / (smoothing_sigma) @@ -762,7 +761,7 @@ def selection_interval(support_directions, ) ) weight *= normal_dbn.pdf(grid / sigma) - estimate = (direction_of_interest * observation).sum() + # assert(0==1) sel_distr = discrete_family(grid, weight) L, U = sel_distr.equal_tailed_interval(estimate, From 85c0083e01b17938b52693a15987f1f7f1390c53 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 14 Nov 2023 13:30:14 -0800 Subject: [PATCH 12/44] BF: weight sum factor --- glmnet/inference.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 543c817..0922035 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -35,6 +35,9 @@ def lasso_inference(glmnet_obj, fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) X_sel, Y_sel, weight_sel = selection_data + if weight_sel is None: + weight_sel = np.ones(X_sel.shape[0]) + fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) FL = fixed_lambda # shorthand @@ -56,6 +59,8 @@ def lasso_inference(glmnet_obj, penfac = np.ones(active_set.shape[0]) signs = np.sign(FL.coef_[active_set]) + # we multiply by weight_sel.sum() due to this factor + # appearing in glmnet objective... if FL.fit_intercept: penfac = np.hstack([0, penfac]) signs = np.hstack([0, signs]) @@ -65,13 +70,13 @@ def lasso_inference(glmnet_obj, stacked = np.hstack([state.intercept, state.coef[active_set]]) delta = np.zeros(Q_E.shape[0]) - delta[1:] = lambda_val * penfac[1:] * signs[1:] + delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] else: keep[active_set] = 1 # C_E_active = C_E[keep] Q_E = Q_E[keep][:,1:] stacked = state.coef[active_set] - delta = lambda_val * penfac * signs + delta = weight_sel.sum() * lambda_val * penfac * signs C_E = np.linalg.inv(Q_E) delta = C_E @ delta @@ -91,7 +96,6 @@ def lasso_inference(glmnet_obj, unreg_LM.summarize = True unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) C_full = unreg_LM.covariance_ - unreg_sel_LM = glmnet_obj.get_LM() unreg_sel_LM.summarize = True unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) @@ -743,16 +747,16 @@ def selection_interval(support_directions, upper_bound, _) = interval_constraints(support_directions, support_offsets, - covariance_noisy, + covariance_full, noisy_observation, direction_of_interest, tol=tol) ## lars path lockhart tibs^2 taylor paper sigma = np.sqrt(direction_of_interest.T @ covariance_full @ direction_of_interest) ## sqrt(alpha) is just sigma_noisy - sigma_full - smoothing_sigma = np.sqrt(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2) + smoothing_sigma = np.sqrt(max(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2, 1e-12 * sigma**2)) estimate = (direction_of_interest * observation).sum() - grid = np.linspace(estimate - 10 * sigma, estimate + 10 * sigma, 801) + grid = np.linspace(estimate - 20 * sigma, estimate + 20 * sigma, 801) weight = ( normal_dbn.cdf( (upper_bound - grid) / (smoothing_sigma) From 519ba93b5ced8bd1efe4ace4681e38584cddb337 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 14 Nov 2023 14:51:38 -0800 Subject: [PATCH 13/44] cleanup of Q_E, affine_con --- glmnet/inference.py | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 0922035..f8c916a 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -35,6 +35,7 @@ def lasso_inference(glmnet_obj, fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) X_sel, Y_sel, weight_sel = selection_data + if weight_sel is None: weight_sel = np.ones(X_sel.shape[0]) @@ -46,13 +47,19 @@ def lasso_inference(glmnet_obj, raise NotImplementedError('upper/lower limits coming soon') active_set = np.nonzero(FL.coef_ != 0)[0] + + # find selection data covariance + + unreg_sel_LM = glmnet_obj.get_LM() + unreg_sel_LM.summarize = True + unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) + C_sel = unreg_sel_LM.covariance_ + state = FL.state_ information = FL._family.information(state, weight_sel) - Q_E = FL.design_.quadratic_form(information, - columns=active_set) - keep = np.zeros(Q_E.shape[0], bool) + keep = np.zeros(FL.design_.shape[1], bool) if hasattr(FL, 'penalty_factors'): penfac = FL.penalty_factors[active_set] else: @@ -66,27 +73,20 @@ def lasso_inference(glmnet_obj, signs = np.hstack([0, signs]) keep[0] = 1 keep[1 + active_set] = 1 - Q_E = Q_E[keep] stacked = np.hstack([state.intercept, state.coef[active_set]]) - delta = np.zeros(Q_E.shape[0]) + delta = np.zeros(keep.sum()) delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] else: keep[active_set] = 1 - # C_E_active = C_E[keep] - Q_E = Q_E[keep][:,1:] stacked = state.coef[active_set] delta = weight_sel.sum() * lambda_val * penfac * signs - C_E = np.linalg.inv(Q_E) - delta = C_E @ delta + delta = C_sel @ delta noisy_mle = stacked + delta penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(C_E.shape[0])[penalized] - con = constraints(sel_P, - -signs[penalized] * delta[penalized], - covariance=C_E) # adjust + sel_P = -np.diag(signs[penalized]) @ np.eye(C_sel.shape[0])[penalized] # fit full model @@ -96,10 +96,7 @@ def lasso_inference(glmnet_obj, unreg_LM.summarize = True unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) C_full = unreg_LM.covariance_ - unreg_sel_LM = glmnet_obj.get_LM() - unreg_sel_LM.summarize = True - unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) - C_sel = unreg_sel_LM.covariance_ + # selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) ## TODO: will this handle fit_intercept? @@ -119,16 +116,14 @@ def lasso_inference(glmnet_obj, e_i[i] = 1. ## call selection_interval and return L, U, mle, p = selection_interval( - support_directions=con.linear_part, - support_offsets=con.offset, + support_directions=sel_P, + support_offsets=-signs[penalized] * delta[penalized], covariance_noisy=C_sel, covariance_full=C_full, noisy_observation=noisy_mle, observation=full_mle, direction_of_interest=e_i, - # tol=, level=level, - # UMAU=, ) Ls[i] = L Us[i] = U From 35a2ba3ec79a2f4e222ea90101d66d967599dd23 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 15 Nov 2023 09:30:13 -0800 Subject: [PATCH 14/44] adding Lee pvalues, dependency on mpmath as well --- glmnet/inference.py | 119 ++++++++++++++++++++++++++++++++++++-------- pyproject.toml | 1 + 2 files changed, 99 insertions(+), 21 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index f8c916a..c2145ca 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -11,6 +11,9 @@ import pandas as pd from scipy.stats import norm as normal_dbn +import mpmath as mp +mp.dps = 80 + from sklearn.utils.validation import check_is_fitted from sklearn.base import clone @@ -691,7 +694,6 @@ def selection_interval(support_directions, noisy_observation, observation, direction_of_interest, - # percent_smoothing, ## sqrt(alpha) tol = 1.e-4, level = 0.90, UMAU=True): @@ -749,31 +751,106 @@ def selection_interval(support_directions, ## lars path lockhart tibs^2 taylor paper sigma = np.sqrt(direction_of_interest.T @ covariance_full @ direction_of_interest) ## sqrt(alpha) is just sigma_noisy - sigma_full - smoothing_sigma = np.sqrt(max(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2, 1e-12 * sigma**2)) + smoothing_sigma = np.sqrt(max(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2, 0)) estimate = (direction_of_interest * observation).sum() - grid = np.linspace(estimate - 20 * sigma, estimate + 20 * sigma, 801) - weight = ( - normal_dbn.cdf( - (upper_bound - grid) / (smoothing_sigma) - ) - normal_dbn.cdf( - (lower_bound - grid) / (smoothing_sigma) + + if smoothing_sigma > 1e-6 * sigma: + + grid = np.linspace(estimate - 20 * sigma, estimate + 20 * sigma, 801) + weight = ( + normal_dbn.cdf( + (upper_bound - grid) / (smoothing_sigma) + ) - normal_dbn.cdf( + (lower_bound - grid) / (smoothing_sigma) + ) ) - ) - weight *= normal_dbn.pdf(grid / sigma) - - # assert(0==1) - sel_distr = discrete_family(grid, weight) - L, U = sel_distr.equal_tailed_interval(estimate, - alpha=1-level) - mle, _, _ = sel_distr.MLE(estimate) - mle *= sigma**2 - pval = sel_distr.cdf(0, estimate) - pval = 2 * min(pval, 1-pval) - L *= sigma**2 - U *= sigma**2 + weight *= normal_dbn.pdf(grid / sigma) + + # assert(0==1) + sel_distr = discrete_family(grid, weight) + L, U = sel_distr.equal_tailed_interval(estimate, + alpha=1-level) + mle, _, _ = sel_distr.MLE(estimate) + mle *= sigma**2 + pval = sel_distr.cdf(0, estimate) + pval = 2 * min(pval, 1-pval) + L *= sigma**2 + U *= sigma**2 + else: + + lb = estimate - 20 * sigma + ub = estimate + 20 * sigma + + if estimate < lower_bound or estimate > upper_bound: + warn('Constraints not satisfied: returning [-np.inf, np.inf]') + return -np.inf, np.inf, np.nan, np.nan + + def F(theta): + + Z = (estimate - theta) / sigma + Z_L = (lower_bound - theta) / sigma + Z_U = (upper_bound - theta) / sigma + num = norm_interval(Z_L, Z) + den = norm_interval(Z_L, Z_U) + if Z_L > 0 and den < 1e-10: + C = np.fabs(Z_L) + D = Z-Z_L + cdf = 1-np.exp(-C*D) + elif Z_U < 0 and den < 1e-10: + C = np.fabs(Z_U) + D = Z_U-Z + if C*D < 0: + raise ValueError + cdf = np.exp(-C*D) + else: + cdf = num / den + return cdf + + + pval = F(0) + pval = 2 * min(pval, 1-pval) + + lb = lower_bound - 20 * sigma + ub = upper_bound + 20 * sigma + + alpha = 0.5 * (1 - level) + L = find_root(F, 1.0 - 0.5 * alpha, lb, ub) + if np.isnan(L): + L = -np.inf + U = find_root(F, 0.5 * alpha, lb, ub) + if np.isnan(U): + U = np.inf + + mle = np.nan + return L, U, mle, pval +def norm_interval(lower, upper): + r""" + A multiprecision evaluation of + + .. math:: + + \Phi(U) - \Phi(L) + + Parameters + ---------- + + lower : float + The lower limit $L$ + + upper : float + The upper limit $U$ + + """ + #cdf = normal_dbn.cdf + cdf = mp.ncdf + if lower > 0 and upper > 0: + return cdf(-lower) - cdf(-upper) + else: + return cdf(upper) - cdf(lower) + def find_root(f, y, lb, ub, tol=1e-6): """ searches for solution to f(x) = y in (lb, ub), where diff --git a/pyproject.toml b/pyproject.toml index b3b5e36..c6ddb3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ requires = ["setuptools>=42", "matplotlib>=3.3.3", "versioneer", "statsmodels", + "mpmath", "coxdev"] build-backend = "setuptools.build_meta" From d821c945739424a06c0b483dffb0b9e81cfec6dd Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 15 Nov 2023 09:31:14 -0800 Subject: [PATCH 15/44] removing debug assertion --- glmnet/inference.py | 1 - 1 file changed, 1 deletion(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index c2145ca..433f702 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -766,7 +766,6 @@ def selection_interval(support_directions, ) weight *= normal_dbn.pdf(grid / sigma) - # assert(0==1) sel_distr = discrete_family(grid, weight) L, U = sel_distr.equal_tailed_interval(estimate, alpha=1-level) From d75094a1cafa7493040e816163b7fb1f16942c38 Mon Sep 17 00:00:00 2001 From: kevinbfry <15952367+kevinbfry@users.noreply.github.com> Date: Wed, 15 Nov 2023 13:24:44 -0800 Subject: [PATCH 16/44] added bootstrap inf --- glmnet/inference.py | 110 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/glmnet/inference.py b/glmnet/inference.py index 433f702..2451a1b 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -134,6 +134,116 @@ def lasso_inference(glmnet_obj, pvals[i] = p return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}) +def lasso_bootstrap_inference(glmnet_obj, + lambda_val, + selection_data, ## X = \Sigma^{-1/2}, Y = \Sigma^{-1/2}\hat\beta + full_data, + # C_full, ## bootstrap variance + alpha, + level=.9): + + fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) + X_sel, Y_sel, weight_sel = selection_data + + if weight_sel is None: + weight_sel = np.ones(X_sel.shape[0]) + + fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) + FL = fixed_lambda # shorthand + + if (not np.all(FL.upper_limits == np.inf) or + not np.all(FL.lower_limits == -np.inf)): + raise NotImplementedError('upper/lower limits coming soon') + + active_set = np.nonzero(FL.coef_ != 0)[0] + + # find selection data covariance + + # unreg_sel_LM = glmnet_obj.get_LM() + # unreg_sel_LM.summarize = True + # unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) + # C_sel = unreg_sel_LM.covariance_ + C_full = np.eye(len(active_set)) + C_sel = (1 + alpha) * C_full + + state = FL.state_ + information = FL._family.information(state, + weight_sel) + + keep = np.zeros(FL.design_.shape[1], bool) + if hasattr(FL, 'penalty_factors'): + penfac = FL.penalty_factors[active_set] + else: + penfac = np.ones(active_set.shape[0]) + signs = np.sign(FL.coef_[active_set]) + + # we multiply by weight_sel.sum() due to this factor + # appearing in glmnet objective... + if FL.fit_intercept: + penfac = np.hstack([0, penfac]) + signs = np.hstack([0, signs]) + keep[0] = 1 + keep[1 + active_set] = 1 + stacked = np.hstack([state.intercept, + state.coef[active_set]]) + delta = np.zeros(keep.sum()) + delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] + else: + keep[active_set] = 1 + stacked = state.coef[active_set] + delta = weight_sel.sum() * lambda_val * penfac * signs + + delta = C_sel @ delta + noisy_mle = stacked + delta + + penalized = penfac > 0 + sel_P = -np.diag(signs[penalized]) @ np.eye(C_sel.shape[0])[penalized] + + # fit full model + + X_full, Y_full, weight_full = full_data + + unreg_LM = glmnet_obj.get_LM() + unreg_LM.summarize = True + unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) + # C_full = unreg_LM.covariance_ + + # selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) + + ## TODO: will this handle fit_intercept? + if FL.fit_intercept: + full_mle = np.hstack([unreg_LM.intercept_, + unreg_LM.coef_]) + else: + full_mle = unreg_LM.coef_ + + ## iterate over coordinates + Ls = np.zeros_like(noisy_mle) + Us = np.zeros_like(noisy_mle) + mles = np.zeros_like(noisy_mle) + pvals = np.zeros_like(noisy_mle) + for i in range(len(noisy_mle)): + e_i = np.zeros_like(noisy_mle) + e_i[i] = 1. + ## call selection_interval and return + L, U, mle, p = selection_interval( + support_directions=sel_P, + support_offsets=-signs[penalized] * delta[penalized], + covariance_noisy=C_sel, + covariance_full=C_full, + noisy_observation=noisy_mle, + observation=full_mle, + direction_of_interest=e_i, + level=level, + ) + Ls[i] = L + Us[i] = U + mles[i] = mle + pvals[i] = p + + return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}) + + class constraints(object): r""" From 74ae5f4b940e0f841e6692de9cb54ec3108894ff Mon Sep 17 00:00:00 2001 From: kevinbfry <15952367+kevinbfry@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:14:06 -0800 Subject: [PATCH 17/44] fixing bootstrap inf --- glmnet/inference.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 2451a1b..ac9a2fb 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd from scipy.stats import norm as normal_dbn +import statsmodels.api as sm import mpmath as mp mp.dps = 80 @@ -81,7 +82,8 @@ def lasso_inference(glmnet_obj, delta = np.zeros(keep.sum()) delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] else: - keep[active_set] = 1 + # keep[active_set] = 1 + keep[1 + active_set] = 1 stacked = state.coef[active_set] delta = weight_sel.sum() * lambda_val * penfac * signs @@ -132,17 +134,24 @@ def lasso_inference(glmnet_obj, Us[i] = U mles[i] = mle pvals[i] = p - return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}) + + idx = (active_set).tolist() + if FL.fit_intercept: + idx = [-1] + idx + return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) def lasso_bootstrap_inference(glmnet_obj, lambda_val, - selection_data, ## X = \Sigma^{-1/2}, Y = \Sigma^{-1/2}\hat\beta + selection_data, ## X = \Sigma^{-1/2}, Y = \Sigma^{-1/2}\hat\beta^{(b)} full_data, - # C_full, ## bootstrap variance + C_full, ## bootstrap variance alpha, level=.9): fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) + fixed_lambda.fit_intercept = False + gaussian = sm.families.Gaussian() + fixed_lambda.family = gaussian X_sel, Y_sel, weight_sel = selection_data if weight_sel is None: @@ -163,8 +172,7 @@ def lasso_bootstrap_inference(glmnet_obj, # unreg_sel_LM.summarize = True # unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) # C_sel = unreg_sel_LM.covariance_ - C_full = np.eye(len(active_set)) - C_sel = (1 + alpha) * C_full + # C_full = np.eye(len(active_set)) state = FL.state_ information = FL._family.information(state, @@ -189,10 +197,14 @@ def lasso_bootstrap_inference(glmnet_obj, delta = np.zeros(keep.sum()) delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] else: - keep[active_set] = 1 + keep[1 + active_set] = 1 stacked = state.coef[active_set] delta = weight_sel.sum() * lambda_val * penfac * signs + C_full_ = C_full.copy() + C_full = C_full[keep[1:]][:,keep[1:]] + C_sel = (1 + alpha) * C_full + delta = C_sel @ delta noisy_mle = stacked + delta @@ -225,6 +237,7 @@ def lasso_bootstrap_inference(glmnet_obj, for i in range(len(noisy_mle)): e_i = np.zeros_like(noisy_mle) e_i[i] = 1. + # e_i = np.linalg.pinv(X_sel) @ e_i ## call selection_interval and return L, U, mle, p = selection_interval( support_directions=sel_P, @@ -240,9 +253,11 @@ def lasso_bootstrap_inference(glmnet_obj, Us[i] = U mles[i] = mle pvals[i] = p - - return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}) + idx = (active_set).tolist() + if FL.fit_intercept: + idx = [-1] + idx + return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) class constraints(object): From 25b0a449f916858c8dfce587babc2047ea8092ca Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 10 Jan 2024 16:50:42 -0800 Subject: [PATCH 18/44] WIP: selectinf, trying to match onestep and OLS estimator --- glmnet/glm.py | 15 ++++++-- glmnet/glmnet.py | 7 +++- glmnet/inference.py | 77 ++++++++++++++++++++++++++++----------- glmnet/regularized_glm.py | 11 +++++- 4 files changed, 82 insertions(+), 28 deletions(-) diff --git a/glmnet/glm.py b/glmnet/glm.py index 30a4be5..a658700 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -109,6 +109,7 @@ def null_fit(self, self, offset, None) + else: state = GLMState(np.zeros(1), 0) state.link_parameter = offset @@ -160,7 +161,7 @@ def get_response_and_weights(self, return pseudo_response, newton_weights - def default_scorers(self): + def _default_scorers(self): fam_name = self.base.__class__.__name__ @@ -383,7 +384,7 @@ class GLMBase(BaseEstimator, GLMBaseSpec): def _get_regularizer(self, - X): + nvars=None): return GLMRegularizer(fit_intercept=self.fit_intercept) # no standardization for GLM @@ -415,6 +416,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True, fit_null=True): @@ -454,9 +456,12 @@ def fit(self, # the regularizer stores the warm start if regularizer is None: - regularizer = self._get_regularizer(X) + regularizer = self._get_regularizer(nvars=design.X.shape[0]) self.regularizer_ = regularizer + if warm_state is not None: + self.regularizer_.warm_state = warm_state + if fit_null or not hasattr(self.regularizer_, 'warm_state'): (null_state, self.null_deviance_) = self._family.get_null_deviance( @@ -642,6 +647,7 @@ def fit(self, y, sample_weight=None, regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True): @@ -649,6 +655,7 @@ def fit(self, y, sample_weight=sample_weight, regularizer=regularizer, + warm_state=warm_state, dispersion=dispersion, check=check) @@ -736,6 +743,7 @@ def fit(self, y, sample_weight=None, regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True): @@ -750,6 +758,7 @@ def fit(self, y, sample_weight=sample_weight, regularizer=regularizer, # last 4 options non sklearn API + warm_state=warm_state, dispersion=dispersion, check=check) diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index 0ed19aa..c640df6 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -89,6 +89,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 3 options non sklearn API + warm_state=None, interpolation_grid=None): if not hasattr(self, "_family"): @@ -124,7 +125,11 @@ def fit(self, exclude=self.exclude ) - self.reg_glm_est_.fit(X, y, None, fit_null=False) + self.reg_glm_est_.fit(X, + y, + None, + fit_null=False, + warm_state=warm_state) regularizer_ = self.reg_glm_est_.regularizer_ state, keep_ = self._get_initial_state(X, diff --git a/glmnet/inference.py b/glmnet/inference.py index ac9a2fb..182d43b 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -21,15 +21,26 @@ def fixed_lambda_estimator(glmnet_obj, lambda_val): check_is_fitted(glmnet_obj, ["coefs_", "feature_names_in_"]) - estimator = clone(glmnet_obj.reg_glm_est_) - estimator.lambda_val = lambda_val + estimator = glmnet_obj.regularized_estimator( + lambda_val=lambda_val, + family=glmnet_obj.family, + alpha=glmnet_obj.alpha, + penalty_factor=glmnet_obj.penalty_factor, + lower_limits=glmnet_obj.lower_limits, + upper_limits=glmnet_obj.upper_limits, + fit_intercept=glmnet_obj.fit_intercept, + standardize=glmnet_obj.standardize, + control=glmnet_obj.control, + offset_id=glmnet_obj.offset_id, + weight_id=glmnet_obj.weight_id, + response_id=glmnet_obj.response_id, + exclude=glmnet_obj.exclude + ) + coefs, intercepts = glmnet_obj.interpolate_coefs([lambda_val]) cls = glmnet_obj.state_.__class__ state = cls(coefs[0], intercepts[0]) - estimator.regularizer_ = glmnet_obj.reg_glm_est_.regularizer_ - estimator.regularizer_.warm_state = state - - return estimator + return estimator, state def lasso_inference(glmnet_obj, lambda_val, @@ -37,17 +48,21 @@ def lasso_inference(glmnet_obj, full_data, level=.9): - fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) + fixed_lambda, warm_state = fixed_lambda_estimator(glmnet_obj, lambda_val) X_sel, Y_sel, weight_sel = selection_data if weight_sel is None: weight_sel = np.ones(X_sel.shape[0]) - fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) + fixed_lambda.fit(X_sel, + Y_sel, + sample_weight=weight_sel, + warm_state=warm_state) + FL = fixed_lambda # shorthand - if (not np.all(FL.upper_limits == np.inf) or - not np.all(FL.lower_limits == -np.inf)): + if (not np.all(FL.upper_limits >= FL.control.big) or + not np.all(FL.lower_limits <= -FL.control.big)): raise NotImplementedError('upper/lower limits coming soon') active_set = np.nonzero(FL.coef_ != 0)[0] @@ -59,10 +74,6 @@ def lasso_inference(glmnet_obj, unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) C_sel = unreg_sel_LM.covariance_ - state = FL.state_ - information = FL._family.information(state, - weight_sel) - keep = np.zeros(FL.design_.shape[1], bool) if hasattr(FL, 'penalty_factors'): penfac = FL.penalty_factors[active_set] @@ -77,24 +88,33 @@ def lasso_inference(glmnet_obj, signs = np.hstack([0, signs]) keep[0] = 1 keep[1 + active_set] = 1 - stacked = np.hstack([state.intercept, - state.coef[active_set]]) - delta = np.zeros(keep.sum()) - delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] + stacked = np.hstack([FL.intercept_, + FL.coef_[active_set]]) + delta = weight_sel.sum() * lambda_val * penfac * signs + print(penfac) else: # keep[active_set] = 1 keep[1 + active_set] = 1 - stacked = state.coef[active_set] + stacked = FL.coef_[active_set] delta = weight_sel.sum() * lambda_val * penfac * signs - delta = C_sel @ delta + print(active_set, 'huh') + + D = np.column_stack([np.ones(X_sel.shape[0]), X_sel[:,active_set]]) + other_mle = np.linalg.pinv(D) @ Y_sel + A_sel = C_sel / unreg_sel_LM.dispersion_ + print(np.linalg.norm(A_sel - np.linalg.inv(unreg_sel_LM.design_.quadratic_form(unreg_sel_LM._information))), 'normie') + delta = A_sel @ delta noisy_mle = stacked + delta penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(C_sel.shape[0])[penalized] + sel_P = -np.diag(signs[penalized]) @ np.eye(A_sel.shape[0])[penalized] # fit full model + subgrad1 = X_sel[:,active_set].T @ (Y_sel - FL.design_ @ np.hstack([FL.intercept_, FL.coef_])) + print(subgrad1, 'subgrad1') + X_full, Y_full, weight_full = full_data unreg_LM = glmnet_obj.get_LM() @@ -111,6 +131,12 @@ def lasso_inference(glmnet_obj, else: full_mle = unreg_LM.coef_ + print(other_mle, 'other') + print(noisy_mle, 'noisy') + print(full_mle, 'full') + + stop + ## iterate over coordinates Ls = np.zeros_like(noisy_mle) Us = np.zeros_like(noisy_mle) @@ -783,6 +809,8 @@ def interval_constraints(support_directions, direction_of_interest) U = A.dot(X) - b + print(U, np.fabs(U).max()* tol, b, '?') + if not np.all(U < tol * np.fabs(U).max()): warn('constraints not satisfied: %s' % repr(U)) @@ -810,6 +838,7 @@ def interval_constraints(support_directions, else: lower_bound = -np.inf + print(lower_bound, V, upper_bound, sigma) return lower_bound, V, upper_bound, sigma def selection_interval(support_directions, @@ -865,7 +894,7 @@ def selection_interval(support_directions, """ (lower_bound, - _, + V, upper_bound, _) = interval_constraints(support_directions, support_offsets, @@ -878,7 +907,10 @@ def selection_interval(support_directions, ## sqrt(alpha) is just sigma_noisy - sigma_full smoothing_sigma = np.sqrt(max(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2, 0)) estimate = (direction_of_interest * observation).sum() + noisy_estimate = (direction_of_interest * noisy_observation).sum() + print(V, estimate, noisy_estimate, 'huh') + if smoothing_sigma > 1e-6 * sigma: grid = np.linspace(estimate - 20 * sigma, estimate + 20 * sigma, 801) @@ -906,6 +938,7 @@ def selection_interval(support_directions, ub = estimate + 20 * sigma if estimate < lower_bound or estimate > upper_bound: + print(estimate, lower_bound, upper_bound) warn('Constraints not satisfied: returning [-np.inf, np.inf]') return -np.inf, np.inf, np.nan, np.nan diff --git a/glmnet/regularized_glm.py b/glmnet/regularized_glm.py index f21bcb6..3cda87a 100644 --- a/glmnet/regularized_glm.py +++ b/glmnet/regularized_glm.py @@ -149,17 +149,21 @@ def get_LM(self): response_id=self.response_id) def _get_regularizer(self, - X): + nvars=None): # self.design_ will have been set by now + if nvars is None: + check_is_fitted(self, ["design_"]) + nvars = self.design_.X.shape[1] + return ElNetRegularizer(lambda_val=self.lambda_val, alpha=self.alpha, penalty_factor=self.penalty_factor, lower_limits=self.lower_limits * self.design_.scaling_, upper_limits=self.upper_limits * self.design_.scaling_, fit_intercept=self.fit_intercept, - nvars=X.shape[1], + nvars=nvars, control=self.control, exclude=self.exclude) @@ -177,6 +181,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 3 options non sklearn API + warm_state=None, check=True, fit_null=True): @@ -184,6 +189,7 @@ def fit(self, y, sample_weight=sample_weight, regularizer=regularizer, + warm_state=warm_state, dispersion=1, check=check) @@ -220,6 +226,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True): From de7db402ca3f6568b1aa3745564dd4da21fb89a7 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 12:22:59 -0800 Subject: [PATCH 19/44] some pyproject.toml work --- .gitattributes | 1 + glmnet/_version.py | 127 +++++++++++++++++++++++++++++---------------- pyproject.toml | 8 +++ setup.py | 5 +- 4 files changed, 95 insertions(+), 46 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..8edfacf --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +glmnet/_version.py export-subst diff --git a/glmnet/_version.py b/glmnet/_version.py index 9b01ea2..3f737f4 100644 --- a/glmnet/_version.py +++ b/glmnet/_version.py @@ -5,8 +5,9 @@ # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer) +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" @@ -15,10 +16,11 @@ import re import subprocess import sys -from typing import Callable, Dict +from typing import Any, Callable, Dict, List, Optional, Tuple +import functools -def get_keywords(): +def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must @@ -34,17 +36,24 @@ def get_keywords(): class VersioneerConfig: """Container for Versioneer configuration parameters.""" + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool -def get_config(): + +def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" - cfg.tag_prefix = "" - cfg.parentdir_prefix = "ISLP-" - cfg.versionfile_source = "ISLP/_version.py" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "glmnet-" + cfg.versionfile_source = "glmnet/_version.py" cfg.verbose = False return cfg @@ -57,9 +66,9 @@ class NotThisMethod(Exception): HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} @@ -68,11 +77,25 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + for command in commands: try: dispcmd = str([command] + args) @@ -80,10 +103,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr - else None)) + else None), **popen_kwargs) break - except OSError: - e = sys.exc_info()[1] + except OSError as e: if e.errno == errno.ENOENT: continue if verbose: @@ -103,7 +125,11 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, process.returncode -def versions_from_parentdir(parentdir_prefix, root, verbose): +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both @@ -128,13 +154,13 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): @register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. - keywords = {} + keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: @@ -156,7 +182,11 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") @@ -220,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): +def git_pieces_from_vcs( + tag_prefix: str, + root: str, + verbose: bool, + runner: Callable = run_command +) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -228,13 +263,18 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): version string, meaning we're inside a checked out source tree. """ GITS = ["git"] - TAG_PREFIX_REGEX = "*" if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - TAG_PREFIX_REGEX = r"\*" + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -242,11 +282,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", - "%s%s" % (tag_prefix, TAG_PREFIX_REGEX)], - cwd=root) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -256,7 +295,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() - pieces = {} + pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None @@ -335,8 +374,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() @@ -348,14 +387,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" -def render_pep440(pieces): +def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you @@ -380,7 +419,7 @@ def render_pep440(pieces): return rendered -def render_pep440_branch(pieces): +def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards @@ -410,7 +449,7 @@ def render_pep440_branch(pieces): return rendered -def pep440_split_post(ver): +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the @@ -420,7 +459,7 @@ def pep440_split_post(ver): return vc[0], int(vc[1] or 0) if len(vc) == 2 else None -def render_pep440_pre(pieces): +def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: @@ -432,7 +471,7 @@ def render_pep440_pre(pieces): tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: - rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"]) + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: @@ -444,7 +483,7 @@ def render_pep440_pre(pieces): return rendered -def render_pep440_post(pieces): +def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards @@ -471,7 +510,7 @@ def render_pep440_post(pieces): return rendered -def render_pep440_post_branch(pieces): +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. @@ -500,7 +539,7 @@ def render_pep440_post_branch(pieces): return rendered -def render_pep440_old(pieces): +def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. @@ -522,7 +561,7 @@ def render_pep440_old(pieces): return rendered -def render_git_describe(pieces): +def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. @@ -542,7 +581,7 @@ def render_git_describe(pieces): return rendered -def render_git_describe_long(pieces): +def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. @@ -562,7 +601,7 @@ def render_git_describe_long(pieces): return rendered -def render(pieces, style): +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", @@ -598,7 +637,7 @@ def render(pieces, style): "date": pieces.get("date")} -def get_versions(): +def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some diff --git a/pyproject.toml b/pyproject.toml index 978dbe5..a6be16e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,3 +42,11 @@ dynamic = ['version'] [project.license] file = 'LICENSE.txt' +[tool.versioneer] +VCS = "git" +style = "pep440" +versionfile_source = "glmnet/_version.py" +versionfile_build = "glmnet/_version.py" +tag_prefix = "v" +parentdir_prefix = "glmnet-" + diff --git a/setup.py b/setup.py index e4ca64a..2f42b2c 100644 --- a/setup.py +++ b/setup.py @@ -43,9 +43,10 @@ 'gaussnet', 'multigaussnet']] +long_description = open('README.md', 'rt', encoding='utf-8').read() + def main(**extra_args): setup(name='glmnet', - url="http://github.org/intro-stat-learning/glmnetpy", version=versioneer.get_version(), packages = ['glmnet', 'glmnet.paths'], @@ -54,7 +55,7 @@ def main(**extra_args): include_package_data=True, data_files=[], scripts=[], - #long_description=long_description, + long_description=long_description, long_description_content_type=long_description_content_type, cmdclass = cmdclass, **extra_args From aade7e61ea6a8aabe1cec8a5b56747ff5c561ddc Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 25 Jan 2024 14:48:33 -0800 Subject: [PATCH 20/44] WIP: inference --- tests/flex/__init__.py | 0 tests/paths/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/flex/__init__.py create mode 100644 tests/paths/__init__.py diff --git a/tests/flex/__init__.py b/tests/flex/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/paths/__init__.py b/tests/paths/__init__.py new file mode 100644 index 0000000..e69de29 From af85169fadd127dad95995fe7f249f879d21d967 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 25 Jan 2024 14:51:35 -0800 Subject: [PATCH 21/44] rename _check to get_data_arrays --- glmnet/cox.py | 21 ++++++++++++--------- glmnet/glm.py | 13 +++++++------ glmnet/glmnet.py | 14 +++++++------- glmnet/paths/fastnet.py | 7 ++++--- glmnet/paths/fishnet.py | 7 +++++-- glmnet/paths/lognet.py | 7 +++++-- glmnet/paths/multiclassnet.py | 6 +++--- glmnet/paths/multigaussnet.py | 11 ++++++++--- 8 files changed, 51 insertions(+), 35 deletions(-) diff --git a/glmnet/cox.py b/glmnet/cox.py index 444572e..a23a459 100644 --- a/glmnet/cox.py +++ b/glmnet/cox.py @@ -190,10 +190,10 @@ def _get_family_spec(self, status_id=self.family.status_id, start_id=self.family.start_id) - def _check(self, - X, - y, - check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -243,7 +243,10 @@ class RegCoxLM(RegGLM): fit_intercept: Literal[False] = False - def _check(self, X, y, check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -268,10 +271,10 @@ class CoxNet(GLMNet): fit_intercept: Literal[False] = False regularized_estimator: BaseEstimator = RegCoxLM - def _check(self, - X, - y, - check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, diff --git a/glmnet/glm.py b/glmnet/glm.py index a658700..3873cbc 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -403,7 +403,7 @@ def _get_family_spec(self, elif isinstance(self.family, GLMFamilySpec): return self.family - def _check(self, X, y, check=True): + def get_data_arrays(self, X, y, check=True): return _get_data(self, X, y, @@ -431,7 +431,7 @@ def fit(self, if not hasattr(self, "_family"): self._family = self._get_family_spec(y) - X, y, response, offset, weight = self._check(X, y, check=check) + X, y, response, offset, weight = self.get_data_arrays(X, y, check=check) sample_weight = weight self.sample_weight_ = normed_sample_weight = sample_weight / sample_weight.sum() @@ -456,7 +456,7 @@ def fit(self, # the regularizer stores the warm start if regularizer is None: - regularizer = self._get_regularizer(nvars=design.X.shape[0]) + regularizer = self._get_regularizer(nvars=design.X.shape[1]) self.regularizer_ = regularizer if warm_state is not None: @@ -542,6 +542,7 @@ def obj_function(response, else: self.dispersion_ = dispersion + self.state_ = state return self fit.__doc__ = ''' Fit a GLM. @@ -659,7 +660,7 @@ def fit(self, dispersion=dispersion, check=check) - weight = self._check(X, y, check=False)[-1] + weight = self.get_data_arrays(X, y, check=False)[-1] if self.summarize: self.covariance_, self.summary_ = self._summarize(self.exclude, @@ -728,9 +729,9 @@ class BinomialGLM(ClassifierMixin, GLM): family: sm_family.Family = field(default_factory=sm_family.Binomial) - def _check(self, X, y, check=True): + def get_data_arrays(self, X, y, check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) encoder = LabelEncoder() labels = np.asfortranarray(encoder.fit_transform(response)) self.classes_ = encoder.classes_ diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index c640df6..7a2ed39 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -66,10 +66,10 @@ class GLMNetSpec(object): class GLMNet(BaseEstimator, GLMNetSpec): - def _check(self, - X, - y, - check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -95,7 +95,7 @@ def fit(self, if not hasattr(self, "_family"): self._family = self._get_family_spec(y) - X, y, response, offset, weight = self._check(X, y) + X, y, response, offset, weight = self.get_data_arrays(X, y) if isinstance(X, pd.DataFrame): self.feature_names_in_ = list(X.columns) @@ -315,7 +315,7 @@ def cross_validation_path(self, # truncate to the size we got predictions = predictions[:,:self.lambda_values_.shape[0]] - response, offset, weight = self._check(X, y, check=False)[2:] + response, offset, weight = self.get_data_arrays(X, y, check=False)[2:] # adjust for offset # because predictions are just X\beta @@ -464,7 +464,7 @@ def _get_initial_state(self, intercept_ = glm.intercept_ else: if self.fit_intercept: - response, offset, weight = self._check(X, y, check=False)[2:] + response, offset, weight = self.get_data_arrays(X, y, check=False)[2:] state0 = self._family.null_fit(response, weight, offset, diff --git a/glmnet/paths/fastnet.py b/glmnet/paths/fastnet.py index e3f8c0f..248336f 100644 --- a/glmnet/paths/fastnet.py +++ b/glmnet/paths/fastnet.py @@ -62,7 +62,7 @@ def fit(self, else: self.feature_names_in_ = ['X{}'.format(i) for i in range(X.shape[1])] - X, y, response, offset, weight = self._check(X, y) + X, y, response, offset, weight = self.get_data_arrays(X, y) if not scipy.sparse.issparse(X): X = np.asfortranarray(X) @@ -129,8 +129,9 @@ def fit(self, self.coefs_ = result['coefs'] self.intercepts_ = result['intercepts'] - self.state_ = GLMState(self.coefs_[-1], - self.intercepts_[-1]) + if self.coefs_.ndim == 1: + self.state_ = GLMState(self.coefs_[-1], + self.intercepts_[-1]) self.lambda_values_ = result['lambda_values'] nfits = self.lambda_values_.shape[0] diff --git a/glmnet/paths/fishnet.py b/glmnet/paths/fishnet.py index e638c6f..da28123 100644 --- a/glmnet/paths/fishnet.py +++ b/glmnet/paths/fishnet.py @@ -28,9 +28,12 @@ class FishNet(FastNetMixin): def __post_init__(self): self._family = GLMFamilySpec(base=sm_family.Poisson()) - def _check(self, X, y, check=True): + def get_data_arrays(self, + X, + y, + check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) if np.any(response < 0): raise ValueError("negative responses encountered; not permitted for Poisson family") response = np.asarray(response, float).copy() diff --git a/glmnet/paths/lognet.py b/glmnet/paths/lognet.py index bdd4dfd..b783ec1 100644 --- a/glmnet/paths/lognet.py +++ b/glmnet/paths/lognet.py @@ -41,9 +41,12 @@ def _extract_fits(self, V = super()._extract_fits(X_shape, response_shape) return V - def _check(self, X, y, check=True): + def get_data_arrays(self, + X, + y, + check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) encoder = LabelEncoder() labels = np.asfortranarray(encoder.fit_transform(response)) self.classes_ = encoder.classes_ diff --git a/glmnet/paths/multiclassnet.py b/glmnet/paths/multiclassnet.py index 676831c..05486e4 100644 --- a/glmnet/paths/multiclassnet.py +++ b/glmnet/paths/multiclassnet.py @@ -26,7 +26,7 @@ @dataclass class MultiClassFamily(object): - def default_scorers(self): + def _default_scorers(self): return [accuracy_scorer, misclass_scorer, @@ -68,9 +68,9 @@ def _offset_predictions(self, value = exp_value / exp_value.sum(-1)[:,:,None] return value - def _check(self, X, y, check=True): + def get_data_arrays(self, X, y, check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) encoder = OneHotEncoder(sparse_output=False) y_onehot = np.asfortranarray(encoder.fit_transform(response.reshape((-1,1)))) self.categories_ = encoder.categories_[0] diff --git a/glmnet/paths/multigaussnet.py b/glmnet/paths/multigaussnet.py index 7f30c38..b2a3943 100644 --- a/glmnet/paths/multigaussnet.py +++ b/glmnet/paths/multigaussnet.py @@ -25,7 +25,7 @@ @dataclass class MultiClassFamily(object): - def default_scorers(self): + def _default_scorers(self): return [mse_scorer, mae_scorer] @dataclass @@ -49,8 +49,13 @@ def _extract_fits(self, self._fit['dev'] = self._fit['rsq'] return super()._extract_fits(X_shape, response_shape) - def _check(self, X, y, check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + def get_data_arrays(self, + X, + y, + check=True): + X, y, response, offset, weight = super().get_data_arrays(X, + y, + check=check) return X, y, np.asfortranarray(response), offset, weight def _wrapper_args(self, From 302e8b2ddd210aa5139f6a65c7badefdcd42e1a8 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 25 Jan 2024 17:53:09 -0800 Subject: [PATCH 22/44] WIP: inference --- glmnet/inference.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 182d43b..c42b33b 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -50,7 +50,10 @@ def lasso_inference(glmnet_obj, fixed_lambda, warm_state = fixed_lambda_estimator(glmnet_obj, lambda_val) X_sel, Y_sel, weight_sel = selection_data - + X_sel, Y_sel, _, _, weight_sel = fixed_lambda.get_data_arrays(X_sel, + Y_sel) + Y_sel = np.asarray(Y_sel) + if weight_sel is None: weight_sel = np.ones(X_sel.shape[0]) From e07bf4647804b70cb412436e6e24be8ff55e1a02 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 26 Jan 2024 14:57:01 -0800 Subject: [PATCH 23/44] using score of family, seems to have good coverage under global null --- glmnet/cox.py | 3 ++- glmnet/glm.py | 1 + glmnet/inference.py | 26 ++++++------------ tests/test_inference.py | 58 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 19 deletions(-) create mode 100644 tests/test_inference.py diff --git a/glmnet/cox.py b/glmnet/cox.py index a23a459..10335dd 100644 --- a/glmnet/cox.py +++ b/glmnet/cox.py @@ -57,11 +57,12 @@ def logl_score(self, family, y, sample_weight): + link_parameter = self.link_parameter family._result = family._coxdev(link_parameter, sample_weight) # the gradient is the gradient of the deviance - # we want deviance of the log-likelihood + # we want gradient of the log-likelihood return - family._result.gradient / 2 @dataclass diff --git a/glmnet/glm.py b/glmnet/glm.py index 3873cbc..9992509 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -305,6 +305,7 @@ def logl_score(self, dmu_deta = family.link.inverse_deriv(self.link_parameter) # compute working residual + y = np.asarray(y).reshape(-1) r = (y - self.mu) return sample_weight * r * dmu_deta / varmu diff --git a/glmnet/inference.py b/glmnet/inference.py index c42b33b..df36cb7 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -52,8 +52,6 @@ def lasso_inference(glmnet_obj, X_sel, Y_sel, weight_sel = selection_data X_sel, Y_sel, _, _, weight_sel = fixed_lambda.get_data_arrays(X_sel, Y_sel) - Y_sel = np.asarray(Y_sel) - if weight_sel is None: weight_sel = np.ones(X_sel.shape[0]) @@ -94,19 +92,16 @@ def lasso_inference(glmnet_obj, stacked = np.hstack([FL.intercept_, FL.coef_[active_set]]) delta = weight_sel.sum() * lambda_val * penfac * signs - print(penfac) else: # keep[active_set] = 1 keep[1 + active_set] = 1 stacked = FL.coef_[active_set] delta = weight_sel.sum() * lambda_val * penfac * signs - print(active_set, 'huh') - D = np.column_stack([np.ones(X_sel.shape[0]), X_sel[:,active_set]]) other_mle = np.linalg.pinv(D) @ Y_sel A_sel = C_sel / unreg_sel_LM.dispersion_ - print(np.linalg.norm(A_sel - np.linalg.inv(unreg_sel_LM.design_.quadratic_form(unreg_sel_LM._information))), 'normie') + delta = A_sel @ delta noisy_mle = stacked + delta @@ -115,8 +110,10 @@ def lasso_inference(glmnet_obj, # fit full model - subgrad1 = X_sel[:,active_set].T @ (Y_sel - FL.design_ @ np.hstack([FL.intercept_, FL.coef_])) - print(subgrad1, 'subgrad1') + Y_sel = np.asarray(Y_sel) + + # X'(Y-X\hat{\beta}) (appropriately weighted) + subgrad1 = FL.design_.T @ FL.state_.logl_score(FL._family, Y_sel, sample_weight=weight_sel) X_full, Y_full, weight_full = full_data @@ -134,12 +131,6 @@ def lasso_inference(glmnet_obj, else: full_mle = unreg_LM.coef_ - print(other_mle, 'other') - print(noisy_mle, 'noisy') - print(full_mle, 'full') - - stop - ## iterate over coordinates Ls = np.zeros_like(noisy_mle) Us = np.zeros_like(noisy_mle) @@ -812,7 +803,7 @@ def interval_constraints(support_directions, direction_of_interest) U = A.dot(X) - b - print(U, np.fabs(U).max()* tol, b, '?') + #print(U, np.fabs(U).max()* tol, b, '?') if not np.all(U < tol * np.fabs(U).max()): warn('constraints not satisfied: %s' % repr(U)) @@ -841,7 +832,6 @@ def interval_constraints(support_directions, else: lower_bound = -np.inf - print(lower_bound, V, upper_bound, sigma) return lower_bound, V, upper_bound, sigma def selection_interval(support_directions, @@ -912,7 +902,7 @@ def selection_interval(support_directions, estimate = (direction_of_interest * observation).sum() noisy_estimate = (direction_of_interest * noisy_observation).sum() - print(V, estimate, noisy_estimate, 'huh') + #print(V, estimate, noisy_estimate, 'huh') if smoothing_sigma > 1e-6 * sigma: @@ -941,7 +931,7 @@ def selection_interval(support_directions, ub = estimate + 20 * sigma if estimate < lower_bound or estimate > upper_bound: - print(estimate, lower_bound, upper_bound) + #print(estimate, lower_bound, upper_bound) warn('Constraints not satisfied: returning [-np.inf, np.inf]') return -np.inf, np.inf, np.nan, np.nan diff --git a/tests/test_inference.py b/tests/test_inference.py new file mode 100644 index 0000000..95f4a19 --- /dev/null +++ b/tests/test_inference.py @@ -0,0 +1,58 @@ +import pytest + +import numpy as np +import pandas as pd +import statsmodels.api as sm + +from glmnet import GLMNet +from glmnet.inference import (fixed_lambda_estimator, + lasso_inference) +@pytest.mark.parametrize('n', [500]) +@pytest.mark.parametrize('p', [103]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +def test_inference(n, p, fit_intercept): + run_inference(n, p, fit_intercept) + +def run_inference(n, p, fit_intercept): + + rng = np.random.default_rng(0) + X = rng.standard_normal((n, p)) + Y = np.random.standard_normal(n) * 2 + Df = pd.DataFrame({'response':Y}) + + fam = sm.families.Gaussian() + GN = GLMNet(response_id='response', + family=fam, + fit_intercept=fit_intercept) + GN.fit(X, Df) + return lasso_inference(GN, + GN.lambda_values_[10], + (X[:50], Df.iloc[:50], None), + (X, Df, None)) + +# ncover = 0 +# nsel = 0 +# niter = 1000 +# for i in range(niter): +# GN = GLMNet(response_id='response', +# family=fam, +# control=GNcontrol, +# fit_intercept=False, +# ) + +# X = rng.standard_normal((n, p)) +# Y = rng.standard_normal(n) * 1 +# Df = pd.DataFrame({'response':Y}) +# GN.fit(X, Df) +# sel_slice = slice(0, 4*n//5) +# res = lasso_inference(GN, +# GN.lambda_values_[10], +# (X[sel_slice], Df.iloc[sel_slice], None), +# (X, Df, None), +# level=0.8) + +# ncover += ((res['lower'] < 0) & (res['upper'] > 0)).sum() +# nsel += res.shape[0] + +# ncover / nsel + From 68180f65e07093c08ec877d4da497b8acdd8e054 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 12 Apr 2024 12:01:28 -0700 Subject: [PATCH 24/44] BF: quadratic form not computed correctly because X was modified in place when not sparse, added option for the implied Q of transformed or raw --- glmnet/base.py | 65 ++++++++++++++++++--------- tests/test_design.py | 102 ++++++++++++++++++++++--------------------- 2 files changed, 98 insertions(+), 69 deletions(-) diff --git a/glmnet/base.py b/glmnet/base.py index 257fbe0..2517071 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -95,8 +95,15 @@ def _rmatvec(self, r): def quadratic_form(self, G=None, - columns=None): + columns=None, + transformed=False): ''' + if transformed is False: compute + + [1 X]'G[1 X[:,E]] + + if transformed is True: compute + A'GA[:,E] where A is the effective matrix @@ -104,6 +111,7 @@ def quadratic_form(self, [1, XS^{-1} - 1 (xm/xs)'] and E is a subset of columns + ''' # A is effective matrix @@ -112,6 +120,9 @@ def quadratic_form(self, # A'G1 = S^{-1}(X' - xm 1')G1 = S^{-1}X'G1 - S^{-1}xm 1'G1 # A'GA = S^{-1}X'GXS^{-1} - S^{-1}X'G1 xm/xs' - xm/xs 1'GXS^{-1} + xm/xs 1'G1 (xm/xs)' + # or, the reverse + # X'GX = SA'GAS + X'G1xm' + xm1'GX - xm1'G1xm' + n, p = self.shape[0], self.shape[1] - 1 if columns is None: @@ -122,35 +133,49 @@ def quadratic_form(self, if G is None: XX_block = self.X.T @ X_R # have to assume this is not too expensive + if scipy.sparse.issparse(self.X): + XX_block = XX_block.toarray() X1_block = self.X.sum(0) # X'1 G_sum = n - + else: GX = G @ X_R G1 = G @ np.ones(G.shape[0]) XX_block = self.X.T @ GX X1_block = self.X.T @ G1 G_sum = G1.sum() - - if scipy.sparse.issparse(XX_block): - XX_block = XX_block.toarray() - - # correct XX_block for standardize - + xm, xs = self.centers_, self.scaling_ + if scipy.sparse.issparse(self.X): # in this case X has not been transformed - if columns is not None: - XX_block -= (np.multiply.outer(X1_block, xm[columns]) + np.multiply.outer(xm, X1_block[columns])) - XX_block += np.multiply.outer(xm, xm[columns]) * G_sum - XX_block /= np.multiply.outer(xs, xs[columns]) - else: - XX_block -= (np.multiply.outer(X1_block, xm) + np.multiply.outer(xm, X1_block)) - XX_block += np.multiply.outer(xm, xm) * G_sum - XX_block /= np.multiply.outer(xs, xs) + # correct XX_block for standardize + + if transformed: + if columns is not None: + XX_block -= (np.multiply.outer(X1_block, xm[columns]) + np.multiply.outer(xm, X1_block[columns])) + XX_block += np.multiply.outer(xm, xm[columns]) * G_sum + XX_block /= np.multiply.outer(xs, xs[columns]) + else: + XX_block -= (np.multiply.outer(X1_block, xm) + np.multiply.outer(xm, X1_block)) + XX_block += np.multiply.outer(xm, xm) * G_sum + XX_block /= np.multiply.outer(xs, xs) + + X1_block -= G_sum * xm + X1_block /= xs + + else: # X will already have been transformed, so + # we only have to undo it if transformed=False + + # or, the reverse + # X'GX = SA'GAS + X'G1xm' + xm1'GX - xm1'G1xm' + + if not transformed: + XX_block *= np.multiply.outer(xs, xs[columns]) + X1_block *= xs + XX_block += (np.multiply.outer(X1_block, xm[columns]) + np.multiply.outer(xm, X1_block[columns])) + XX_block += np.multiply.outer(xm, xm[columns]) * G_sum + X1_block += G_sum * xm - X1_block -= G_sum * xm - X1_block /= xs - Q = np.zeros((XX_block.shape[0] + 1, XX_block.shape[1] + 1)) Q[1:,1:] = XX_block @@ -160,7 +185,7 @@ def quadratic_form(self, else: Q[0,1:] = X1_block Q[0,0] = G_sum - + return Q @add_dataclass_docstring diff --git a/tests/test_design.py b/tests/test_design.py index af6c895..98357f2 100644 --- a/tests/test_design.py +++ b/tests/test_design.py @@ -86,55 +86,59 @@ def test_design(X, weights, standardize, intercept): assert np.allclose(design.T @ V_l, design_s.T @ V_l) -# @pytest.mark.parametrize('X', [rng.standard_normal((n, p)), -# scipy.sparse.csc_array(rng.standard_normal((n, p)))] ) -# @pytest.mark.parametrize('weights', [np.ones(n), rng.uniform(1, 2, size=(n,))]) -# @pytest.mark.parametrize('standardize', [True, False]) -# @pytest.mark.parametrize('intercept', [True, False]) -# @pytest.mark.parametrize('gls', [None, -# rng.uniform(1, 2, size=(n,)), -# rng.uniform(1, 2, size=(n,n))]) -# def test_quadratic_form(X, weights, standardize, intercept, gls): -# """ -# test the linear / adjoint maps of Design from an np.ndarray and a scipy.sparse.csc_array -# """ +@pytest.mark.parametrize('X', [rng.standard_normal((n, p)), + scipy.sparse.csc_array(rng.standard_normal((n, p)))] ) +@pytest.mark.parametrize('weights', [np.ones(n), rng.uniform(1, 2, size=(n,))]) +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +@pytest.mark.parametrize('gls', [None, + np.diag(rng.uniform(1, 2, size=(n,))), + rng.uniform(1, 2, size=(n,n))]) +def test_quadratic_form(X, weights, standardize, intercept, gls): + """ + test the linear / adjoint maps of Design from an np.ndarray and a scipy.sparse.csc_array + """ -# if gls is not None and gls.ndim == 2: -# gls = (gls + gls.T) / 2 -# X_s = scipy.sparse.csc_array(X) -# design = Design(X, W, standardize=stand) -# design_s = Design(X_s, W, standardize=stand) - -# Q_s = design_s.quadratic_form(G=gls, columns=columns) -# Q = design.quadratic_form(G=gls, columns=columns) - -# X_eff = scipy.sparse.csc_array(X).toarray() -# if stand: -# xm = (X_eff * W[:,None]).sum(0) / W.sum() -# x2 = (X_eff**2 * W[:,None]).sum(0) / W.sum() -# xs = np.sqrt(x2 - xm**2) -# else: -# xm = np.zeros(p) -# xs = np.ones(p) - -# X_eff = X_eff / xs[None,:] - np.multiply.outer(np.ones(n), xm / xs) -# X_eff = np.concatenate([np.ones((n,1)), X_eff], axis=1) - -# if columns is not None: -# columns = np.array(columns) -# columns += 1 -# columns = np.hstack([[0], columns]) - -# if gls is None: -# Q_eff = X_eff.T @ X_eff -# elif gls.ndim == 1: -# Q_eff = X_eff.T @ (gls[:, None] * X_eff) -# else: -# Q_eff = X_eff.T @ gls @ X_eff -# if columns is not None: -# Q_eff = Q_eff[:, columns] -# assert np.allclose(Q, Q_eff) # calculation is done correctly - -# assert np.allclose(Q_s, Q) # sparse and dense agree + if gls is not None and gls.ndim == 2: + gls = (gls + gls.T) / 2 + + X_s = scipy.sparse.csc_array(X) + W = weights + design = Design(X, W, standardize=standardize) + design_s = Design(X_s, W, standardize=standardize) + + columns = [0,1,3] + Q_s = design_s.quadratic_form(G=gls, columns=columns, transformed=standardize) + Q = design.quadratic_form(G=gls, columns=columns, transformed=standardize) + + X_eff = scipy.sparse.csc_array(X).toarray() + + if standardize: + xm = (X_eff * W[:,None]).sum(0) / W.sum() + x2 = (X_eff**2 * W[:,None]).sum(0) / W.sum() + xs = np.sqrt(x2 - xm**2) + else: + xm = np.zeros(p) + xs = np.ones(p) + + X_eff = X_eff / xs[None,:] - np.multiply.outer(np.ones(n), xm / xs) + X_eff = np.concatenate([np.ones((n,1)), X_eff], axis=1) + + if columns is not None: + columns = np.array(columns) + columns += 1 + columns = np.hstack([[0], columns]) + + if gls is None: + Q_eff = X_eff.T @ X_eff + elif gls.ndim == 1: + Q_eff = X_eff.T @ (gls[:, None] * X_eff) + else: + Q_eff = X_eff.T @ gls @ X_eff + if columns is not None: + Q_eff = Q_eff[:, columns] + assert np.allclose(Q, Q_eff) # calculation is done correctly + + assert np.allclose(Q_s, Q) # sparse and dense agree From c1eab7336f93821388dddfc448874c6b5da95ea1 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 12 Apr 2024 15:07:28 -0700 Subject: [PATCH 25/44] data split inference working well (slightly better without standardizing) --- glmnet/cox.py | 3 +- glmnet/glm.py | 4 +- glmnet/glmnet.py | 2 +- glmnet/inference.py | 352 ++++++++++++---------------------------- tests/test_inference.py | 143 ++++++++++++---- 5 files changed, 215 insertions(+), 289 deletions(-) diff --git a/glmnet/cox.py b/glmnet/cox.py index 10335dd..4256850 100644 --- a/glmnet/cox.py +++ b/glmnet/cox.py @@ -213,7 +213,8 @@ def _summarize(self, # IRLS used normalized weights, # this unnormalizes them... - unscaled_precision_ = self.design_.quadratic_form(self._information) + unscaled_precision_ = self.design_.quadratic_form(self._information, + transformed=False) keep = np.ones(unscaled_precision_.shape[0]-1, bool) if exclude is not []: diff --git a/glmnet/glm.py b/glmnet/glm.py index 9992509..1d28c94 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -669,7 +669,6 @@ def fit(self, weight, X.shape) - else: self.summary_ = self.covariance_ = None @@ -684,7 +683,8 @@ def _summarize(self, # IRLS used normalized weights, # this unnormalizes them... - unscaled_precision_ = self.design_.quadratic_form(self._information) + unscaled_precision_ = self.design_.quadratic_form(self._information, + transformed=False) keep = np.ones(unscaled_precision_.shape[0]-1, bool) if exclude is not []: diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index 7a2ed39..e51236a 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -475,7 +475,7 @@ def _get_initial_state(self, intercept_ = 0 return GLMState(coef_, intercept_), keep.astype(float) - def get_LM(self): + def get_GLM(self): return GLM(family=self.family, fit_intercept=self.fit_intercept, offset_id=self.offset_id, diff --git a/glmnet/inference.py b/glmnet/inference.py index df36cb7..0193c41 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -18,6 +18,8 @@ from sklearn.utils.validation import check_is_fitted from sklearn.base import clone +from .base import _get_design + def fixed_lambda_estimator(glmnet_obj, lambda_val): check_is_fitted(glmnet_obj, ["coefs_", "feature_names_in_"]) @@ -54,7 +56,7 @@ def lasso_inference(glmnet_obj, Y_sel) if weight_sel is None: weight_sel = np.ones(X_sel.shape[0]) - + fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel, @@ -68,137 +70,51 @@ def lasso_inference(glmnet_obj, active_set = np.nonzero(FL.coef_ != 0)[0] - # find selection data covariance + # fit unpenalized model on selection data - unreg_sel_LM = glmnet_obj.get_LM() - unreg_sel_LM.summarize = True - unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) - C_sel = unreg_sel_LM.covariance_ + unreg_sel_GLM = glmnet_obj.get_GLM() - keep = np.zeros(FL.design_.shape[1], bool) - if hasattr(FL, 'penalty_factors'): - penfac = FL.penalty_factors[active_set] - else: - penfac = np.ones(active_set.shape[0]) - signs = np.sign(FL.coef_[active_set]) + unreg_sel_GLM.summarize = True + unreg_sel_GLM.fit(X_sel[:,active_set], + Y_sel, + sample_weight=weight_sel) - # we multiply by weight_sel.sum() due to this factor - # appearing in glmnet objective... - if FL.fit_intercept: - penfac = np.hstack([0, penfac]) - signs = np.hstack([0, signs]) - keep[0] = 1 - keep[1 + active_set] = 1 - stacked = np.hstack([FL.intercept_, - FL.coef_[active_set]]) - delta = weight_sel.sum() * lambda_val * penfac * signs - else: - # keep[active_set] = 1 - keep[1 + active_set] = 1 - stacked = FL.coef_[active_set] - delta = weight_sel.sum() * lambda_val * penfac * signs + # quadratic approximation up to scaling and a factor of weight_sel.sum() - D = np.column_stack([np.ones(X_sel.shape[0]), X_sel[:,active_set]]) - other_mle = np.linalg.pinv(D) @ Y_sel - A_sel = C_sel / unreg_sel_LM.dispersion_ - - delta = A_sel @ delta - noisy_mle = stacked + delta - - penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(A_sel.shape[0])[penalized] - - # fit full model - - Y_sel = np.asarray(Y_sel) - - # X'(Y-X\hat{\beta}) (appropriately weighted) - subgrad1 = FL.design_.T @ FL.state_.logl_score(FL._family, Y_sel, sample_weight=weight_sel) + design_sel = _get_design(X_sel[:,active_set], + weight_sel, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + scaling_sel = design_sel.scaling_ + P_sel = design_sel.quadratic_form(unreg_sel_GLM._information, + transformed=True) + Q_sel = np.linalg.inv(P_sel) + if not FL.fit_intercept: + Q_sel = Q_sel[1:,1:] + + # fit unpenalized model on full data X_full, Y_full, weight_full = full_data - - unreg_LM = glmnet_obj.get_LM() - unreg_LM.summarize = True - unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) - C_full = unreg_LM.covariance_ - - # selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) - - ## TODO: will this handle fit_intercept? - if FL.fit_intercept: - full_mle = np.hstack([unreg_LM.intercept_, - unreg_LM.coef_]) - else: - full_mle = unreg_LM.coef_ - - ## iterate over coordinates - Ls = np.zeros_like(noisy_mle) - Us = np.zeros_like(noisy_mle) - mles = np.zeros_like(noisy_mle) - pvals = np.zeros_like(noisy_mle) - for i in range(len(noisy_mle)): - e_i = np.zeros_like(noisy_mle) - e_i[i] = 1. - ## call selection_interval and return - L, U, mle, p = selection_interval( - support_directions=sel_P, - support_offsets=-signs[penalized] * delta[penalized], - covariance_noisy=C_sel, - covariance_full=C_full, - noisy_observation=noisy_mle, - observation=full_mle, - direction_of_interest=e_i, - level=level, - ) - Ls[i] = L - Us[i] = U - mles[i] = mle - pvals[i] = p - - idx = (active_set).tolist() - if FL.fit_intercept: - idx = [-1] + idx - return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) - -def lasso_bootstrap_inference(glmnet_obj, - lambda_val, - selection_data, ## X = \Sigma^{-1/2}, Y = \Sigma^{-1/2}\hat\beta^{(b)} - full_data, - C_full, ## bootstrap variance - alpha, - level=.9): - - fixed_lambda = fixed_lambda_estimator(glmnet_obj, lambda_val) - fixed_lambda.fit_intercept = False - gaussian = sm.families.Gaussian() - fixed_lambda.family = gaussian - X_sel, Y_sel, weight_sel = selection_data - - if weight_sel is None: - weight_sel = np.ones(X_sel.shape[0]) + if weight_full is None: + weight_full = np.ones(X_full.shape[0]) - fixed_lambda.fit(X_sel, Y_sel, sample_weight=weight_sel) - FL = fixed_lambda # shorthand - - if (not np.all(FL.upper_limits == np.inf) or - not np.all(FL.lower_limits == -np.inf)): - raise NotImplementedError('upper/lower limits coming soon') - - active_set = np.nonzero(FL.coef_ != 0)[0] - - # find selection data covariance - - # unreg_sel_LM = glmnet_obj.get_LM() - # unreg_sel_LM.summarize = True - # unreg_sel_LM.fit(X_sel[:,active_set], Y_sel, sample_weight=weight_sel) - # C_sel = unreg_sel_LM.covariance_ - # C_full = np.eye(len(active_set)) + unreg_GLM = glmnet_obj.get_GLM() + unreg_GLM.summarize = True + unreg_GLM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) + + # quadratic approximation + + design_full = _get_design(X_full[:,active_set], + weight_full, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + scaling_full = design_full.scaling_ + P_full = design_full.quadratic_form(unreg_GLM._information, + transformed=True) + Q_full = np.linalg.inv(P_full) + if not FL.fit_intercept: + Q_full = Q_full[1:,1:] - state = FL.state_ - information = FL._family.information(state, - weight_sel) - - keep = np.zeros(FL.design_.shape[1], bool) if hasattr(FL, 'penalty_factors'): penfac = FL.penalty_factors[active_set] else: @@ -207,76 +123,77 @@ def lasso_bootstrap_inference(glmnet_obj, # we multiply by weight_sel.sum() due to this factor # appearing in glmnet objective... + if FL.fit_intercept: + # correct the scaling penfac = np.hstack([0, penfac]) signs = np.hstack([0, signs]) - keep[0] = 1 - keep[1 + active_set] = 1 - stacked = np.hstack([state.intercept, - state.coef[active_set]]) - delta = np.zeros(keep.sum()) - delta[1:] = weight_sel.sum() * lambda_val * penfac[1:] * signs[1:] + stacked = np.hstack([FL.state_.intercept, + FL.state_.coef[active_set]]) else: - keep[1 + active_set] = 1 - stacked = state.coef[active_set] - delta = weight_sel.sum() * lambda_val * penfac * signs + stacked = FL.state_.coef[active_set] - C_full_ = C_full.copy() - C_full = C_full[keep[1:]][:,keep[1:]] - C_sel = (1 + alpha) * C_full + delta = lambda_val * penfac * signs - delta = C_sel @ delta - noisy_mle = stacked + delta + # remember loss of glmnet is normalized by sum of weights + # when taking newton step adjust by weight_sel.sum() + + delta = Q_sel @ delta * weight_sel.sum() + noisy_mle = stacked + delta # unitless scale + + transform_to_scaled = np.diag(np.hstack([1, 1/scaling_sel])) + transform_to_scaled[0,1:] = -design_sel.centers_/scaling_sel + DEVEL = False + if DEVEL: # compare to coefs and intercept on scales with units + scaled_mle = transform_to_scaled @ noisy_mle + #scaled_mle[1:] /= scaling_sel + #scaled_mle[0] = scaled_mle[0] - (scaled_mle[1:] * design_sel.centers_).sum() + assert( np.allclose(scaled_mle, np.hstack([unreg_sel_GLM.intercept_, unreg_sel_GLM.coef_]))) penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(C_sel.shape[0])[penalized] - - # fit full model - - X_full, Y_full, weight_full = full_data - - unreg_LM = glmnet_obj.get_LM() - unreg_LM.summarize = True - unreg_LM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) - # C_full = unreg_LM.covariance_ + sel_P = -np.diag(signs[penalized]) @ np.eye(Q_sel.shape[0])[penalized] + assert (np.all(sel_P @ noisy_mle < -signs[penalized] * delta[penalized])) - # selection_proportion = np.clip(np.diag(C_full).sum() / np.diag(C_sel).sum(), 0, 1) - ## TODO: will this handle fit_intercept? if FL.fit_intercept: - full_mle = np.hstack([unreg_LM.intercept_, - unreg_LM.coef_]) + full_mle = np.hstack([unreg_GLM.state_.intercept, + unreg_GLM.state_.coef]) else: - full_mle = unreg_LM.coef_ + full_mle = unreg_GLM.state_.coef ## iterate over coordinates Ls = np.zeros_like(noisy_mle) Us = np.zeros_like(noisy_mle) mles = np.zeros_like(noisy_mle) pvals = np.zeros_like(noisy_mle) - for i in range(len(noisy_mle)): - e_i = np.zeros_like(noisy_mle) - e_i[i] = 1. - # e_i = np.linalg.pinv(X_sel) @ e_i + + if FL.fit_intercept: + transform_to_scaled = np.diag(np.hstack([1, 1/scaling_full])) + transform_to_scaled[0,1:] = -design_full.centers_/scaling_full + else: + transform_to_scaled = np.diag(1/scaling_full) + + for i in range(transform_to_scaled.shape[0]): ## call selection_interval and return L, U, mle, p = selection_interval( support_directions=sel_P, support_offsets=-signs[penalized] * delta[penalized], - covariance_noisy=C_sel, - covariance_full=C_full, + Q_noisy=Q_sel, + Q_full=Q_full, noisy_observation=noisy_mle, observation=full_mle, - direction_of_interest=e_i, + direction_of_interest=transform_to_scaled[i], level=level, + dispersion=unreg_GLM.dispersion_ ) Ls[i] = L Us[i] = U mles[i] = mle pvals[i] = p - + idx = (active_set).tolist() if FL.fit_intercept: - idx = [-1] + idx + idx = ['intercept'] + idx return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) class constraints(object): @@ -407,71 +324,6 @@ def value(self, Y): """ return (self.linear_part.dot(Y) - self.offset).max() - def conditional(self, - linear_part, - value, - rank=None): - """ - Return an equivalent constraint - after having conditioned on a linear equality. - - Let the inequality constraints be specified by - `(A,b)` and the equality constraints be specified - by `(C,d)`. We form equivalent inequality constraints by - considering the residual - - .. math:: - - AY - E(AY|CY=d) - - Parameters - ---------- - - linear_part : np.float((k,q)) - Linear part of equality constraint, `C` above. - - value : np.float(k) - Value of equality constraint, `b` above. - - rank : int - If not None, this should specify - the rank of `linear_part`. Defaults - to `min(k,q)`. - - Returns - ------- - - conditional_con : `constraints` - Affine constraints having applied equality constraint. - - """ - - S = self.covariance - C, d = linear_part, value - - M1 = S.dot(C.T) - M2 = C.dot(M1) - - if M2.shape: - M2i = np.linalg.pinv(M2) - delta_cov = M1.dot(M2i.dot(M1.T)) - delta_mean = M1.dot(M2i.dot(C.dot(self.mean) - d)) - else: - delta_cov = np.multiply.outer(M1, M1) / M2 - delta_mean = M1 * (C.dot(self.mean) - d) / M2 - - if rank is None: - if len(linear_part.shape) == 2: - rank = min(linear_part.shape) - else: - rank = 1 - - return constraints(self.linear_part, - self.offset, - covariance=self.covariance - delta_cov, - mean=self.mean - delta_mean, - rank=self.rank - rank) - def bounds(self, direction_of_interest, Y): r""" For a realization $Y$ of the random variable $N(\mu,\Sigma)$ @@ -803,9 +655,9 @@ def interval_constraints(support_directions, direction_of_interest) U = A.dot(X) - b - #print(U, np.fabs(U).max()* tol, b, '?') if not np.all(U < tol * np.fabs(U).max()): + stop warn('constraints not satisfied: %s' % repr(U)) Sw = S.dot(w) @@ -836,13 +688,14 @@ def interval_constraints(support_directions, def selection_interval(support_directions, support_offsets, - covariance_noisy, - covariance_full, + Q_noisy, + Q_full, noisy_observation, observation, direction_of_interest, tol = 1.e-4, level = 0.90, + dispersion=1, UMAU=True): """ Given an affine in cone constraint $\{z:Az+b \leq 0\}$ (elementwise) @@ -890,23 +743,24 @@ def selection_interval(support_directions, V, upper_bound, _) = interval_constraints(support_directions, - support_offsets, - covariance_full, - noisy_observation, - direction_of_interest, - tol=tol) + support_offsets, + Q_full, + noisy_observation, + direction_of_interest, + tol=tol) + ## lars path lockhart tibs^2 taylor paper - sigma = np.sqrt(direction_of_interest.T @ covariance_full @ direction_of_interest) + sigma = np.sqrt(direction_of_interest.T @ Q_full @ direction_of_interest * dispersion) ## sqrt(alpha) is just sigma_noisy - sigma_full - smoothing_sigma = np.sqrt(max(direction_of_interest.T @ covariance_noisy @ direction_of_interest - sigma**2, 0)) + noisy_sigma = np.sqrt(direction_of_interest.T @ Q_noisy @ direction_of_interest * dispersion) + smoothing_sigma = np.sqrt(max(noisy_sigma**2 - sigma**2, 0)) + estimate = (direction_of_interest * observation).sum() noisy_estimate = (direction_of_interest * noisy_observation).sum() - #print(V, estimate, noisy_estimate, 'huh') - if smoothing_sigma > 1e-6 * sigma: - grid = np.linspace(estimate - 20 * sigma, estimate + 20 * sigma, 801) + grid = np.linspace(estimate - 10 * sigma, estimate + 10 * sigma, 2001) weight = ( normal_dbn.cdf( (upper_bound - grid) / (smoothing_sigma) @@ -914,24 +768,23 @@ def selection_interval(support_directions, (lower_bound - grid) / (smoothing_sigma) ) ) - weight *= normal_dbn.pdf(grid / sigma) + weight *= normal_dbn.pdf((grid - estimate) / sigma) sel_distr = discrete_family(grid, weight) L, U = sel_distr.equal_tailed_interval(estimate, - alpha=1-level) + alpha=1-level) mle, _, _ = sel_distr.MLE(estimate) - mle *= sigma**2 - pval = sel_distr.cdf(0, estimate) + mle *= sigma**2; mle += estimate + pval = sel_distr.cdf(-estimate / sigma**2, estimate) pval = 2 * min(pval, 1-pval) - L *= sigma**2 - U *= sigma**2 + L *= sigma**2; L += estimate + U *= sigma**2; U += estimate else: lb = estimate - 20 * sigma ub = estimate + 20 * sigma - if estimate < lower_bound or estimate > upper_bound: - #print(estimate, lower_bound, upper_bound) + if noisy_estimate < lower_bound or noisy_estimate > upper_bound: warn('Constraints not satisfied: returning [-np.inf, np.inf]') return -np.inf, np.inf, np.nan, np.nan @@ -972,7 +825,6 @@ def F(theta): U = np.inf mle = np.nan - return L, U, mle, pval @@ -1072,9 +924,11 @@ def __init__(self, sufficient_stat, weights, theta=0.): The weights are normalized to sum to 1. """ xw = np.array(sorted(zip(sufficient_stat, weights)), float) + xw[:,1] = np.maximum(xw[:,1], 1e-40) self._x = xw[:,0] self._w = xw[:,1] self._lw = np.log(xw[:,1]) + self._w /= self._w.sum() # make sure they are a pmf self.n = len(xw) self._theta = np.nan diff --git a/tests/test_inference.py b/tests/test_inference.py index 95f4a19..a37851c 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -10,49 +10,120 @@ @pytest.mark.parametrize('n', [500]) @pytest.mark.parametrize('p', [103]) @pytest.mark.parametrize('fit_intercept', [True, False]) -def test_inference(n, p, fit_intercept): - run_inference(n, p, fit_intercept) +@pytest.mark.parametrize('standardize', [True, False]) +def test_inference(n, + p, + fit_intercept, + standardize, + ntrial=10): + # run a few times to be sure KKT conditions not violated -def run_inference(n, p, fit_intercept): + for _ in range(ntrial): + run_inference(n, + p, + fit_intercept, + standardize) - rng = np.random.default_rng(0) +def run_inference(n, + p, + fit_intercept, + standardize, + rng=None, + alt=False, + s=3, + prop=0.8): + + if rng is None: + rng = np.random.default_rng(0) X = rng.standard_normal((n, p)) - Y = np.random.standard_normal(n) * 2 + D = np.linspace(1, p, p) / p + 1 + X *= D[None,:] + Y = rng.standard_normal(n) * 2 + beta = np.zeros(p) + if alt: + beta[:s] = rng.standard_normal(s) + Y += X @ beta Df = pd.DataFrame({'response':Y}) fam = sm.families.Gaussian() GN = GLMNet(response_id='response', family=fam, - fit_intercept=fit_intercept) + fit_intercept=fit_intercept, + standardize=standardize) GN.fit(X, Df) - return lasso_inference(GN, - GN.lambda_values_[10], - (X[:50], Df.iloc[:50], None), - (X, Df, None)) - -# ncover = 0 -# nsel = 0 -# niter = 1000 -# for i in range(niter): -# GN = GLMNet(response_id='response', -# family=fam, -# control=GNcontrol, -# fit_intercept=False, -# ) - -# X = rng.standard_normal((n, p)) -# Y = rng.standard_normal(n) * 1 -# Df = pd.DataFrame({'response':Y}) -# GN.fit(X, Df) -# sel_slice = slice(0, 4*n//5) -# res = lasso_inference(GN, -# GN.lambda_values_[10], -# (X[sel_slice], Df.iloc[sel_slice], None), -# (X, Df, None), -# level=0.8) - -# ncover += ((res['lower'] < 0) & (res['upper'] > 0)).sum() -# nsel += res.shape[0] - -# ncover / nsel + m = int(prop*n) + + df = lasso_inference(GN, + GN.lambda_values_[10], + (X[:m], Df.iloc[:m], None), + (X, Df, None)) + if fit_intercept: + active_set = np.array(df.index[1:]).astype(int) + else: + active_set = np.array(df.index).astype(int) + X_sel = X[:,active_set] + + if fit_intercept: + X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) + targets = np.linalg.pinv(X_sel) @ X @ beta + + df['target'] = targets + return df + + +def main_null(fit_intercept=True, + standardize=True, + n=2000, + p=50, + ntrial=500, + rng=None): + + ncover = 0 + nsel = 0 + + dfs = [] + + rng = np.random.default_rng(0) + for i in range(ntrial): + df = run_inference(n, + p, + fit_intercept, + standardize, + rng=rng) + dfs.append(df) + all_df = pd.concat(dfs) + ncover += ((all_df['lower'] < 0) & (all_df['upper'] > 0)).sum() + nsel += all_df.shape[0] + + print('cover:', ncover / nsel, 'typeI:', (all_df['pval'] < 0.05).mean()) + + +def main_alt(fit_intercept=True, + standardize=True, + n=2000, + p=50, + ntrial=500, + rng=None): + + ncover = 0 + nsel = 0 + + dfs = [] + + rng = np.random.default_rng(0) + for i in range(ntrial): + df = run_inference(n, + p, + fit_intercept, + standardize, + rng=rng, + alt=True) + dfs.append(df) + all_df = pd.concat(dfs) + ncover += ((all_df['lower'] < all_df['target']) & (all_df['upper'] > all_df['target'])).sum() + nsel += all_df.shape[0] + + print('cover:', ncover / nsel) + return all_df + From f1ef8ca7efb050c06c9c07905682e6b45836ff9f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 12 Apr 2024 15:24:04 -0700 Subject: [PATCH 26/44] getting rid of some unused code --- glmnet/inference.py | 399 +--------------------------------------- tests/test_inference.py | 2 +- 2 files changed, 4 insertions(+), 397 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 0193c41..36ee070 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -196,400 +196,6 @@ def lasso_inference(glmnet_obj, idx = ['intercept'] + idx return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) -class constraints(object): - - r""" - This class is the core object for affine selection procedures. - It is meant to describe sets of the form $C$ - where - - .. math:: - - C = \left\{z: Az\leq b \right \} - - Its main purpose is to consider slices through $C$ - and the conditional distribution of a Gaussian $N(\mu,\Sigma)$ - restricted to such slices. - - Notes - ----- - - In this parameterization, the parameter `self.mean` corresponds - to the *reference measure* that is being truncated. It is not the - mean of the truncated Gaussian. - - >>> positive = constraints(-np.identity(2), np.zeros(2)) - >>> Y = np.array([3, 4.4]) - >>> eta = np.array([1, 1], np.float) - >>> list(positive.interval(eta, Y)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - [4.62..., 10.17...] - >>> positive.pivot(eta, Y) # doctest: +ELLIPSIS - 5.187...-07 - >>> list(positive.bounds(eta, Y)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - [1.399..., 7.4..., inf, 1.414...] - >>> - - """ - - def __init__(self, - linear_part, - offset, - covariance=None, - mean=None, - rank=None): - r""" - Create a new inequality. - - Parameters - ---------- - - linear_part : np.float((q,p)) - The linear part, $A$ of the affine constraint - $\{z:Az \leq b\}$. - - offset: np.float(q) - The offset part, $b$ of the affine constraint - $\{z:Az \leq b\}$. - - covariance : np.float((p,p)) - Covariance matrix of Gaussian distribution to be - truncated. Defaults to `np.identity(self.dim)`. - - mean : np.float(p) - Mean vector of Gaussian distribution to be - truncated. Defaults to `np.zeros(self.dim)`. - - rank : int - If not None, this should specify - the rank of the covariance matrix. Defaults - to self.dim. - - """ - - self.linear_part, self.offset = \ - linear_part, np.asarray(offset) - - if self.linear_part.ndim == 2: - self.dim = self.linear_part.shape[1] - else: - self.dim = self.linear_part.shape[0] - - if rank is None: - self.rank = self.dim - else: - self.rank = rank - - if covariance is None: - covariance = np.identity(self.dim) - self.covariance = covariance - - if mean is None: - mean = np.zeros(self.dim) - self.mean = mean - - def __copy__(self): - r""" - A copy of the constraints. - - Also copies _sqrt_cov, _sqrt_inv if attributes are present. - """ - con = constraints(copy(self.linear_part), - copy(self.offset), - mean=copy(self.mean), - covariance=copy(self.covariance), - rank=self.rank) - if hasattr(self, "_sqrt_cov"): - con._sqrt_cov = self._sqrt_cov.copy() - con._sqrt_inv = self._sqrt_inv.copy() - con._rowspace = self._rowspace.copy() - return con - - def __call__(self, Y, tol=1.e-3): - r""" - Check whether Y satisfies the linear - inequality constraints. - >>> A = np.array([[1., -1.], [1., -1.]]) - >>> B = np.array([1., 1.]) - >>> con = constraints(A, B) - >>> Y = np.array([-1., 1.]) - >>> con(Y) - True - """ - V1 = self.value(Y) - return np.all(V1 < tol * np.fabs(V1).max(0)) - - def value(self, Y): - r""" - Compute $\max(Ay-b)$. - """ - return (self.linear_part.dot(Y) - self.offset).max() - - def bounds(self, direction_of_interest, Y): - r""" - For a realization $Y$ of the random variable $N(\mu,\Sigma)$ - truncated to $C$ specified by `self.constraints` compute - the slice of the inequality constraints in a - given direction $\eta$. - - Parameters - ---------- - - direction_of_interest: np.float - A direction $\eta$ for which we may want to form - selection intervals or a test. - - Y : np.float - A realization of $N(\mu,\Sigma)$ where - $\Sigma$ is `self.covariance`. - - Returns - ------- - - L : np.float - Lower truncation bound. - - Z : np.float - The observed $\eta^TY$ - - U : np.float - Upper truncation bound. - - S : np.float - Standard deviation of $\eta^TY$. - - - """ - return interval_constraints(self.linear_part, - self.offset, - self.covariance, - Y, - direction_of_interest) - - def pivot(self, - direction_of_interest, - Y, - null_value=None, - alternative='greater'): - r""" - For a realization $Y$ of the random variable $N(\mu,\Sigma)$ - truncated to $C$ specified by `self.constraints` compute - the slice of the inequality constraints in a - given direction $\eta$ and test whether - $\eta^T\mu$ is greater then 0, less than 0 or equal to 0. - - Parameters - ---------- - - direction_of_interest: np.float - A direction $\eta$ for which we may want to form - selection intervals or a test. - - Y : np.float - A realization of $N(0,\Sigma)$ where - $\Sigma$ is `self.covariance`. - - alternative : ['greater', 'less', 'twosided'] - What alternative to use. - - Returns - ------- - - P : np.float - $p$-value of corresponding test. - - Notes - ----- - - All of the tests are based on the exact pivot $F$ given - by the truncated Gaussian distribution for the - given direction $\eta$. If the alternative is 'greater' - then we return $1-F$; if it is 'less' we return $F$ - and if it is 'twosided' we return $2 \min(F,1-F)$. - - """ - - if alternative not in ['greater', 'less', 'twosided']: - raise ValueError("alternative should be one of ['greater', 'less', 'twosided']") - L, Z, U, S = self.bounds(direction_of_interest, Y) - - if null_value is None: - meanZ = (direction_of_interest * self.mean).sum() - else: - meanZ = null_value - - P = truncnorm_cdf((Z-meanZ)/S, (L-meanZ)/S, (U-meanZ)/S) - - if alternative == 'greater': - return 1 - P - elif alternative == 'less': - return P - else: - return max(2 * min(P, 1-P), 0) - - def interval(self, direction_of_interest, Y, - alpha=0.05, UMAU=False): - r""" - For a realization $Y$ of the random variable $N(\mu,\Sigma)$ - truncated to $C$ specified by `self.constraints` compute - the slice of the inequality constraints in a - given direction $\eta$ and test whether - $\eta^T\mu$ is greater then 0, less than 0 or equal to 0. - - Parameters - ---------- - - direction_of_interest: np.float - - A direction $\eta$ for which we may want to form - selection intervals or a test. - - Y : np.float - - A realization of $N(0,\Sigma)$ where - $\Sigma$ is `self.covariance`. - - alpha : float - - What level of confidence? - - UMAU : bool - - Use the UMAU intervals? - - Returns - ------- - - [L,U] : selection interval - - - """ - ## THE DOCUMENTATION IS NOT GOOD ! HAS TO BE CHANGED ! - - return selection_interval( \ - self.linear_part, - self.offset, - self.covariance, - Y, - direction_of_interest, - level=1. - alpha, - UMAU=UMAU) - - def covariance_factors(self, force=True): - """ - Factor `self.covariance`, - finding a possibly non-square square-root. - - Parameters - ---------- - - force : bool - If True, force a recomputation of - the covariance. If not, assumes that - covariance has not changed. - - """ - if not hasattr(self, "_sqrt_cov") or force: - - # original matrix is np.dot(U, (D**2 * U).T) - - U, D = np.linalg.svd(self.covariance)[:2] - D = np.sqrt(D[:self.rank]) - U = U[:,:self.rank] - - self._sqrt_cov = U * D[None,:] - self._sqrt_inv = (U / D[None,:]).T - self._rowspace = U - - return self._sqrt_cov, self._sqrt_inv, self._rowspace - - def whiten(self): - """ - - Return a whitened version of constraints in a different - basis, and a change of basis matrix. - - If `self.covariance` is rank deficient, the change-of - basis matrix will not be square. - - Returns - ------- - - inverse_map : callable - - forward_map : callable - - white_con : `constraints` - - """ - sqrt_cov, sqrt_inv = self.covariance_factors()[:2] - - new_A = self.linear_part.dot(sqrt_cov) - den = np.sqrt((new_A**2).sum(1)) - new_b = self.offset - self.linear_part.dot(self.mean) - new_con = constraints(new_A / den[:,None], new_b / den) - - mu = self.mean.copy() - - def inverse_map(Z): - if Z.ndim == 2: - return sqrt_cov.dot(Z) + mu[:,None] - else: - return sqrt_cov.dot(Z) + mu - - forward_map = lambda W: sqrt_inv.dot(W - mu) - - return inverse_map, forward_map, new_con - - def project_rowspace(self, direction): - """ - Project a vector onto rowspace - of the covariance. - """ - rowspace = self.covariance_factors()[-1] - return rowspace.dot(rowspace.T.dot(direction)) - - def solve(self, direction): - """ - Compute the inverse of the covariance - times a direction vector. - """ - sqrt_inv = self.covariance_factors()[1] - return sqrt_inv.T.dot(sqrt_inv.dot(direction)) - - - - - -def stack(*cons): - """ - Combine constraints into a large constaint - by intersection. - - Parameters - ---------- - - cons : [`selection.affine.constraints`_] - A sequence of constraints. - - Returns - ------- - - intersection : `selection.affine.constraints`_ - - Notes - ----- - - Resulting constraint will have mean 0 and covariance $I$. - - """ - ineq, ineq_off = [], [] - for con in cons: - ineq.append(con.linear_part) - ineq_off.append(con.offset) - - intersection = constraints(np.vstack(ineq), - np.hstack(ineq_off)) - return intersection def interval_constraints(support_directions, support_offsets, @@ -780,11 +386,12 @@ def selection_interval(support_directions, L *= sigma**2; L += estimate U *= sigma**2; U += estimate else: - + warnings.warn('assuming data for selection is same as for inference -- using hard selection') + lb = estimate - 20 * sigma ub = estimate + 20 * sigma - if noisy_estimate < lower_bound or noisy_estimate > upper_bound: + if estimate < lower_bound or estimate > upper_bound: warn('Constraints not satisfied: returning [-np.inf, np.inf]') return -np.inf, np.inf, np.nan, np.nan diff --git a/tests/test_inference.py b/tests/test_inference.py index a37851c..1a6b768 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -41,7 +41,7 @@ def run_inference(n, Y = rng.standard_normal(n) * 2 beta = np.zeros(p) if alt: - beta[:s] = rng.standard_normal(s) + beta[rng.choice(p, s, replace=False)] = rng.standard_normal(s) / 2 Y += X @ beta Df = pd.DataFrame({'response':Y}) From 8d771119ec5f524a4ce4b7c4f2bb09825b3b4e76 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 12 Apr 2024 16:44:33 -0700 Subject: [PATCH 27/44] BF: GLM coefs are on original scale, need to transform to unitless for comparison to selection estimates --- glmnet/inference.py | 35 ++++++++++++-------------------- tests/test_inference.py | 45 +++++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 35 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 36ee070..a1b11ad 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -121,9 +121,6 @@ def lasso_inference(glmnet_obj, penfac = np.ones(active_set.shape[0]) signs = np.sign(FL.coef_[active_set]) - # we multiply by weight_sel.sum() due to this factor - # appearing in glmnet objective... - if FL.fit_intercept: # correct the scaling penfac = np.hstack([0, penfac]) @@ -141,25 +138,19 @@ def lasso_inference(glmnet_obj, delta = Q_sel @ delta * weight_sel.sum() noisy_mle = stacked + delta # unitless scale - transform_to_scaled = np.diag(np.hstack([1, 1/scaling_sel])) - transform_to_scaled[0,1:] = -design_sel.centers_/scaling_sel - DEVEL = False - if DEVEL: # compare to coefs and intercept on scales with units - scaled_mle = transform_to_scaled @ noisy_mle - #scaled_mle[1:] /= scaling_sel - #scaled_mle[0] = scaled_mle[0] - (scaled_mle[1:] * design_sel.centers_).sum() - assert( np.allclose(scaled_mle, np.hstack([unreg_sel_GLM.intercept_, unreg_sel_GLM.coef_]))) - penalized = penfac > 0 sel_P = -np.diag(signs[penalized]) @ np.eye(Q_sel.shape[0])[penalized] assert (np.all(sel_P @ noisy_mle < -signs[penalized] * delta[penalized])) - ## TODO: will this handle fit_intercept? + ## the GLM's coef and intercept are on the original scale + ## we transform them here to the (typically) unitless "standardized" scale if FL.fit_intercept: - full_mle = np.hstack([unreg_GLM.state_.intercept, - unreg_GLM.state_.coef]) + intercept = unreg_GLM.state_.intercept + (unreg_GLM.state_.coef * design_full.centers_).sum() + coef = unreg_GLM.state_.coef * scaling_full + full_mle = np.hstack([intercept, + coef]) else: - full_mle = unreg_GLM.state_.coef + full_mle = unreg_GLM.state_.coef * scaling_full ## iterate over coordinates Ls = np.zeros_like(noisy_mle) @@ -168,12 +159,12 @@ def lasso_inference(glmnet_obj, pvals = np.zeros_like(noisy_mle) if FL.fit_intercept: - transform_to_scaled = np.diag(np.hstack([1, 1/scaling_full])) - transform_to_scaled[0,1:] = -design_full.centers_/scaling_full + transform_to_original = np.diag(np.hstack([1, 1/scaling_full])) + transform_to_original[0,1:] = -design_full.centers_/scaling_full else: - transform_to_scaled = np.diag(1/scaling_full) + transform_to_original = np.diag(1/scaling_full) - for i in range(transform_to_scaled.shape[0]): + for i in range(transform_to_original.shape[0]): ## call selection_interval and return L, U, mle, p = selection_interval( support_directions=sel_P, @@ -182,7 +173,7 @@ def lasso_inference(glmnet_obj, Q_full=Q_full, noisy_observation=noisy_mle, observation=full_mle, - direction_of_interest=transform_to_scaled[i], + direction_of_interest=transform_to_original[i], level=level, dispersion=unreg_GLM.dispersion_ ) @@ -350,7 +341,7 @@ def selection_interval(support_directions, upper_bound, _) = interval_constraints(support_directions, support_offsets, - Q_full, + Q_noisy, noisy_observation, direction_of_interest, tol=tol) diff --git a/tests/test_inference.py b/tests/test_inference.py index 1a6b768..1ed06b1 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -31,25 +31,36 @@ def run_inference(n, rng=None, alt=False, s=3, - prop=0.8): + prop=0.8, + family='gaussian'): if rng is None: rng = np.random.default_rng(0) X = rng.standard_normal((n, p)) - D = np.linspace(1, p, p) / p + 1 + D = np.linspace(1, p, p) / p + 0.2 X *= D[None,:] Y = rng.standard_normal(n) * 2 beta = np.zeros(p) if alt: - beta[rng.choice(p, s, replace=False)] = rng.standard_normal(s) / 2 - Y += X @ beta - Df = pd.DataFrame({'response':Y}) - - fam = sm.families.Gaussian() + subs = rng.choice(p, s, replace=False) + beta[subs] = rng.standard_normal(s) + mu = X @ beta + Y += mu + + if family == 'gaussian': + fam = sm.families.Gaussian() + elif family == 'probit': + fam = sm.families.Binomial(link=sm.families.links.Probit()) + Y = (mu + rng.standard_normal(n)) > 0 + else: + raise ValueError('only testing "gaussian" and "probit"') + GN = GLMNet(response_id='response', family=fam, fit_intercept=fit_intercept, standardize=standardize) + Df = pd.DataFrame({'response':Y}) + GN.fit(X, Df) m = int(prop*n) @@ -65,9 +76,13 @@ def run_inference(n, if fit_intercept: X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) - targets = np.linalg.pinv(X_sel) @ X @ beta + targets = np.linalg.pinv(X_sel) @ mu df['target'] = targets + if alt: + if family == 'probit' and not set(subs).issubset(active_set): + df['target'] *= np.nan + return df @@ -76,7 +91,8 @@ def main_null(fit_intercept=True, n=2000, p=50, ntrial=500, - rng=None): + rng=None, + family='gaussian'): ncover = 0 nsel = 0 @@ -89,7 +105,8 @@ def main_null(fit_intercept=True, p, fit_intercept, standardize, - rng=rng) + rng=rng, + family=family) dfs.append(df) all_df = pd.concat(dfs) ncover += ((all_df['lower'] < 0) & (all_df['upper'] > 0)).sum() @@ -100,10 +117,11 @@ def main_null(fit_intercept=True, def main_alt(fit_intercept=True, standardize=True, - n=2000, + n=200, p=50, ntrial=500, - rng=None): + rng=None, + family='gaussian'): ncover = 0 nsel = 0 @@ -117,9 +135,10 @@ def main_alt(fit_intercept=True, fit_intercept, standardize, rng=rng, + family=family, alt=True) dfs.append(df) - all_df = pd.concat(dfs) + all_df = pd.concat(dfs).dropna() ncover += ((all_df['lower'] < all_df['target']) & (all_df['upper'] > all_df['target'])).sum() nsel += all_df.shape[0] From a5f466916d511c9a08e7bdf445fc8695cbad7b50 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 2 May 2024 16:06:03 -0700 Subject: [PATCH 28/44] BF: misspelled penalty_factor --- glmnet/inference.py | 5 +++-- tests/test_inference.py | 13 +++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index a1b11ad..1804e97 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -22,6 +22,7 @@ def fixed_lambda_estimator(glmnet_obj, lambda_val): + check_is_fitted(glmnet_obj, ["coefs_", "feature_names_in_"]) estimator = glmnet_obj.regularized_estimator( lambda_val=lambda_val, @@ -115,10 +116,10 @@ def lasso_inference(glmnet_obj, if not FL.fit_intercept: Q_full = Q_full[1:,1:] - if hasattr(FL, 'penalty_factors'): - penfac = FL.penalty_factors[active_set] + penfac = FL.penalty_factor[active_set] else: penfac = np.ones(active_set.shape[0]) + signs = np.sign(FL.coef_[active_set]) if FL.fit_intercept: diff --git a/tests/test_inference.py b/tests/test_inference.py index 1ed06b1..cb8856e 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -32,6 +32,7 @@ def run_inference(n, alt=False, s=3, prop=0.8, + penalty_facs=True, family='gaussian'): if rng is None: @@ -55,17 +56,25 @@ def run_inference(n, else: raise ValueError('only testing "gaussian" and "probit"') + if penalty_facs: + penalty_factor = np.ones(X.shape[1]) + penalty_factor[:10] = 0.5 + else: + penalty_factor = None + GN = GLMNet(response_id='response', family=fam, fit_intercept=fit_intercept, - standardize=standardize) + standardize=standardize, + penalty_factor=penalty_factor) + Df = pd.DataFrame({'response':Y}) GN.fit(X, Df) m = int(prop*n) df = lasso_inference(GN, - GN.lambda_values_[10], + GN.lambda_values_[min(10, GN.lambda_values_.shape[0]-1)], (X[:m], Df.iloc[:m], None), (X, Df, None)) if fit_intercept: From 207c189e9183b3c73a2f2c6568d51df24c837bb1 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 13 Jun 2024 11:31:52 -0700 Subject: [PATCH 29/44] WIP: trying to get active and nonactive constraint as dataclass --- glmnet/glmnet.py | 25 ++++ glmnet/inference.py | 305 ++++++++++++++++++++++------------------ tests/test_inference.py | 10 +- 3 files changed, 196 insertions(+), 144 deletions(-) diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index e51236a..fdd3ff1 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -482,3 +482,28 @@ def get_GLM(self): weight_id=self.weight_id, response_id=self.response_id) + def get_fixed_lambda(self, + lambda_val): + + check_is_fitted(self, ["coefs_", "feature_names_in_"]) + + estimator = self.regularized_estimator( + lambda_val=lambda_val, + family=self.family, + alpha=self.alpha, + penalty_factor=self.penalty_factor, + lower_limits=self.lower_limits, + upper_limits=self.upper_limits, + fit_intercept=self.fit_intercept, + standardize=self.standardize, + control=self.control, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id, + exclude=self.exclude + ) + + coefs, intercepts = self.interpolate_coefs([lambda_val]) + cls = self.state_.__class__ + state = cls(coefs[0], intercepts[0]) + return estimator, state diff --git a/glmnet/inference.py b/glmnet/inference.py index 1804e97..69b0a6d 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -7,7 +7,9 @@ import warnings from warnings import warn from copy import copy + import numpy as np +import scipy.sparse import pandas as pd from scipy.stats import norm as normal_dbn import statsmodels.api as sm @@ -15,35 +17,112 @@ import mpmath as mp mp.dps = 80 -from sklearn.utils.validation import check_is_fitted -from sklearn.base import clone - from .base import _get_design -def fixed_lambda_estimator(glmnet_obj, - lambda_val): - - check_is_fitted(glmnet_obj, ["coefs_", "feature_names_in_"]) - estimator = glmnet_obj.regularized_estimator( - lambda_val=lambda_val, - family=glmnet_obj.family, - alpha=glmnet_obj.alpha, - penalty_factor=glmnet_obj.penalty_factor, - lower_limits=glmnet_obj.lower_limits, - upper_limits=glmnet_obj.upper_limits, - fit_intercept=glmnet_obj.fit_intercept, - standardize=glmnet_obj.standardize, - control=glmnet_obj.control, - offset_id=glmnet_obj.offset_id, - weight_id=glmnet_obj.weight_id, - response_id=glmnet_obj.response_id, - exclude=glmnet_obj.exclude - ) - - coefs, intercepts = glmnet_obj.interpolate_coefs([lambda_val]) - cls = glmnet_obj.state_.__class__ - state = cls(coefs[0], intercepts[0]) - return estimator, state +from dataclasses import dataclass +from typing import Optional + +@dataclass +class AffineConstraint(object): + + linear: np.ndarray + offset: np.ndarray + observed: np.ndarray + cov_selected: Optional[np.ndarray] = None # covariance with coefs in selected model + + def __post_init__(self): + + if not all (self.linear @ self.observed - self.offset <= 0): + raise ValueError('constraint not satisfied for observed data') + + def interval_constraints(self, + target, + gamma, + tol = 1.e-4): + r""" + Given an affine constraint $\{z:Az \leq b \leq \}$ (elementwise) + specified with $A$ as `support_directions` and $b$ as + `support_offset`, a new direction of interest $\eta$, and + an `observed_data` is Gaussian vector $Z \sim N(\mu,\Sigma)$ + with `covariance` matrix $\Sigma$, this + function returns $\eta^TZ$ as well as an interval + bounding this value. + + The interval constructed is such that the endpoints are + independent of $\eta^TZ$, hence the $p$-value + of `Kac Rice`_ + can be used to form an exact pivot. + + Parameters + ---------- + + support_directions : np.float + Matrix specifying constraint, $A$. + + support_offsets : np.float + Offset in constraint, $b$. + + covariance : np.float + Covariance matrix of `observed_data`. + + observed_data : np.float + Observations. + + direction_of_interest : np.float + Direction in which we're interested for the + contrast. + + tol : float + Relative tolerance parameter for deciding + sign of $Az-b$. + + Returns + ------- + + lower_bound : float + + observed : float + + upper_bound : float + + sigma : float + + """ + + # shorthand + A, b, D = (self.linear, + self.offset, + self.observed) + + N = D - gamma * target + + U = A.dot(N) - b + V = A @ gamma + + # Inequalities are now U + V @ target <= 0 + + # adding the zero_coords in the denominator ensures that + # there are no divide-by-zero errors in RHS + # these coords are never used in upper_bound or lower_bound + + zero_coords = V == 0 + RHS = -U / (V + zero_coords) + RHS[zero_coords] = np.nan + pos_coords = V > tol * np.fabs(V).max() + neg_coords = V < -tol * np.fabs(V).max() + + if np.any(pos_coords): + upper_bound = RHS[pos_coords].min() + else: + upper_bound = np.inf + + if np.any(neg_coords): + lower_bound = RHS[neg_coords].max() + else: + lower_bound = -np.inf + + print(lower_bound, upper_bound) + return lower_bound, upper_bound def lasso_inference(glmnet_obj, lambda_val, @@ -51,7 +130,7 @@ def lasso_inference(glmnet_obj, full_data, level=.9): - fixed_lambda, warm_state = fixed_lambda_estimator(glmnet_obj, lambda_val) + fixed_lambda, warm_state = glmnet_obj.get_fixed_lambda(lambda_val) X_sel, Y_sel, weight_sel = selection_data X_sel, Y_sel, _, _, weight_sel = fixed_lambda.get_data_arrays(X_sel, Y_sel) @@ -64,12 +143,13 @@ def lasso_inference(glmnet_obj, warm_state=warm_state) FL = fixed_lambda # shorthand - + if (not np.all(FL.upper_limits >= FL.control.big) or not np.all(FL.lower_limits <= -FL.control.big)): raise NotImplementedError('upper/lower limits coming soon') active_set = np.nonzero(FL.coef_ != 0)[0] + inactive_set = np.nonzero(FL.coef_ == 0)[0] # fit unpenalized model on selection data @@ -89,6 +169,7 @@ def lasso_inference(glmnet_obj, scaling_sel = design_sel.scaling_ P_sel = design_sel.quadratic_form(unreg_sel_GLM._information, transformed=True) + Q_sel = np.linalg.inv(P_sel) if not FL.fit_intercept: Q_sel = Q_sel[1:,1:] @@ -165,11 +246,56 @@ def lasso_inference(glmnet_obj, else: transform_to_original = np.diag(1/scaling_full) + linear = sel_P + offset = -signs[penalized] * delta[penalized] + active_con = AffineConstraint(linear=linear, + offset=offset, + observed=noisy_mle) + cov_selected = Q_sel + + inactive = True + if inactive: + pf = FL.regularizer_.penalty_factor + scale = 1 / (pf + (pf <= 0)) + + logl_score = FL.state_.logl_score(FL._family, + Y_sel, + weight_sel / weight_sel.sum()) + + # X_{-E}'(Y-X\hat{\beta}_E) -- \hat{\beta}_E is the LASSO soln, not GLM ! + # this is (2nd order Taylor series sense) equivalent to + # X_{-E}'(Y-X\bar{\beta}_E) + X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E + # with \bar{\beta}_E the GLM soln + + score_ = (FL.design_.T @ logl_score)[1:] + + score_ *= scale + + # we now know that `score_` is bounded by \pm 1 + + I = scipy.sparse.eye(score_.shape[0]) + L = scipy.sparse.vstack([I, -I]) + O = np.ones(L.shape[0]) + + # X'WX_E + # here, X is the effective matrix implied by scale / intercept choices + P_inactive = FL.design_.quadratic_form(unreg_sel_GLM._information, + transformed=True, + columns=active_set)[1:] + if not FL.fit_intercept: + P_inactive = P_inactive[:,1:] + + # (\lambda_{-E})^{-1} X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E + I_inactive = (scale[:,None] * (P_inactive @ delta))[inactive_set] + + inactive_con = AffineConstraint(linear=L, + offset=O, + observed=score_) + for i in range(transform_to_original.shape[0]): ## call selection_interval and return L, U, mle, p = selection_interval( - support_directions=sel_P, - support_offsets=-signs[penalized] * delta[penalized], + active_con=active_con, Q_noisy=Q_sel, Q_full=Q_full, noisy_observation=noisy_mle, @@ -189,103 +315,7 @@ def lasso_inference(glmnet_obj, return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) -def interval_constraints(support_directions, - support_offsets, - covariance, - observed_data, - direction_of_interest, - tol = 1.e-4): - r""" - Given an affine constraint $\{z:Az \leq b \leq \}$ (elementwise) - specified with $A$ as `support_directions` and $b$ as - `support_offset`, a new direction of interest $\eta$, and - an `observed_data` is Gaussian vector $Z \sim N(\mu,\Sigma)$ - with `covariance` matrix $\Sigma$, this - function returns $\eta^TZ$ as well as an interval - bounding this value. - - The interval constructed is such that the endpoints are - independent of $\eta^TZ$, hence the $p$-value - of `Kac Rice`_ - can be used to form an exact pivot. - - Parameters - ---------- - - support_directions : np.float - Matrix specifying constraint, $A$. - - support_offsets : np.float - Offset in constraint, $b$. - - covariance : np.float - Covariance matrix of `observed_data`. - - observed_data : np.float - Observations. - - direction_of_interest : np.float - Direction in which we're interested for the - contrast. - - tol : float - Relative tolerance parameter for deciding - sign of $Az-b$. - - Returns - ------- - - lower_bound : float - - observed : float - - upper_bound : float - - sigma : float - - """ - - # shorthand - A, b, S, X, w = (support_directions, - support_offsets, - covariance, - observed_data, - direction_of_interest) - - U = A.dot(X) - b - - if not np.all(U < tol * np.fabs(U).max()): - stop - warn('constraints not satisfied: %s' % repr(U)) - - Sw = S.dot(w) - sigma = np.sqrt((w*Sw).sum()) - alpha = A.dot(Sw) / sigma**2 - V = (w*X).sum() # \eta^TZ - - # adding the zero_coords in the denominator ensures that - # there are no divide-by-zero errors in RHS - # these coords are never used in upper_bound or lower_bound - - zero_coords = alpha == 0 - RHS = (-U + V * alpha) / (alpha + zero_coords) - RHS[zero_coords] = np.nan - - pos_coords = alpha > tol * np.fabs(alpha).max() - if np.any(pos_coords): - upper_bound = RHS[pos_coords].min() - else: - upper_bound = np.inf - neg_coords = alpha < -tol * np.fabs(alpha).max() - if np.any(neg_coords): - lower_bound = RHS[neg_coords].max() - else: - lower_bound = -np.inf - - return lower_bound, V, upper_bound, sigma - -def selection_interval(support_directions, - support_offsets, +def selection_interval(active_con, Q_noisy, Q_full, noisy_observation, @@ -337,24 +367,21 @@ def selection_interval(support_directions, """ + noisy_var = direction_of_interest.T @ Q_noisy @ direction_of_interest + noisy_estimate = (direction_of_interest * noisy_observation).sum() + estimate = (direction_of_interest * observation).sum() + (lower_bound, - V, - upper_bound, - _) = interval_constraints(support_directions, - support_offsets, - Q_noisy, - noisy_observation, - direction_of_interest, - tol=tol) + upper_bound) = active_con.interval_constraints(noisy_estimate, + Q_noisy @ direction_of_interest / unnormalized_var, + tol=tol) ## lars path lockhart tibs^2 taylor paper sigma = np.sqrt(direction_of_interest.T @ Q_full @ direction_of_interest * dispersion) ## sqrt(alpha) is just sigma_noisy - sigma_full - noisy_sigma = np.sqrt(direction_of_interest.T @ Q_noisy @ direction_of_interest * dispersion) + noisy_sigma = np.sqrt(noisy_var * dispersion) smoothing_sigma = np.sqrt(max(noisy_sigma**2 - sigma**2, 0)) - estimate = (direction_of_interest * observation).sum() - noisy_estimate = (direction_of_interest * noisy_observation).sum() if smoothing_sigma > 1e-6 * sigma: diff --git a/tests/test_inference.py b/tests/test_inference.py index cb8856e..88d1545 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -5,12 +5,12 @@ import statsmodels.api as sm from glmnet import GLMNet -from glmnet.inference import (fixed_lambda_estimator, - lasso_inference) -@pytest.mark.parametrize('n', [500]) -@pytest.mark.parametrize('p', [103]) -@pytest.mark.parametrize('fit_intercept', [True, False]) +from glmnet.inference import lasso_inference + @pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('p', [103]) +@pytest.mark.parametrize('n', [500]) def test_inference(n, p, fit_intercept, From b7b0c52331f3210db3afdbb260bcde7408e05e95 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 13 Jun 2024 11:44:40 -0700 Subject: [PATCH 30/44] adding auto test, BF: inactive bounded by lambda_val --- glmnet/inference.py | 6 +++--- tests/test_inference.py | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 69b0a6d..b0b1906 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -271,11 +271,11 @@ def lasso_inference(glmnet_obj, score_ *= scale - # we now know that `score_` is bounded by \pm 1 + # we now know that `score_` is bounded by \pm lambda_val I = scipy.sparse.eye(score_.shape[0]) L = scipy.sparse.vstack([I, -I]) - O = np.ones(L.shape[0]) + O = np.ones(L.shape[0]) * lambda_val # X'WX_E # here, X is the effective matrix implied by scale / intercept choices @@ -373,7 +373,7 @@ def selection_interval(active_con, (lower_bound, upper_bound) = active_con.interval_constraints(noisy_estimate, - Q_noisy @ direction_of_interest / unnormalized_var, + Q_noisy @ direction_of_interest / noisy_var, tol=tol) ## lars path lockhart tibs^2 taylor paper diff --git a/tests/test_inference.py b/tests/test_inference.py index 88d1545..422b4de 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -24,6 +24,31 @@ def test_inference(n, fit_intercept, standardize) +def test_Auto(): + + df = sm.datasets.get_rdataset('Auto', package='ISLR').data + df = df.set_index('name') + X = np.asarray(df.drop(columns=['mpg'])) + y = np.asarray(df['mpg']) + + GN = GLMNet(response_id='response', + fit_intercept=True, + standardize=True) + + Df = pd.DataFrame({'response':y}) + + GN.fit(X, Df) + prop = 0.8 + n = df.shape[0] + m = int(prop*n) + + df = lasso_inference(GN, + GN.lambda_values_[min(10, GN.lambda_values_.shape[0]-1)], + (X[:m], Df.iloc[:m], None), + (X, Df, None)) + print(df) + + def run_inference(n, p, fit_intercept, From ad282b1fced1b2ac8dc6d45fcc25616f42e2ff8d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Tue, 18 Jun 2024 14:37:40 -0700 Subject: [PATCH 31/44] WIP: cleaning up affine code --- glmnet/inference.py | 52 ++++++++++++++++++++--------------------- tests/test_inference.py | 10 ++++---- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index b0b1906..51925a3 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -28,7 +28,6 @@ class AffineConstraint(object): linear: np.ndarray offset: np.ndarray observed: np.ndarray - cov_selected: Optional[np.ndarray] = None # covariance with coefs in selected model def __post_init__(self): @@ -166,10 +165,10 @@ def lasso_inference(glmnet_obj, weight_sel, standardize=glmnet_obj.standardize, intercept=glmnet_obj.fit_intercept) - scaling_sel = design_sel.scaling_ P_sel = design_sel.quadratic_form(unreg_sel_GLM._information, transformed=True) + scaling_sel = design_sel.scaling_ Q_sel = np.linalg.inv(P_sel) if not FL.fit_intercept: Q_sel = Q_sel[1:,1:] @@ -196,7 +195,6 @@ def lasso_inference(glmnet_obj, Q_full = np.linalg.inv(P_full) if not FL.fit_intercept: Q_full = Q_full[1:,1:] - penfac = FL.penalty_factor[active_set] else: penfac = np.ones(active_set.shape[0]) @@ -209,20 +207,23 @@ def lasso_inference(glmnet_obj, signs = np.hstack([0, signs]) stacked = np.hstack([FL.state_.intercept, FL.state_.coef[active_set]]) + noisy_mle = np.hstack([unreg_sel_GLM.state_.intercept, + unreg_sel_GLM.state_.coef]) else: stacked = FL.state_.coef[active_set] + noisy_mle = unreg_sel_GLM.state_.coef - delta = lambda_val * penfac * signs + # delta = lambda_val * penfac * signs - # remember loss of glmnet is normalized by sum of weights - # when taking newton step adjust by weight_sel.sum() + # # remember loss of glmnet is normalized by sum of weights + # # when taking newton step adjust by weight_sel.sum() - delta = Q_sel @ delta * weight_sel.sum() - noisy_mle = stacked + delta # unitless scale + # delta = Q_sel @ delta * weight_sel.sum() + # noisy_mle = stacked + delta # unitless scale penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(Q_sel.shape[0])[penalized] - assert (np.all(sel_P @ noisy_mle < -signs[penalized] * delta[penalized])) + sel_P = -np.diag(signs[penalized]) @ np.eye(Q_full.shape[0])[penalized] +# assert (np.all(sel_P @ noisy_mle < -signs[penalized] * delta[penalized])) ## the GLM's coef and intercept are on the original scale ## we transform them here to the (typically) unitless "standardized" scale @@ -235,10 +236,10 @@ def lasso_inference(glmnet_obj, full_mle = unreg_GLM.state_.coef * scaling_full ## iterate over coordinates - Ls = np.zeros_like(noisy_mle) - Us = np.zeros_like(noisy_mle) - mles = np.zeros_like(noisy_mle) - pvals = np.zeros_like(noisy_mle) + Ls = np.zeros_like(stacked) + Us = np.zeros_like(stacked) + mles = np.zeros_like(stacked) + pvals = np.zeros_like(stacked) if FL.fit_intercept: transform_to_original = np.diag(np.hstack([1, 1/scaling_full])) @@ -247,12 +248,10 @@ def lasso_inference(glmnet_obj, transform_to_original = np.diag(1/scaling_full) linear = sel_P - offset = -signs[penalized] * delta[penalized] + offset = np.zeros(sel_P.shape[0]) # -signs[penalized] * delta[penalized] active_con = AffineConstraint(linear=linear, offset=offset, - observed=noisy_mle) - cov_selected = Q_sel - + observed=stacked) inactive = True if inactive: pf = FL.regularizer_.penalty_factor @@ -268,9 +267,8 @@ def lasso_inference(glmnet_obj, # with \bar{\beta}_E the GLM soln score_ = (FL.design_.T @ logl_score)[1:] - score_ *= scale - + score_ = score_[inactive_set] # we now know that `score_` is bounded by \pm lambda_val I = scipy.sparse.eye(score_.shape[0]) @@ -279,14 +277,14 @@ def lasso_inference(glmnet_obj, # X'WX_E # here, X is the effective matrix implied by scale / intercept choices - P_inactive = FL.design_.quadratic_form(unreg_sel_GLM._information, - transformed=True, - columns=active_set)[1:] - if not FL.fit_intercept: - P_inactive = P_inactive[:,1:] + # P_inactive = FL.design_.quadratic_form(unreg_sel_GLM._information, + # transformed=True, + # columns=active_set)[1:] + # if not FL.fit_intercept: + # P_inactive = P_inactive[:,1:] - # (\lambda_{-E})^{-1} X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E - I_inactive = (scale[:,None] * (P_inactive @ delta))[inactive_set] + # # (\lambda_{-E})^{-1} X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E + # I_inactive = (scale[:,None] * (P_inactive @ delta))[inactive_set] inactive_con = AffineConstraint(linear=L, offset=O, diff --git a/tests/test_inference.py b/tests/test_inference.py index 422b4de..89a2907 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -56,7 +56,7 @@ def run_inference(n, rng=None, alt=False, s=3, - prop=0.8, + prop=0.75, penalty_facs=True, family='gaussian'): @@ -122,8 +122,8 @@ def run_inference(n, def main_null(fit_intercept=True, standardize=True, - n=2000, - p=50, + n=500, + p=75, ntrial=500, rng=None, family='gaussian'): @@ -151,8 +151,8 @@ def main_null(fit_intercept=True, def main_alt(fit_intercept=True, standardize=True, - n=200, - p=50, + n=1000, + p=75, ntrial=500, rng=None, family='gaussian'): From 542582d5fc0ffa807bb8e4cd8e067a65164a506b Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 24 Jun 2024 11:32:13 -0400 Subject: [PATCH 32/44] added scale/unscale operators to convert between raw and scaled coefs within Design --- glmnet/base.py | 87 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_design.py | 35 +++++++++++++++++- 2 files changed, 121 insertions(+), 1 deletion(-) diff --git a/glmnet/base.py b/glmnet/base.py index 2517071..498a9e2 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -55,6 +55,10 @@ def __post_init__(self, standardize, intercept): self.X = (self.X - self.centers_[None,:]) / self.scaling_[None,:] self.X = np.asfortranarray(self.X) + self.unscaler_ = UnscaleOperator(centers=self.centers_, + scaling=self.scaling_) + self.scaler_ = ScaleOperator(centers=self.centers_, + scaling=self.scaling_) # LinearOperator API def _matvec(self, x): @@ -96,6 +100,7 @@ def _rmatvec(self, r): def quadratic_form(self, G=None, columns=None, + intercept=True, transformed=False): ''' if transformed is False: compute @@ -188,6 +193,88 @@ def quadratic_form(self, return Q + def scaled_to_raw(self, state): + """ + Take a "scaled" (intercept, coef) (these are the params + used by glmnet) and return a (intercept, coef) on "raw" scale. + """ + unscaled = self.unscaler_ @ state._stack + coef = unscaled[1:] + intercept = unscaled[0] + klass = state.__class__ + return klass(coef=coef, + intercept=intercept) + +@dataclass +class UnscaleOperator(LinearOperator): + + scaling : np.ndarray + centers: np.ndarray + + def __post_init__(self): + ncoef = self.scaling.shape[0] + self.shape = (ncoef+1,)*2 + self.dtype = float + + # LinearOperator API + def _matvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[1:] = coef / self.scaling + result[0] = intercept - (result[1:] * self.centers).sum() + + return result + + def _rmatvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[0] = intercept + result[1:] = coef / self.scaling + result[1:] -= intercept * self.centers / self.scaling + + return result + +@dataclass +class ScaleOperator(LinearOperator): + + scaling : np.ndarray + centers: np.ndarray + + def __post_init__(self): + ncoef = self.scaling.shape[0] + self.shape = (ncoef+1,)*2 + self.dtype = float + + # LinearOperator API + def _matvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[1:] = coef * self.scaling + result[0] = intercept + (coef * self.centers).sum() + + return result + + def _rmatvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[0] = intercept + result[1:] = coef * self.scaling + result[1:] += intercept * self.centers + + return result + @add_dataclass_docstring @dataclass class DiagonalOperator(LinearOperator): diff --git a/tests/test_design.py b/tests/test_design.py index 98357f2..df9b8bd 100644 --- a/tests/test_design.py +++ b/tests/test_design.py @@ -141,4 +141,37 @@ def test_quadratic_form(X, weights, standardize, intercept, gls): assert np.allclose(Q_s, Q) # sparse and dense agree - + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_unscaler(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + M = D.unscaler_ @ np.identity(6) + MT = D.unscaler_.T @ np.identity(6) + assert np.allclose(MT, M.T) + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_scaler(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + M = D.scaler_ @ np.identity(6) + MT = D.scaler_.T @ np.identity(6) + assert np.allclose(MT, M.T) + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_scaler_unscaler_inv(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + M = D.unscaler_ @ np.identity(6) + MI = D.scaler_ @ np.identity(6) + assert np.allclose(M @ MI, np.identity(6)) + From 626c62d82be537516fcfcbc80d023770608993c4 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 24 Jun 2024 11:37:39 -0400 Subject: [PATCH 33/44] adding raw_to_scaled method --- glmnet/base.py | 12 ++++++++++++ tests/test_design.py | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/glmnet/base.py b/glmnet/base.py index 498a9e2..0a3c174 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -205,6 +205,18 @@ def scaled_to_raw(self, state): return klass(coef=coef, intercept=intercept) + def raw_to_scaled(self, state): + """ + Take a "raw" (intercept, coef) and return a (intercept, coef) + on "scaled" scale (i.e. the scale GLMnet uses in its objective). + """ + unscaled = self.scaler_ @ state._stack + coef = unscaled[1:] + intercept = unscaled[0] + klass = state.__class__ + return klass(coef=coef, + intercept=intercept) + @dataclass class UnscaleOperator(LinearOperator): diff --git a/tests/test_design.py b/tests/test_design.py index df9b8bd..2c7dada 100644 --- a/tests/test_design.py +++ b/tests/test_design.py @@ -4,6 +4,7 @@ import statsmodels.api as sm from glmnet.base import Design +from glmnet.glm import GLMState rng = np.random.default_rng(0) n, p, q = 100, 5, 3 @@ -174,4 +175,28 @@ def test_scaler_unscaler_inv(intercept, M = D.unscaler_ @ np.identity(6) MI = D.scaler_ @ np.identity(6) assert np.allclose(M @ MI, np.identity(6)) + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_scaler_to_raw_conversion(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + raw_state = GLMState(coef=np.random.standard_normal(5), + intercept=np.random.standard_normal()) + scaled_state = D.raw_to_scaled(raw_state) + raw_state_roundtrip = D.scaled_to_raw(scaled_state) + + assert np.allclose(raw_state._stack, + raw_state_roundtrip._stack) + + scaled_state = GLMState(coef=np.random.standard_normal(5), + intercept=np.random.standard_normal()) + raw_state = D.scaled_to_raw(scaled_state) + scaled_state_roundtrip = D.raw_to_scaled(raw_state) + + assert np.allclose(scaled_state._stack, + scaled_state_roundtrip._stack) + From 30ff33b6ec7d58b0668c8131b3aa5105ab2f7b52 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 24 Jun 2024 12:25:13 -0400 Subject: [PATCH 34/44] inference rework, using the scale/unscale operators --- glmnet/base.py | 1 - glmnet/glm.py | 8 ++--- glmnet/inference.py | 85 +++++++++++++++++++-------------------------- 3 files changed, 38 insertions(+), 56 deletions(-) diff --git a/glmnet/base.py b/glmnet/base.py index 0a3c174..18708c0 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -100,7 +100,6 @@ def _rmatvec(self, r): def quadratic_form(self, G=None, columns=None, - intercept=True, transformed=False): ''' if transformed is False: compute diff --git a/glmnet/glm.py b/glmnet/glm.py index 1d28c94..dd4470f 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -611,11 +611,9 @@ def score(self, X, y, sample_weight=None): '''.format(**_docstrings).strip() def _set_coef_intercept(self, state): - self.coef_ = state.coef.copy() # this makes a copy -- important to make this copy because `state.coef` is persistent - if hasattr(self, 'standardize') and self.standardize: - self.scaling_ = self.design_.scaling_ - self.coef_ /= self.scaling_ - self.intercept_ = state.intercept - (self.coef_ * self.design_.centers_).sum() + raw_state = self.design_.scaled_to_raw(state) + self.coef_ = raw_state.coef + self.intercept_ = raw_state.intercept GLMBase.__doc__ = ''' Base class to fit a Generalized Linear Model (GLM). Base class for `GLMNet`. diff --git a/glmnet/inference.py b/glmnet/inference.py index 51925a3..9324f7c 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -120,7 +120,6 @@ def interval_constraints(self, else: lower_bound = -np.inf - print(lower_bound, upper_bound) return lower_bound, upper_bound def lasso_inference(glmnet_obj, @@ -159,7 +158,7 @@ def lasso_inference(glmnet_obj, Y_sel, sample_weight=weight_sel) - # quadratic approximation up to scaling and a factor of weight_sel.sum() + # # quadratic approximation up to scaling and a factor of weight_sel.sum() design_sel = _get_design(X_sel[:,active_set], weight_sel, @@ -168,11 +167,8 @@ def lasso_inference(glmnet_obj, P_sel = design_sel.quadratic_form(unreg_sel_GLM._information, transformed=True) - scaling_sel = design_sel.scaling_ Q_sel = np.linalg.inv(P_sel) - if not FL.fit_intercept: - Q_sel = Q_sel[1:,1:] - + # fit unpenalized model on full data X_full, Y_full, weight_full = full_data @@ -185,33 +181,32 @@ def lasso_inference(glmnet_obj, # quadratic approximation - design_full = _get_design(X_full[:,active_set], - weight_full, - standardize=glmnet_obj.standardize, - intercept=glmnet_obj.fit_intercept) - scaling_full = design_full.scaling_ - P_full = design_full.quadratic_form(unreg_GLM._information, - transformed=True) + D = _get_design(X_full[:,active_set], + weight_full, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + P_full = D.quadratic_form(unreg_GLM._information, + transformed=True) Q_full = np.linalg.inv(P_full) if not FL.fit_intercept: - Q_full = Q_full[1:,1:] penfac = FL.penalty_factor[active_set] + Q_full = Q_full[1:,1:] + Q_sel = Q_sel[1:,1:] else: penfac = np.ones(active_set.shape[0]) signs = np.sign(FL.coef_[active_set]) + noisy_mle = D.raw_to_scaled(unreg_sel_GLM.state_) if FL.fit_intercept: - # correct the scaling penfac = np.hstack([0, penfac]) signs = np.hstack([0, signs]) stacked = np.hstack([FL.state_.intercept, FL.state_.coef[active_set]]) - noisy_mle = np.hstack([unreg_sel_GLM.state_.intercept, - unreg_sel_GLM.state_.coef]) + noisy_mle = noisy_mle._stack else: stacked = FL.state_.coef[active_set] - noisy_mle = unreg_sel_GLM.state_.coef + noisy_mle = noisy_mle.coef # delta = lambda_val * penfac * signs @@ -227,13 +222,11 @@ def lasso_inference(glmnet_obj, ## the GLM's coef and intercept are on the original scale ## we transform them here to the (typically) unitless "standardized" scale + full_mle = D.raw_to_scaled(unreg_GLM.state_) if FL.fit_intercept: - intercept = unreg_GLM.state_.intercept + (unreg_GLM.state_.coef * design_full.centers_).sum() - coef = unreg_GLM.state_.coef * scaling_full - full_mle = np.hstack([intercept, - coef]) + full_mle = full_mle._stack else: - full_mle = unreg_GLM.state_.coef * scaling_full + full_mle = full_mle.coef ## iterate over coordinates Ls = np.zeros_like(stacked) @@ -241,11 +234,10 @@ def lasso_inference(glmnet_obj, mles = np.zeros_like(stacked) pvals = np.zeros_like(stacked) - if FL.fit_intercept: - transform_to_original = np.diag(np.hstack([1, 1/scaling_full])) - transform_to_original[0,1:] = -design_full.centers_/scaling_full - else: - transform_to_original = np.diag(1/scaling_full) + transform_to_raw = (D.unscaler_ @ + np.identity(D.shape[1])) + if not FL.fit_intercept: + transform_to_raw = transform_to_raw[1:,1:] linear = sel_P offset = np.zeros(sel_P.shape[0]) # -signs[penalized] * delta[penalized] @@ -275,22 +267,11 @@ def lasso_inference(glmnet_obj, L = scipy.sparse.vstack([I, -I]) O = np.ones(L.shape[0]) * lambda_val - # X'WX_E - # here, X is the effective matrix implied by scale / intercept choices - # P_inactive = FL.design_.quadratic_form(unreg_sel_GLM._information, - # transformed=True, - # columns=active_set)[1:] - # if not FL.fit_intercept: - # P_inactive = P_inactive[:,1:] - - # # (\lambda_{-E})^{-1} X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E - # I_inactive = (scale[:,None] * (P_inactive @ delta))[inactive_set] - inactive_con = AffineConstraint(linear=L, offset=O, observed=score_) - for i in range(transform_to_original.shape[0]): + for i in range(transform_to_raw.shape[0]): ## call selection_interval and return L, U, mle, p = selection_interval( active_con=active_con, @@ -298,7 +279,7 @@ def lasso_inference(glmnet_obj, Q_full=Q_full, noisy_observation=noisy_mle, observation=full_mle, - direction_of_interest=transform_to_original[i], + direction_of_interest=transform_to_raw[i], level=level, dispersion=unreg_GLM.dispersion_ ) @@ -366,20 +347,17 @@ def selection_interval(active_con, """ noisy_var = direction_of_interest.T @ Q_noisy @ direction_of_interest + full_var = direction_of_interest.T @ Q_full @ direction_of_interest noisy_estimate = (direction_of_interest * noisy_observation).sum() estimate = (direction_of_interest * observation).sum() (lower_bound, upper_bound) = active_con.interval_constraints(noisy_estimate, - Q_noisy @ direction_of_interest / noisy_var, + Q_full @ direction_of_interest / full_var, tol=tol) - ## lars path lockhart tibs^2 taylor paper - sigma = np.sqrt(direction_of_interest.T @ Q_full @ direction_of_interest * dispersion) - ## sqrt(alpha) is just sigma_noisy - sigma_full - noisy_sigma = np.sqrt(noisy_var * dispersion) - smoothing_sigma = np.sqrt(max(noisy_sigma**2 - sigma**2, 0)) - + sigma = np.sqrt(full_var * dispersion) + smoothing_sigma = np.sqrt(max(noisy_var - full_var, 0) * dispersion) if smoothing_sigma > 1e-6 * sigma: @@ -472,8 +450,15 @@ def norm_interval(lower, upper): """ #cdf = normal_dbn.cdf cdf = mp.ncdf - if lower > 0 and upper > 0: - return cdf(-lower) - cdf(-upper) + if lower > 0: + if lower < 4: + return cdf(-lower) - cdf(-upper) + else: + return np.exp(-np.fabs(lower*(upper-lower))) + elif upper < 0: + if upper < -4: + return np.exp(-np.fabs(upper*(upper-lower))) + else: return cdf(upper) - cdf(lower) From 6291a390aca7722ad5ad0c67ba873d4ef9d3ba92 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 24 Jun 2024 12:31:23 -0400 Subject: [PATCH 35/44] fixing handling of dispersion --- glmnet/inference.py | 48 ++++++++++++++++++++--------------------- tests/test_inference.py | 2 +- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 9324f7c..53fa6c2 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -160,14 +160,14 @@ def lasso_inference(glmnet_obj, # # quadratic approximation up to scaling and a factor of weight_sel.sum() - design_sel = _get_design(X_sel[:,active_set], - weight_sel, - standardize=glmnet_obj.standardize, - intercept=glmnet_obj.fit_intercept) - P_sel = design_sel.quadratic_form(unreg_sel_GLM._information, - transformed=True) + D_sel = _get_design(X_sel[:,active_set], + weight_sel, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + P_noisy = D_sel.quadratic_form(unreg_sel_GLM._information, + transformed=True) - Q_sel = np.linalg.inv(P_sel) + Q_noisy = np.linalg.inv(P_noisy) # fit unpenalized model on full data @@ -187,7 +187,7 @@ def lasso_inference(glmnet_obj, intercept=glmnet_obj.fit_intercept) P_full = D.quadratic_form(unreg_GLM._information, transformed=True) - Q_full = np.linalg.inv(P_full) + Q_full = np.linalg.inv(P_full) if not FL.fit_intercept: penfac = FL.penalty_factor[active_set] Q_full = Q_full[1:,1:] @@ -273,15 +273,14 @@ def lasso_inference(glmnet_obj, for i in range(transform_to_raw.shape[0]): ## call selection_interval and return - L, U, mle, p = selection_interval( + L, U, mle, p = _split_interval( active_con=active_con, - Q_noisy=Q_sel, - Q_full=Q_full, + Q_noisy=Q_noisy * unreg_GLM.dispersion_, + Q_full=Q_full * unreg_GLM.dispersion_, noisy_observation=noisy_mle, observation=full_mle, direction_of_interest=transform_to_raw[i], - level=level, - dispersion=unreg_GLM.dispersion_ + level=level ) Ls[i] = L Us[i] = U @@ -294,16 +293,15 @@ def lasso_inference(glmnet_obj, return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) -def selection_interval(active_con, - Q_noisy, - Q_full, - noisy_observation, - observation, - direction_of_interest, - tol = 1.e-4, - level = 0.90, - dispersion=1, - UMAU=True): +def _split_interval(active_con, + Q_noisy, + Q_full, + noisy_observation, + observation, + direction_of_interest, + tol = 1.e-4, + level = 0.90, + dispersion=1): """ Given an affine in cone constraint $\{z:Az+b \leq 0\}$ (elementwise) specified with $A$ as `support_directions` and $b$ as @@ -356,8 +354,8 @@ def selection_interval(active_con, Q_full @ direction_of_interest / full_var, tol=tol) - sigma = np.sqrt(full_var * dispersion) - smoothing_sigma = np.sqrt(max(noisy_var - full_var, 0) * dispersion) + sigma = np.sqrt(full_var) + smoothing_sigma = np.sqrt(max(noisy_var - full_var, 0)) if smoothing_sigma > 1e-6 * sigma: diff --git a/tests/test_inference.py b/tests/test_inference.py index 89a2907..62f4e7f 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -65,7 +65,7 @@ def run_inference(n, X = rng.standard_normal((n, p)) D = np.linspace(1, p, p) / p + 0.2 X *= D[None,:] - Y = rng.standard_normal(n) * 2 + Y = rng.standard_normal(n) * np.fabs(1+np.random.standard_normal()) beta = np.zeros(p) if alt: subs = rng.choice(p, s, replace=False) From d3ec1b965966f4ee3ea3c125a03378c45bf25842 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 27 Jun 2024 17:03:37 -0700 Subject: [PATCH 36/44] WIP: orthogonal working well under null --- glmnet/glm.py | 4 +- glmnet/inference.py | 445 ++++++++++++++++++++++------------------ tests/test_inference.py | 218 +++++++++++++------- 3 files changed, 383 insertions(+), 284 deletions(-) diff --git a/glmnet/glm.py b/glmnet/glm.py index dd4470f..e2ee28c 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -418,7 +418,7 @@ def fit(self, sample_weight=None, # ignored regularizer=None, # last 4 options non sklearn API warm_state=None, - dispersion=1, + dispersion=None, check=True, fit_null=True): @@ -536,7 +536,7 @@ def obj_function(response, self._set_coef_intercept(state) - if (hasattr(self._family, "base") and + if (dispersion is None and hasattr(self._family, "base") and isinstance(self._family.base, sm_family.Gaussian)): # GLM specific # usual estimate of sigma^2 self.dispersion_ = self.deviance_ / (nobs-nvar-self.fit_intercept) diff --git a/glmnet/inference.py b/glmnet/inference.py index 53fa6c2..8f9fedc 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -126,7 +126,8 @@ def lasso_inference(glmnet_obj, lambda_val, selection_data, full_data, - level=.9): + level=.9, + dispersion=None): fixed_lambda, warm_state = glmnet_obj.get_fixed_lambda(lambda_val) X_sel, Y_sel, weight_sel = selection_data @@ -147,151 +148,161 @@ def lasso_inference(glmnet_obj, raise NotImplementedError('upper/lower limits coming soon') active_set = np.nonzero(FL.coef_ != 0)[0] - inactive_set = np.nonzero(FL.coef_ == 0)[0] + if active_set.shape[0] != 0: + inactive_set = np.nonzero(FL.coef_ == 0)[0] - # fit unpenalized model on selection data + # fit unpenalized model on selection data - unreg_sel_GLM = glmnet_obj.get_GLM() + unreg_sel_GLM = glmnet_obj.get_GLM() - unreg_sel_GLM.summarize = True - unreg_sel_GLM.fit(X_sel[:,active_set], - Y_sel, - sample_weight=weight_sel) + unreg_sel_GLM.summarize = True + unreg_sel_GLM.fit(X_sel[:,active_set], + Y_sel, + sample_weight=weight_sel, + dispersion=dispersion) - # # quadratic approximation up to scaling and a factor of weight_sel.sum() + # # quadratic approximation up to scaling and a factor of weight_sel.sum() - D_sel = _get_design(X_sel[:,active_set], - weight_sel, + D_sel = _get_design(X_sel[:,active_set], + weight_sel, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + P_noisy = D_sel.quadratic_form(unreg_sel_GLM._information, + transformed=True) + + # fit unpenalized model on full data + + X_full, Y_full, weight_full = full_data + if weight_full is None: + weight_full = np.ones(X_full.shape[0]) + + unreg_GLM = glmnet_obj.get_GLM() + unreg_GLM.summarize = True + unreg_GLM.fit(X_full[:,active_set], + Y_full, + sample_weight=weight_full, + dispersion=dispersion) + + # quadratic approximation + + D = _get_design(X_full[:,active_set], + weight_full, standardize=glmnet_obj.standardize, intercept=glmnet_obj.fit_intercept) - P_noisy = D_sel.quadratic_form(unreg_sel_GLM._information, - transformed=True) + P_full = D.quadratic_form(unreg_GLM._information, + transformed=True) + if not FL.fit_intercept: + if FL.penalty_factor is not None: + penfac = FL.penalty_factor[active_set] + else: + penfac = np.ones_like(active_set) + P_full = P_full[1:,1:] + P_noisy = P_noisy[1:,1:] + else: + penfac = np.ones(active_set.shape[0]) - Q_noisy = np.linalg.inv(P_noisy) + Q_full = np.linalg.inv(P_full) + Q_noisy = np.linalg.inv(P_noisy) - # fit unpenalized model on full data + signs = np.sign(FL.coef_[active_set]) - X_full, Y_full, weight_full = full_data - if weight_full is None: - weight_full = np.ones(X_full.shape[0]) - - unreg_GLM = glmnet_obj.get_GLM() - unreg_GLM.summarize = True - unreg_GLM.fit(X_full[:,active_set], Y_full, sample_weight=weight_full) - - # quadratic approximation - - D = _get_design(X_full[:,active_set], - weight_full, - standardize=glmnet_obj.standardize, - intercept=glmnet_obj.fit_intercept) - P_full = D.quadratic_form(unreg_GLM._information, - transformed=True) - Q_full = np.linalg.inv(P_full) - if not FL.fit_intercept: - penfac = FL.penalty_factor[active_set] - Q_full = Q_full[1:,1:] - Q_sel = Q_sel[1:,1:] - else: - penfac = np.ones(active_set.shape[0]) + noisy_mle = D.raw_to_scaled(unreg_sel_GLM.state_) - signs = np.sign(FL.coef_[active_set]) + if FL.fit_intercept: + penfac = np.hstack([0, penfac]) + signs = np.hstack([0, signs]) + stacked = np.hstack([FL.state_.intercept, + FL.state_.coef[active_set]]) + noisy_mle = noisy_mle._stack + else: + stacked = FL.state_.coef[active_set] + noisy_mle = noisy_mle.coef - noisy_mle = D.raw_to_scaled(unreg_sel_GLM.state_) - if FL.fit_intercept: - penfac = np.hstack([0, penfac]) - signs = np.hstack([0, signs]) - stacked = np.hstack([FL.state_.intercept, - FL.state_.coef[active_set]]) - noisy_mle = noisy_mle._stack - else: - stacked = FL.state_.coef[active_set] - noisy_mle = noisy_mle.coef + # delta = lambda_val * penfac * signs - # delta = lambda_val * penfac * signs + # # remember loss of glmnet is normalized by sum of weights + # # when taking newton step adjust by weight_sel.sum() - # # remember loss of glmnet is normalized by sum of weights - # # when taking newton step adjust by weight_sel.sum() - - # delta = Q_sel @ delta * weight_sel.sum() - # noisy_mle = stacked + delta # unitless scale - - penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(Q_full.shape[0])[penalized] -# assert (np.all(sel_P @ noisy_mle < -signs[penalized] * delta[penalized])) - - ## the GLM's coef and intercept are on the original scale - ## we transform them here to the (typically) unitless "standardized" scale - full_mle = D.raw_to_scaled(unreg_GLM.state_) - if FL.fit_intercept: - full_mle = full_mle._stack - else: - full_mle = full_mle.coef - - ## iterate over coordinates - Ls = np.zeros_like(stacked) - Us = np.zeros_like(stacked) - mles = np.zeros_like(stacked) - pvals = np.zeros_like(stacked) - - transform_to_raw = (D.unscaler_ @ - np.identity(D.shape[1])) - if not FL.fit_intercept: - transform_to_raw = transform_to_raw[1:,1:] - - linear = sel_P - offset = np.zeros(sel_P.shape[0]) # -signs[penalized] * delta[penalized] - active_con = AffineConstraint(linear=linear, - offset=offset, - observed=stacked) - inactive = True - if inactive: - pf = FL.regularizer_.penalty_factor - scale = 1 / (pf + (pf <= 0)) - - logl_score = FL.state_.logl_score(FL._family, - Y_sel, - weight_sel / weight_sel.sum()) - - # X_{-E}'(Y-X\hat{\beta}_E) -- \hat{\beta}_E is the LASSO soln, not GLM ! - # this is (2nd order Taylor series sense) equivalent to - # X_{-E}'(Y-X\bar{\beta}_E) + X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E - # with \bar{\beta}_E the GLM soln - - score_ = (FL.design_.T @ logl_score)[1:] - score_ *= scale - score_ = score_[inactive_set] - # we now know that `score_` is bounded by \pm lambda_val - - I = scipy.sparse.eye(score_.shape[0]) - L = scipy.sparse.vstack([I, -I]) - O = np.ones(L.shape[0]) * lambda_val - - inactive_con = AffineConstraint(linear=L, - offset=O, - observed=score_) - - for i in range(transform_to_raw.shape[0]): - ## call selection_interval and return - L, U, mle, p = _split_interval( - active_con=active_con, - Q_noisy=Q_noisy * unreg_GLM.dispersion_, - Q_full=Q_full * unreg_GLM.dispersion_, - noisy_observation=noisy_mle, - observation=full_mle, - direction_of_interest=transform_to_raw[i], - level=level - ) - Ls[i] = L - Us[i] = U - mles[i] = mle - pvals[i] = p - - idx = (active_set).tolist() - if FL.fit_intercept: - idx = ['intercept'] + idx - return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) + # delta = Q_sel @ delta * weight_sel.sum() + # noisy_mle = stacked + delta # unitless scale + + penalized = penfac > 0 + sel_P = -np.diag(signs[penalized]) @ np.eye(Q_full.shape[0])[penalized] + + ## the GLM's coef and intercept are on the original scale + ## we transform them here to the (typically) unitless "standardized" scale + full_mle = D.raw_to_scaled(unreg_GLM.state_) + if FL.fit_intercept: + full_mle = full_mle._stack + else: + full_mle = full_mle.coef + + ## iterate over coordinates + Ls = np.zeros_like(stacked) + Us = np.zeros_like(stacked) + mles = np.zeros_like(stacked) + pvals = np.zeros_like(stacked) + + transform_to_raw = (D.unscaler_ @ + np.identity(D.shape[1])) + if not FL.fit_intercept: + transform_to_raw = transform_to_raw[1:,1:] + + linear = sel_P + offset = np.zeros(sel_P.shape[0]) # -signs[penalized] * delta[penalized] + active_con = AffineConstraint(linear=linear, + offset=offset, + observed=stacked) + inactive = True + if inactive: + pf = FL.regularizer_.penalty_factor + scale = 1 / (pf + (pf <= 0)) + + logl_score = FL.state_.logl_score(FL._family, + Y_sel, + weight_sel / weight_sel.sum()) + + # X_{-E}'(Y-X\hat{\beta}_E) -- \hat{\beta}_E is the LASSO soln, not GLM ! + # this is (2nd order Taylor series sense) equivalent to + # X_{-E}'(Y-X\bar{\beta}_E) + X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E + # with \bar{\beta}_E the GLM soln + + score_ = (FL.design_.T @ logl_score)[1:] + score_ *= scale + score_ = score_[inactive_set] + # we now know that `score_` is bounded by \pm lambda_val + + I = scipy.sparse.eye(score_.shape[0]) + L = scipy.sparse.vstack([I, -I]) + O = np.ones(L.shape[0]) * lambda_val + + inactive_con = AffineConstraint(linear=L, + offset=O, + observed=score_) + + for i in range(transform_to_raw.shape[0]): + ## call selection_interval and return + L, U, mle, pval, _ = _split_interval( + active_con=active_con, + Q_noisy=Q_noisy * unreg_GLM.dispersion_, + Q_full=Q_full * unreg_GLM.dispersion_, + noisy_observation=noisy_mle, # these must be same scale / shift as glmnet + observation=full_mle, # these must be same scale / shift as glmnet + direction_of_interest=transform_to_raw[i], # this matrix describes the map from glmnet("transform") coords to raw("original") scale + level=level + ) + Ls[i] = L + Us[i] = U + mles[i] = mle + pvals[i] = pval + idx = active_set.tolist() + if FL.fit_intercept: + idx = ['intercept'] + idx + + return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) + else: + return None def _split_interval(active_con, Q_noisy, @@ -349,84 +360,23 @@ def _split_interval(active_con, noisy_estimate = (direction_of_interest * noisy_observation).sum() estimate = (direction_of_interest * observation).sum() + slice_dir = Q_full @ direction_of_interest / full_var (lower_bound, upper_bound) = active_con.interval_constraints(noisy_estimate, - Q_full @ direction_of_interest / full_var, + slice_dir, tol=tol) sigma = np.sqrt(full_var) smoothing_sigma = np.sqrt(max(noisy_var - full_var, 0)) - if smoothing_sigma > 1e-6 * sigma: - - grid = np.linspace(estimate - 10 * sigma, estimate + 10 * sigma, 2001) - weight = ( - normal_dbn.cdf( - (upper_bound - grid) / (smoothing_sigma) - ) - normal_dbn.cdf( - (lower_bound - grid) / (smoothing_sigma) - ) - ) - weight *= normal_dbn.pdf((grid - estimate) / sigma) + print(lower_bound, upper_bound, sigma, smoothing_sigma**2) - sel_distr = discrete_family(grid, weight) - L, U = sel_distr.equal_tailed_interval(estimate, - alpha=1-level) - mle, _, _ = sel_distr.MLE(estimate) - mle *= sigma**2; mle += estimate - pval = sel_distr.cdf(-estimate / sigma**2, estimate) - pval = 2 * min(pval, 1-pval) - L *= sigma**2; L += estimate - U *= sigma**2; U += estimate - else: - warnings.warn('assuming data for selection is same as for inference -- using hard selection') - - lb = estimate - 20 * sigma - ub = estimate + 20 * sigma - - if estimate < lower_bound or estimate > upper_bound: - warn('Constraints not satisfied: returning [-np.inf, np.inf]') - return -np.inf, np.inf, np.nan, np.nan - - def F(theta): - - Z = (estimate - theta) / sigma - Z_L = (lower_bound - theta) / sigma - Z_U = (upper_bound - theta) / sigma - num = norm_interval(Z_L, Z) - den = norm_interval(Z_L, Z_U) - if Z_L > 0 and den < 1e-10: - C = np.fabs(Z_L) - D = Z-Z_L - cdf = 1-np.exp(-C*D) - elif Z_U < 0 and den < 1e-10: - C = np.fabs(Z_U) - D = Z_U-Z - if C*D < 0: - raise ValueError - cdf = np.exp(-C*D) - else: - cdf = num / den - return cdf - - - pval = F(0) - pval = 2 * min(pval, 1-pval) - - lb = lower_bound - 20 * sigma - ub = upper_bound + 20 * sigma - - alpha = 0.5 * (1 - level) - L = find_root(F, 1.0 - 0.5 * alpha, lb, ub) - if np.isnan(L): - L = -np.inf - U = find_root(F, 0.5 * alpha, lb, ub) - if np.isnan(U): - U = np.inf - - mle = np.nan - - return L, U, mle, pval + return _truncated_inference(estimate, + sigma, + smoothing_sigma, + lower_bound, + upper_bound, + level=level) def norm_interval(lower, upper): r""" @@ -456,7 +406,8 @@ def norm_interval(lower, upper): elif upper < 0: if upper < -4: return np.exp(-np.fabs(upper*(upper-lower))) - + else: + return cdf(-lower) - cdf(-upper) else: return cdf(upper) - cdf(lower) @@ -552,7 +503,7 @@ def theta(self): def theta(self, _theta): if _theta != self._theta: _thetaX = _theta * self.sufficient_stat + self._lw - _largest = _thetaX.max() - 5 # try to avoid over/under flow, 5 seems arbitrary + _largest = _thetaX.max() - 15 # try to avoid over/under flow, 5 seems arbitrary _exp_thetaX = np.exp(_thetaX - _largest) _prod = _exp_thetaX self._partition = np.sum(_prod) @@ -909,3 +860,91 @@ def first_two_moments(x): return cur_est, 1. / hessian, grad +def _truncated_inference(estimate, + sigma, + smoothing_sigma, + lower_bound, + upper_bound, + basept=None, + level=0.9): + + if basept is None: + basept = estimate + + if smoothing_sigma > 1e-6 * sigma: + + grid = np.linspace(basept - 10 * sigma, basept + 10 * sigma, 2001) + + weight = (normal_dbn.cdf((upper_bound - grid) / smoothing_sigma) + - normal_dbn.cdf((lower_bound - grid) / smoothing_sigma)) + + weight *= normal_dbn.pdf((grid - basept) / sigma) + + sel_distr = discrete_family(grid, weight) + + use_sample = True + if use_sample: + rng = np.random.default_rng(0) + sample = rng.standard_normal(2000) * sigma + basept + weight_s = (normal_dbn.cdf((upper_bound - sample) / smoothing_sigma) + - normal_dbn.cdf((lower_bound - sample) / smoothing_sigma)) + sel_distr = discrete_family(sample, weight_s) + + L, U = sel_distr.equal_tailed_interval(estimate, + alpha=1-level) + mle, _, _ = sel_distr.MLE(estimate) + mle *= sigma**2; mle += estimate + pval = sel_distr.cdf(-basept / sigma**2, estimate) + pval = 2 * min(pval, 1-pval) + L *= sigma**2; L += basept + U *= sigma**2; U += basept + + else: + warnings.warn('assuming data for selection is same as for inference -- using hard selection') + + lb = estimate - 20 * sigma + ub = estimate + 20 * sigma + + if estimate < lower_bound or estimate > upper_bound: + warn('Constraints not satisfied: returning [-np.inf, np.inf]') + return -np.inf, np.inf, np.nan, np.nan + + def F(theta): + + Z = (estimate - theta) / sigma + Z_L = (lower_bound - theta) / sigma + Z_U = (upper_bound - theta) / sigma + num = norm_interval(Z_L, Z) + den = norm_interval(Z_L, Z_U) + if Z_L > 0 and den < 1e-10: + C = np.fabs(Z_L) + D = Z-Z_L + cdf = 1-np.exp(-C*D) + elif Z_U < 0 and den < 1e-10: + C = np.fabs(Z_U) + D = Z_U-Z + if C*D < 0: + raise ValueError + cdf = np.exp(-C*D) + else: + cdf = num / den + return cdf + + + pval = F(0) + pval = 2 * min(pval, 1-pval) + + lb = lower_bound - 20 * sigma + ub = upper_bound + 20 * sigma + + alpha = 0.5 * (1 - level) + L = find_root(F, 1.0 - 0.5 * alpha, lb, ub) + if np.isnan(L): + L = -np.inf + U = find_root(F, 0.5 * alpha, lb, ub) + if np.isnan(U): + U = np.inf + + mle = np.nan + + return L, U, mle, pval, sel_distr diff --git a/tests/test_inference.py b/tests/test_inference.py index 62f4e7f..01790fc 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -5,7 +5,8 @@ import statsmodels.api as sm from glmnet import GLMNet -from glmnet.inference import lasso_inference +from glmnet.inference import (lasso_inference, + _truncated_inference) @pytest.mark.parametrize('standardize', [True, False]) @pytest.mark.parametrize('fit_intercept', [True, False]) @@ -43,11 +44,9 @@ def test_Auto(): m = int(prop*n) df = lasso_inference(GN, - GN.lambda_values_[min(10, GN.lambda_values_.shape[0]-1)], + 2 / n, # GN.lambda_values_[min(10, GN.lambda_values_.shape[0]-1)], (X[:m], Df.iloc[:m], None), (X, Df, None)) - print(df) - def run_inference(n, p, @@ -58,20 +57,25 @@ def run_inference(n, s=3, prop=0.75, penalty_facs=True, - family='gaussian'): + family='gaussian', + orthogonal=False, + cv=False): if rng is None: rng = np.random.default_rng(0) - X = rng.standard_normal((n, p)) - D = np.linspace(1, p, p) / p + 0.2 - X *= D[None,:] - Y = rng.standard_normal(n) * np.fabs(1+np.random.standard_normal()) beta = np.zeros(p) - if alt: - subs = rng.choice(p, s, replace=False) - beta[subs] = rng.standard_normal(s) - mu = X @ beta - Y += mu + subs = rng.choice(p, s, replace=False) + + if not orthogonal: + X = rng.standard_normal((n, p)) + D = np.linspace(1, p, p) / p + 0.2 + X *= D[None,:] + Y = rng.standard_normal(n) * np.fabs(1+np.random.standard_normal()) + if alt: + beta[subs] = rng.uniform(3, 5) * rng.choice([-1,1], size=s, replace=True) / np.sqrt(n) + + mu = X @ beta + Y += mu if family == 'gaussian': fam = sm.families.Gaussian() @@ -82,51 +86,97 @@ def run_inference(n, raise ValueError('only testing "gaussian" and "probit"') if penalty_facs: - penalty_factor = np.ones(X.shape[1]) + penalty_factor = np.ones(p) penalty_factor[:10] = 0.5 else: penalty_factor = None - GN = GLMNet(response_id='response', - family=fam, - fit_intercept=fit_intercept, - standardize=standardize, - penalty_factor=penalty_factor) - - Df = pd.DataFrame({'response':Y}) + if not orthogonal: + + GN = GLMNet(response_id='response', + fit_intercept=fit_intercept, + standardize=standardize, + family=fam, + penalty_factor=penalty_factor) - GN.fit(X, Df) - m = int(prop*n) + m = int(prop*n) + + Df = pd.DataFrame({'response':Y}) + GN.fit(X[:m], Df.iloc[:m]) + if cv: + GN.cross_validation_path(X[:m], Df.iloc[:m]) + lamval = GN.index_best_['Mean Squared Error'] + else: + eps = rng.standard_normal((X.shape[0], 1000)) + lamval = 1.2 * np.fabs(X.T @ eps).max() / X.shape[0] + df = lasso_inference(GN, + lamval, + (X[:m], Df.iloc[:m], None), + (X, Df, None)) + + if df is not None: + if fit_intercept: + active_set = np.array(df.index[1:]).astype(int) + else: + active_set = np.array(df.index).astype(int) + X_sel = X[:,active_set] + + if fit_intercept: + X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) + targets = np.linalg.pinv(X_sel) @ mu + + df['target'] = targets - df = lasso_inference(GN, - GN.lambda_values_[min(10, GN.lambda_values_.shape[0]-1)], - (X[:m], Df.iloc[:m], None), - (X, Df, None)) - if fit_intercept: - active_set = np.array(df.index[1:]).astype(int) else: - active_set = np.array(df.index).astype(int) - X_sel = X[:,active_set] - - if fit_intercept: - X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) - targets = np.linalg.pinv(X_sel) @ mu - df['target'] = targets - if alt: + X = np.identity(p) + Y = rng.standard_normal(p) + Df = pd.DataFrame({'response':Y}) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + mu = X @ beta + Y += mu + + GN = GLMNet(response_id='response', + family=fam, + fit_intercept=False, + standardize=False, + penalty_factor=penalty_factor) + + + lamval = 2 + Y_sel = Y + np.sqrt((1 - prop) / prop) * rng.standard_normal(Y.shape) + + active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + GN.fit(X, Df) + if active_naive.shape[0] > 0: + df = lasso_inference(GN, + prop * lamval / p, + (X_sel, Df_sel, None), + (X, Df, None), + dispersion=1) + df['target'] = mu[df.index] + else: + return None + if alt and df is not None: if family == 'probit' and not set(subs).issubset(active_set): df['target'] *= np.nan return df -def main_null(fit_intercept=True, - standardize=True, - n=500, - p=75, - ntrial=500, - rng=None, - family='gaussian'): +def main(fit_intercept=True, + standardize=True, + n=500, + p=75, + ntrial=500, + rng=None, + family='gaussian', + alt=False, + cv=False, + orthogonal=False): ncover = 0 nsel = 0 @@ -140,43 +190,53 @@ def main_null(fit_intercept=True, fit_intercept, standardize, rng=rng, - family=family) - dfs.append(df) - all_df = pd.concat(dfs) - ncover += ((all_df['lower'] < 0) & (all_df['upper'] > 0)).sum() - nsel += all_df.shape[0] - - print('cover:', ncover / nsel, 'typeI:', (all_df['pval'] < 0.05).mean()) + family=family, + alt=alt, + cv=cv, + orthogonal=orthogonal) + if df is not None: + dfs.append(df) + if len(dfs) > 0: + all_df = pd.concat(dfs) + ncover += ((all_df['lower'] < 0) & (all_df['upper'] > 0)).sum() + nsel += all_df.shape[0] + print('cover:', ncover / nsel, 'power:', (all_df['pval'] < 0.05).mean()) -def main_alt(fit_intercept=True, - standardize=True, - n=1000, - p=75, - ntrial=500, - rng=None, - family='gaussian'): - ncover = 0 - nsel = 0 + +def test_truncated_inference(B=1000, + noisy_sigma=0.5, + sigma=1, + upper_bound=5, + lower_bound=2, + alt=False): - dfs = [] + pvals = [] + cover = [] rng = np.random.default_rng(0) - for i in range(ntrial): - df = run_inference(n, - p, - fit_intercept, - standardize, - rng=rng, - family=family, - alt=True) - dfs.append(df) - all_df = pd.concat(dfs).dropna() - ncover += ((all_df['lower'] < all_df['target']) & (all_df['upper'] > all_df['target'])).sum() - nsel += all_df.shape[0] - - print('cover:', ncover / nsel) - return all_df - - + + for _ in range(B): + mu = rng.standard_normal() + 3 + lower_bound = rng.uniform(0, 2) + upper_bound = lower_bound + rng.uniform(3, 5) + while True: + Z = rng.standard_normal() * sigma + if alt: + Z += mu + Z_noisy = Z + rng.standard_normal() * noisy_sigma + if (Z_noisy > lower_bound) and (Z_noisy < upper_bound): + L, U, _, pval, D = _truncated_inference(Z, + sigma, + noisy_sigma, + lower_bound, + upper_bound, + basept=Z, + level=0.90) + cover.append((Lmu)) + pvals.append(pval) + print(np.mean(cover), np.mean(np.array(pvals) < 0.05)) + break + + return D From da21a471f815ff7b50755264ca644eefa6a5eb9d Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 3 Jul 2024 17:18:27 -0700 Subject: [PATCH 37/44] WIP: orthogonal now working without penfacs --- glmnet/inference.py | 140 +++++++++++++++++++++++++++++++++------- tests/test_inference.py | 93 ++++++++++++++++++-------- 2 files changed, 181 insertions(+), 52 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index 8f9fedc..f610afa 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -22,6 +22,94 @@ from dataclasses import dataclass from typing import Optional +@dataclass +class TruncatedGaussian(object): + + estimate: float + sigma: float + smoothing_sigma: float + lower_bound: float + upper_bound: float + level: Optional[float] = 0.9 + num_sd: Optional[float] = 10 + num_grid: Optional[float] = 4000 + use_sample: Optional[bool] = False + seed: Optional[int] = 0 + + def _get_family(self, + basept=None): + + if basept is None: + basept = self.estimate + + self._rng = np.random.default_rng(self.seed) + + if not self.use_sample: + grid = np.linspace(basept - self.num_sd * self.sigma, + basept + self.num_sd * self.sigma, self.num_grid) + + weight = (normal_dbn.cdf((self.upper_bound - grid) / self.smoothing_sigma) + - normal_dbn.cdf((self.lower_bound - grid) / self.smoothing_sigma)) + + weight *= normal_dbn.pdf((grid - basept) / self.sigma) + + return discrete_family(grid, weight) + + else: + sample = self._rng.standard_normal(self.num_grid) * self.sigma + basept + weight = (normal_dbn.cdf((self.upper_bound - sample) / self.smoothing_sigma) + - normal_dbn.cdf((self.lower_bound - sample) / self.smoothing_sigma)) + return discrete_family(sample, weight) + + def pvalue(self, + null_value=0, + alternative='twosided', + basept=None): + + if basept is None: + basept = null_value + + if alternative not in ['twosided', 'greater', 'less']: + raise ValueError("alternative should be one of ['twosided', 'greater', 'less']") + _family = self._get_family(basept=basept) + tilt = (null_value - basept) / self.sigma**2 + if alternative in ['less', 'twosided']: + _cdf = _family.cdf(tilt, x=self.estimate) + if alternative == 'less': + pvalue = _cdf + else: + pvalue = 2 * min(_cdf, 1 - _cdf) + else: + pvalue = _family.sf(tilt, x=self.estimate) + return pvalue + + def interval(self, + basept=None): + + if basept is None: + basept = self.estimate + + _family = self._get_family(basept=basept) + + L, U = _family.equal_tailed_interval(self.estimate, + alpha=1-self.level) + L *= self.sigma**2; L += basept + U *= self.sigma**2; U += basept + + return L, U + + def MLE(self, + basept=None): + + if basept is None: + basept = self.estimate + + _family = self._get_family(basept=basept) + + mle, _, _ = _family.MLE(self.estimate) + mle *= self.sigma**2; mle += basept + return mle + @dataclass class AffineConstraint(object): @@ -242,7 +330,8 @@ def lasso_inference(glmnet_obj, Us = np.zeros_like(stacked) mles = np.zeros_like(stacked) pvals = np.zeros_like(stacked) - + TGs = [] + transform_to_raw = (D.unscaler_ @ np.identity(D.shape[1])) if not FL.fit_intercept: @@ -282,25 +371,27 @@ def lasso_inference(glmnet_obj, for i in range(transform_to_raw.shape[0]): ## call selection_interval and return - L, U, mle, pval, _ = _split_interval( - active_con=active_con, - Q_noisy=Q_noisy * unreg_GLM.dispersion_, - Q_full=Q_full * unreg_GLM.dispersion_, - noisy_observation=noisy_mle, # these must be same scale / shift as glmnet - observation=full_mle, # these must be same scale / shift as glmnet - direction_of_interest=transform_to_raw[i], # this matrix describes the map from glmnet("transform") coords to raw("original") scale - level=level - ) + TG = _split_interval(active_con=active_con, + Q_noisy=Q_noisy * unreg_GLM.dispersion_, + Q_full=Q_full * unreg_GLM.dispersion_, + noisy_observation=noisy_mle, # these must be same scale / shift as glmnet + observation=full_mle, # these must be same scale / shift as glmnet + direction_of_interest=transform_to_raw[i], # this matrix describes the map from glmnet("transform") coords to raw("original") scale + level=level) + + (L, U), mle, pval = (TG.interval(), TG.MLE(), TG.pvalue()) + Ls[i] = L Us[i] = U mles[i] = mle pvals[i] = pval - + TGs.append(TG) + idx = active_set.tolist() if FL.fit_intercept: idx = ['intercept'] + idx - return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx) + return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us, 'TG':TGs}, index=idx) else: return None @@ -369,16 +460,14 @@ def _split_interval(active_con, sigma = np.sqrt(full_var) smoothing_sigma = np.sqrt(max(noisy_var - full_var, 0)) - print(lower_bound, upper_bound, sigma, smoothing_sigma**2) - - return _truncated_inference(estimate, - sigma, - smoothing_sigma, - lower_bound, - upper_bound, - level=level) - -def norm_interval(lower, upper): + return TruncatedGaussian(estimate=estimate, + sigma=sigma, + smoothing_sigma=smoothing_sigma, + lower_bound=lower_bound, + upper_bound=upper_bound, + level=level) + +def _norm_interval(lower, upper): r""" A multiprecision evaluation of @@ -870,6 +959,7 @@ def _truncated_inference(estimate, if basept is None: basept = estimate + noisy_sigma = np.sqrt(smoothing_sigma**2 + sigma**2) if smoothing_sigma > 1e-6 * sigma: @@ -882,7 +972,7 @@ def _truncated_inference(estimate, sel_distr = discrete_family(grid, weight) - use_sample = True + use_sample = False if use_sample: rng = np.random.default_rng(0) sample = rng.standard_normal(2000) * sigma + basept @@ -914,8 +1004,8 @@ def F(theta): Z = (estimate - theta) / sigma Z_L = (lower_bound - theta) / sigma Z_U = (upper_bound - theta) / sigma - num = norm_interval(Z_L, Z) - den = norm_interval(Z_L, Z_U) + num = _norm_interval(Z_L, Z) + den = _norm_interval(Z_L, Z_U) if Z_L > 0 and den < 1e-10: C = np.fabs(Z_L) D = Z-Z_L diff --git a/tests/test_inference.py b/tests/test_inference.py index 01790fc..405825a 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -6,7 +6,7 @@ from glmnet import GLMNet from glmnet.inference import (lasso_inference, - _truncated_inference) + TruncatedGaussian) @pytest.mark.parametrize('standardize', [True, False]) @pytest.mark.parametrize('fit_intercept', [True, False]) @@ -131,11 +131,11 @@ def run_inference(n, X = np.identity(p) Y = rng.standard_normal(p) - Df = pd.DataFrame({'response':Y}) if alt: - beta[subs] = rng.standard_normal(s) * 2 - mu = X @ beta + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + mu = beta Y += mu + Df = pd.DataFrame({'response':Y}) GN = GLMNet(response_id='response', family=fam, @@ -143,11 +143,10 @@ def run_inference(n, standardize=False, penalty_factor=penalty_factor) - lamval = 2 Y_sel = Y + np.sqrt((1 - prop) / prop) * rng.standard_normal(Y.shape) - active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) X_sel = np.sqrt(prop) * X GN.fit(X, Df) @@ -158,6 +157,37 @@ def run_inference(n, (X, Df, None), dispersion=1) df['target'] = mu[df.index] + + pivots = [] + signs = np.sign(Y_sel[df.index]) + assert np.all(Y_sel[df.index] * signs >= lamval * penalty_factor[df.index]) + + for j, s in zip(df.index, signs): + if s == 1: + upper_bound = np.inf + if penalty_factor is not None: + lower_bound = lamval * penalty_factor[j] + else: + lower_bound = lamval + else: + if penalty_factor is not None: + upper_bound = -lamval * penalty_factor[j] + else: + upper_bound = -lamval + lower_bound = -np.inf + + tg = df.loc[j,'TG'] + + TG = TruncatedGaussian(estimate=Y[j], + sigma=1, + smoothing_sigma=np.sqrt((1 - prop) / prop), + lower_bound=lower_bound, + upper_bound=upper_bound, + level=0.90) + pval = TG.pvalue() + pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) + assert np.allclose(pval, df.loc[j]['pval']) + df['pivot'] = pivots else: return None if alt and df is not None: @@ -176,7 +206,8 @@ def main(fit_intercept=True, family='gaussian', alt=False, cv=False, - orthogonal=False): + orthogonal=False, + penalty_facs=False): ncover = 0 nsel = 0 @@ -193,24 +224,29 @@ def main(fit_intercept=True, family=family, alt=alt, cv=cv, - orthogonal=orthogonal) + orthogonal=orthogonal, + penalty_facs=penalty_facs) if df is not None: dfs.append(df) if len(dfs) > 0: all_df = pd.concat(dfs) - ncover += ((all_df['lower'] < 0) & (all_df['upper'] > 0)).sum() + ncover += ((all_df['lower'] < all_df['target']) & (all_df['upper'] > all_df['target'])).sum() nsel += all_df.shape[0] - print('cover:', ncover / nsel, 'power:', (all_df['pval'] < 0.05).mean()) + print('cover:', + ncover / nsel, 'power:', + (all_df['pval'] < 0.05).mean(), + (all_df['pivot'] < 0.05).mean(), all_df['pivot'].std()) def test_truncated_inference(B=1000, - noisy_sigma=0.5, + smoothing_sigma=np.sqrt(1/3), sigma=1, - upper_bound=5, - lower_bound=2, - alt=False): + upper_bound=None, + lower_bound=None, + alt=False, + level=0.9): pvals = [] cover = [] @@ -219,24 +255,27 @@ def test_truncated_inference(B=1000, for _ in range(B): mu = rng.standard_normal() + 3 - lower_bound = rng.uniform(0, 2) - upper_bound = lower_bound + rng.uniform(3, 5) + if not alt: + mu *= 0 + if lower_bound is None or upper_bound is None: + lower_bound = rng.uniform(0, 2) + upper_bound = lower_bound + rng.uniform(3, 5) while True: Z = rng.standard_normal() * sigma - if alt: - Z += mu - Z_noisy = Z + rng.standard_normal() * noisy_sigma + Z += mu + Z_noisy = Z + rng.standard_normal() * smoothing_sigma if (Z_noisy > lower_bound) and (Z_noisy < upper_bound): - L, U, _, pval, D = _truncated_inference(Z, - sigma, - noisy_sigma, - lower_bound, - upper_bound, - basept=Z, - level=0.90) + TG = TruncatedGaussian(estimate=Z, + sigma=sigma, + smoothing_sigma=smoothing_sigma, + lower_bound=lower_bound, + upper_bound=upper_bound, + level=level) + + (L, U), mle, pval = (TG.interval(), TG.MLE(), TG.pvalue()) + cover.append((Lmu)) pvals.append(pval) print(np.mean(cover), np.mean(np.array(pvals) < 0.05)) break - return D From 600d5ccce3fa5dff76c3bc461a726c29cf6eedeb Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Wed, 3 Jul 2024 17:22:43 -0700 Subject: [PATCH 38/44] pivots for non-orthogonal --- tests/test_inference.py | 56 +++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/tests/test_inference.py b/tests/test_inference.py index 405825a..f38a841 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -161,39 +161,41 @@ def run_inference(n, pivots = [] signs = np.sign(Y_sel[df.index]) assert np.all(Y_sel[df.index] * signs >= lamval * penalty_factor[df.index]) - - for j, s in zip(df.index, signs): - if s == 1: - upper_bound = np.inf - if penalty_factor is not None: - lower_bound = lamval * penalty_factor[j] - else: - lower_bound = lamval - else: - if penalty_factor is not None: - upper_bound = -lamval * penalty_factor[j] - else: - upper_bound = -lamval - lower_bound = -np.inf - - tg = df.loc[j,'TG'] - - TG = TruncatedGaussian(estimate=Y[j], - sigma=1, - smoothing_sigma=np.sqrt((1 - prop) / prop), - lower_bound=lower_bound, - upper_bound=upper_bound, - level=0.90) - pval = TG.pvalue() - pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) - assert np.allclose(pval, df.loc[j]['pval']) - df['pivot'] = pivots else: return None if alt and df is not None: if family == 'probit' and not set(subs).issubset(active_set): df['target'] *= np.nan + if df is not None: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] + # for j, s in zip(df.index, signs): + # if s == 1: + # upper_bound = np.inf + # if penalty_factor is not None: + # lower_bound = lamval * penalty_factor[j] + # else: + # lower_bound = lamval + # else: + # if penalty_factor is not None: + # upper_bound = -lamval * penalty_factor[j] + # else: + # upper_bound = -lamval + # lower_bound = -np.inf + + # tg = df.loc[j,'TG'] + + # TG = TruncatedGaussian(estimate=Y[j], + # sigma=1, + # smoothing_sigma=np.sqrt((1 - prop) / prop), + # lower_bound=lower_bound, + # upper_bound=upper_bound, + # level=0.90) + # pval = TG.pvalue() + # pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) + # assert np.allclose(pval, df.loc[j]['pval']) + # df['pivot'] = pivots + return df From 8ff293a0d7ed7863acd9b7c27f2ed34a401b0336 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 8 Jul 2024 13:02:08 -0700 Subject: [PATCH 39/44] added upper / lower limits --- glmnet/inference.py | 39 ++++- tests/test_inference.py | 306 +++++++++++++++++++++------------------- 2 files changed, 194 insertions(+), 151 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index f610afa..ed6cd91 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -277,6 +277,20 @@ def lasso_inference(glmnet_obj, weight_full, standardize=glmnet_obj.standardize, intercept=glmnet_obj.fit_intercept) + + if np.asarray(FL.upper_limits).shape == (): + upper_limits = np.ones(D_sel.shape[1] - 1) * FL.upper_limits + else: + upper_limits = FL.upper_limits[active_set] + + if np.asarray(FL.lower_limits).shape == (): + lower_limits = np.ones(D_sel.shape[1] - 1) * FL.lower_limits + else: + lower_limits = FL.lower_limits[active_set] + + upper_limits = (D_sel.scaler_ @ np.hstack([0, upper_limits]))[1:] + lower_limits = (D_sel.scaler_ @ np.hstack([0, lower_limits]))[1:] + P_full = D.quadratic_form(unreg_GLM._information, transformed=True) if not FL.fit_intercept: @@ -315,7 +329,17 @@ def lasso_inference(glmnet_obj, # noisy_mle = stacked + delta # unitless scale penalized = penfac > 0 - sel_P = -np.diag(signs[penalized]) @ np.eye(Q_full.shape[0])[penalized] + n_penalized = penalized.sum() + n_coef = penalized.shape[0] + row_idx = np.arange(n_penalized) + col_idx = np.nonzero(penalized)[0] + data = -signs[penalized] + sel_P = scipy.sparse.coo_matrix((data, (row_idx, col_idx)), shape=(n_penalized, n_coef)) + + if FL.fit_intercept: + sel_U = sel_L = scipy.sparse.dia_array((np.ones((1, n_coef)), [1]), shape=(n_coef-1, n_coef)) + else: + sel_U = sel_L = scipy.sparse.eye(n_coef) ## the GLM's coef and intercept are on the original scale ## we transform them here to the (typically) unitless "standardized" scale @@ -337,15 +361,14 @@ def lasso_inference(glmnet_obj, if not FL.fit_intercept: transform_to_raw = transform_to_raw[1:,1:] - linear = sel_P - offset = np.zeros(sel_P.shape[0]) # -signs[penalized] * delta[penalized] + linear = scipy.sparse.vstack([sel_P, sel_U, -sel_L]) + offset = np.hstack([np.zeros(sel_P.shape[0]), upper_limits, -lower_limits]) active_con = AffineConstraint(linear=linear, offset=offset, observed=stacked) inactive = True if inactive: pf = FL.regularizer_.penalty_factor - scale = 1 / (pf + (pf <= 0)) logl_score = FL.state_.logl_score(FL._family, Y_sel, @@ -357,16 +380,18 @@ def lasso_inference(glmnet_obj, # with \bar{\beta}_E the GLM soln score_ = (FL.design_.T @ logl_score)[1:] - score_ *= scale score_ = score_[inactive_set] # we now know that `score_` is bounded by \pm lambda_val I = scipy.sparse.eye(score_.shape[0]) L = scipy.sparse.vstack([I, -I]) - O = np.ones(L.shape[0]) * lambda_val + O = (np.ones(L.shape[0]) * lambda_val * + np.hstack([pf[inactive_set], + pf[inactive_set]])) + fudge_factor = 0.02 # allow 2% relative error on inactive gradient inactive_con = AffineConstraint(linear=L, - offset=O, + offset=O * (1 + fudge_factor), observed=score_) for i in range(transform_to_raw.shape[0]): diff --git a/tests/test_inference.py b/tests/test_inference.py index f38a841..4f9e89a 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -8,23 +8,6 @@ from glmnet.inference import (lasso_inference, TruncatedGaussian) -@pytest.mark.parametrize('standardize', [True, False]) -@pytest.mark.parametrize('fit_intercept', [True, False]) -@pytest.mark.parametrize('p', [103]) -@pytest.mark.parametrize('n', [500]) -def test_inference(n, - p, - fit_intercept, - standardize, - ntrial=10): - # run a few times to be sure KKT conditions not violated - - for _ in range(ntrial): - run_inference(n, - p, - fit_intercept, - standardize) - def test_Auto(): df = sm.datasets.get_rdataset('Auto', package='ISLR').data @@ -48,161 +31,195 @@ def test_Auto(): (X[:m], Df.iloc[:m], None), (X, Df, None)) -def run_inference(n, - p, - fit_intercept, - standardize, - rng=None, - alt=False, - s=3, - prop=0.75, - penalty_facs=True, - family='gaussian', - orthogonal=False, - cv=False): - +def sample_orthogonal(rng=None, + p=50, + s=5, + penalty_facs=False, + alt=True, + prop=0.8): if rng is None: rng = np.random.default_rng(0) beta = np.zeros(p) subs = rng.choice(p, s, replace=False) - if not orthogonal: - X = rng.standard_normal((n, p)) - D = np.linspace(1, p, p) / p + 0.2 - X *= D[None,:] - Y = rng.standard_normal(n) * np.fabs(1+np.random.standard_normal()) - if alt: - beta[subs] = rng.uniform(3, 5) * rng.choice([-1,1], size=s, replace=True) / np.sqrt(n) - - mu = X @ beta - Y += mu - - if family == 'gaussian': - fam = sm.families.Gaussian() - elif family == 'probit': - fam = sm.families.Binomial(link=sm.families.links.Probit()) - Y = (mu + rng.standard_normal(n)) > 0 - else: - raise ValueError('only testing "gaussian" and "probit"') - - if penalty_facs: - penalty_factor = np.ones(p) - penalty_factor[:10] = 0.5 - else: - penalty_factor = None + X = np.identity(p) + Y = rng.standard_normal(p) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + mu = beta + Y += mu + Df = pd.DataFrame({'response':Y}) - if not orthogonal: + penalty_factor = np.ones(p) + if penalty_facs: + penalty_factor[:p//10] = 0.5 + penalty_factor[p//10:2*p//10] = 0 - GN = GLMNet(response_id='response', - fit_intercept=fit_intercept, - standardize=standardize, - family=fam, - penalty_factor=penalty_factor) + penalty_factor *= p / penalty_factor.sum() - m = int(prop*n) - - Df = pd.DataFrame({'response':Y}) - GN.fit(X[:m], Df.iloc[:m]) - if cv: - GN.cross_validation_path(X[:m], Df.iloc[:m]) - lamval = GN.index_best_['Mean Squared Error'] - else: - eps = rng.standard_normal((X.shape[0], 1000)) - lamval = 1.2 * np.fabs(X.T @ eps).max() / X.shape[0] + GN = GLMNet(response_id='response', + fit_intercept=False, + standardize=False, + penalty_factor=penalty_factor) + + lamval = 2 + Y_sel = Y + np.sqrt((1 - prop) / prop) * rng.standard_normal(Y.shape) + active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] + + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + GN.fit(X, Df) + active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] + + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + pivots = [] + test_pvals = [] + + if active_naive.shape[0] > 0: df = lasso_inference(GN, - lamval, - (X[:m], Df.iloc[:m], None), - (X, Df, None)) + prop * lamval / p, + (X_sel, Df_sel, None), + (X, Df, None), + dispersion=1) + df['target'] = mu[df.index] + + signs = np.sign(Y_sel[df.index]) + assert np.all(Y_sel[df.index] * signs >= lamval * GN.penalty_factor[df.index]) + else: + return None - if df is not None: - if fit_intercept: - active_set = np.array(df.index[1:]).astype(int) + if df is not None: + for j, s in zip(df.index, signs): + if s == 1: + upper_bound = np.inf + if penalty_factor is not None: + lower_bound = lamval * penalty_factor[j] + else: + lower_bound = lamval else: - active_set = np.array(df.index).astype(int) - X_sel = X[:,active_set] + if penalty_factor is not None: + upper_bound = -lamval * penalty_factor[j] + else: + upper_bound = -lamval + lower_bound = -np.inf + + tg = df.loc[j,'TG'] + + TG = TruncatedGaussian(estimate=Y[j], + sigma=1, + smoothing_sigma=np.sqrt((1 - prop) / prop), + lower_bound=lower_bound, + upper_bound=upper_bound, + level=0.90) + pval = TG.pvalue() + pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) + test_pvals.append(TG.pvalue(null_value=0)) + assert np.allclose(pval, df.loc[j]['pval']) + df['pivot'] = pivots + df['pval_byhand'] = test_pvals + assert np.allclose(df['pval'], df['pval_byhand']) - if fit_intercept: - X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) - targets = np.linalg.pinv(X_sel) @ mu + return df - df['target'] = targets +def sample_gaussian(n, + p, + fit_intercept, + standardize, + rng=None, + alt=False, + s=10, + prop=0.8, + penalty_facs=False, + cv=True): + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + + X = rng.standard_normal((n, p)) + D = np.linspace(1, p, p) / p + 0.2 + X *= D[None,:] + Y = rng.standard_normal(n) * np.fabs(1+np.random.standard_normal()) + if alt: + beta[subs] = rng.uniform(3, 5) * rng.choice([-1,1], size=s, replace=True) / np.sqrt(n) + + mu = X @ beta + Y += mu + + if penalty_facs: + penalty_factor = np.ones(p) + penalty_factor[:10] = 0.5 else: + penalty_factor = None - X = np.identity(p) - Y = rng.standard_normal(p) - if alt: - beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) - mu = beta - Y += mu - Df = pd.DataFrame({'response':Y}) - - GN = GLMNet(response_id='response', - family=fam, - fit_intercept=False, - standardize=False, - penalty_factor=penalty_factor) - - lamval = 2 - Y_sel = Y + np.sqrt((1 - prop) / prop) * rng.standard_normal(Y.shape) - active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] - - Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) - X_sel = np.sqrt(prop) * X - GN.fit(X, Df) - if active_naive.shape[0] > 0: - df = lasso_inference(GN, - prop * lamval / p, - (X_sel, Df_sel, None), - (X, Df, None), - dispersion=1) - df['target'] = mu[df.index] - - pivots = [] - signs = np.sign(Y_sel[df.index]) - assert np.all(Y_sel[df.index] * signs >= lamval * penalty_factor[df.index]) + GN = GLMNet(response_id='response', + fit_intercept=fit_intercept, + standardize=standardize, + penalty_factor=penalty_factor) + + m = int(prop*n) + + Df = pd.DataFrame({'response':Y}) + GN.fit(X[:m], Df.iloc[:m]) + cv = True + if cv: + GN.cross_validation_path(X[:m], Df.iloc[:m]) + lamval = GN.index_best_['Mean Squared Error'] + else: + eps = rng.standard_normal((X.shape[0], 1000)) + lamval = 1.2 * np.fabs(X.T @ eps).max() / X.shape[0] + df = lasso_inference(GN, + lamval, + (X[:m], Df.iloc[:m], None), + (X, Df, None)) + + if df is not None: + if fit_intercept: + active_set = np.array(df.index[1:]).astype(int) else: - return None - if alt and df is not None: - if family == 'probit' and not set(subs).issubset(active_set): - df['target'] *= np.nan + active_set = np.array(df.index).astype(int) + X_sel = X[:,active_set] + + if fit_intercept: + X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) + targets = np.linalg.pinv(X_sel) @ mu + + df['target'] = targets if df is not None: df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] - # for j, s in zip(df.index, signs): - # if s == 1: - # upper_bound = np.inf - # if penalty_factor is not None: - # lower_bound = lamval * penalty_factor[j] - # else: - # lower_bound = lamval - # else: - # if penalty_factor is not None: - # upper_bound = -lamval * penalty_factor[j] - # else: - # upper_bound = -lamval - # lower_bound = -np.inf - - # tg = df.loc[j,'TG'] - - # TG = TruncatedGaussian(estimate=Y[j], - # sigma=1, - # smoothing_sigma=np.sqrt((1 - prop) / prop), - # lower_bound=lower_bound, - # upper_bound=upper_bound, - # level=0.90) - # pval = TG.pvalue() - # pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) - # assert np.allclose(pval, df.loc[j]['pval']) - # df['pivot'] = pivots return df +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('penalty_facs', [True, False]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('p', [103]) +@pytest.mark.parametrize('n', [500]) +def test_gaussian(n, + p, + standardize, + fit_intercept, + penalty_facs): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = sample_gaussian(n=n, + p=p, + standardize=standardize, + fit_intercept=fit_intercept, + penalty_facs=penalty_facs, + cv=False) + def main(fit_intercept=True, standardize=True, n=500, p=75, + s=5, ntrial=500, rng=None, family='gaussian', @@ -222,6 +239,7 @@ def main(fit_intercept=True, p, fit_intercept, standardize, + s=s, rng=rng, family=family, alt=alt, From ba066ed5f125056e0876bec3ff202325924846bf Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 8 Jul 2024 14:49:05 -0700 Subject: [PATCH 40/44] running from a covariance matrix form in tests, need a function for IRL --- tests/test_inference.py | 193 +++++++++++++++++++++++++++++----------- 1 file changed, 140 insertions(+), 53 deletions(-) diff --git a/tests/test_inference.py b/tests/test_inference.py index 4f9e89a..7b4902e 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -4,6 +4,8 @@ import pandas as pd import statsmodels.api as sm +import scipy.sparse + from glmnet import GLMNet from glmnet.inference import (lasso_inference, TruncatedGaussian) @@ -71,8 +73,6 @@ def sample_orthogonal(rng=None, GN.fit(X, Df) active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] - Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) - X_sel = np.sqrt(prop) * X pivots = [] test_pvals = [] @@ -86,8 +86,6 @@ def sample_orthogonal(rng=None, signs = np.sign(Y_sel[df.index]) assert np.all(Y_sel[df.index] * signs >= lamval * GN.penalty_factor[df.index]) - else: - return None if df is not None: for j, s in zip(df.index, signs): @@ -116,22 +114,99 @@ def sample_orthogonal(rng=None, pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) test_pvals.append(TG.pvalue(null_value=0)) assert np.allclose(pval, df.loc[j]['pval']) - df['pivot'] = pivots + df['pivot_byhand'] = pivots df['pval_byhand'] = test_pvals assert np.allclose(df['pval'], df['pval_byhand']) + if df is not None: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] + assert np.allclose(df['pivot'], df['pivot_byhand']) + + return df + +def sample_AR1(rho=0.6, + rng=None, + p=100, + s=5, + alt=True, + prop=0.8, + dispersion=2): + + D = np.fabs(np.subtract.outer(np.arange(p), np.arange(p))) + dispersion = 2 + S = (rho**D) * dispersion + + return sample_cov(S, + rng=rng, + p=p, + s=s, + alt=alt, + prop=prop) + +def sample_cov(S, + rng=None, + p=100, + s=5, + alt=True, + prop=0.8): + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + + S_sqrt = X = np.linalg.cholesky(S).T # X.T @ X = S + + noise = X.T @ rng.standard_normal(p) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + mu = S @ beta + + Z = mu + noise + Y = scipy.linalg.solve_triangular(S_sqrt.T, Z, lower=True) + Df = pd.DataFrame({'response':Y}) + GN = GLMNet(response_id='response', + fit_intercept=False, + standardize=False) + + lamval = 3 + Z_sel = Z + np.sqrt((1 - prop) / prop) * X.T @ rng.standard_normal(p) + Y_sel = scipy.linalg.solve_triangular(S_sqrt.T, Z_sel, lower=True) + test_sel = np.linalg.inv(X.T) @ Z_sel + assert np.allclose(Y_sel, test_sel) + + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + GN.fit(X, Df) + pivots = [] + + assert np.allclose(X.T @ Y_sel, Z_sel) + + df = lasso_inference(GN, + prop * lamval / p, + (X_sel, Df_sel, None), + (X, Df, None), + dispersion=1) + if df is not None: + active = list(df.index) + df['target'] = np.linalg.inv(S[active][:,active]) @ mu[active] + + if df is not None: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] + return df -def sample_gaussian(n, - p, - fit_intercept, - standardize, - rng=None, - alt=False, - s=10, - prop=0.8, - penalty_facs=False, - cv=True): +def sample_randomX(n, + p, + fit_intercept, + standardize, + rng=None, + alt=False, + s=10, + prop=0.8, + penalty_facs=False, + cv=True, + upper_limits=np.inf): if rng is None: rng = np.random.default_rng(0) @@ -157,7 +232,8 @@ def sample_gaussian(n, GN = GLMNet(response_id='response', fit_intercept=fit_intercept, standardize=standardize, - penalty_factor=penalty_factor) + penalty_factor=penalty_factor, + upper_limits=upper_limits) m = int(prop*n) @@ -196,37 +272,55 @@ def sample_gaussian(n, @pytest.mark.parametrize('standardize', [True, False]) @pytest.mark.parametrize('penalty_facs', [True, False]) @pytest.mark.parametrize('fit_intercept', [True, False]) +#@pytest.mark.parametrize('upper_limits', [np.inf, 0.1]) @pytest.mark.parametrize('p', [103]) @pytest.mark.parametrize('n', [500]) -def test_gaussian(n, - p, - standardize, - fit_intercept, - penalty_facs): +def test_randomX(n, + p, + standardize, + fit_intercept, + penalty_facs, + upper_limits=np.inf): for _ in range(5): df = None while df is None: # make sure it is run - df = sample_gaussian(n=n, - p=p, - standardize=standardize, - fit_intercept=fit_intercept, - penalty_facs=penalty_facs, - cv=False) - - -def main(fit_intercept=True, - standardize=True, - n=500, - p=75, - s=5, - ntrial=500, - rng=None, - family='gaussian', - alt=False, - cv=False, - orthogonal=False, - penalty_facs=False): + kwargs = dict(n=n, + p=p, + standardize=standardize, + fit_intercept=fit_intercept, + penalty_facs=penalty_facs, + cv=False, + upper_limits=upper_limits) + + df = main(sample_randomX, + kwargs, + ntrial=1) + +def test_orthogonal(p=100): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = main(sample_orthogonal, + {'p':p}, + ntrial=1) + + +def test_AR1(p=100, + rho=0.6): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = sample_AR1(p=p, + rho=rho) + + + +def main(sampler, + kwargs, + ntrial=500): ncover = 0 nsel = 0 @@ -234,18 +328,10 @@ def main(fit_intercept=True, dfs = [] rng = np.random.default_rng(0) + kwargs.update(rng=rng) + for i in range(ntrial): - df = run_inference(n, - p, - fit_intercept, - standardize, - s=s, - rng=rng, - family=family, - alt=alt, - cv=cv, - orthogonal=orthogonal, - penalty_facs=penalty_facs) + df = sampler(**kwargs) if df is not None: dfs.append(df) if len(dfs) > 0: @@ -258,7 +344,8 @@ def main(fit_intercept=True, (all_df['pval'] < 0.05).mean(), (all_df['pivot'] < 0.05).mean(), all_df['pivot'].std()) - + if len(dfs) > 0: + return all_df def test_truncated_inference(B=1000, smoothing_sigma=np.sqrt(1/3), From 26526fc244fd654f443075f7a56acd6bde306ed5 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 8 Jul 2024 22:03:57 -0700 Subject: [PATCH 41/44] added functions for lasso from score form and resampler form --- glmnet/inference.py | 126 ++++++++++++++++++++++++++++++++++++++-- tests/test_inference.py | 95 ++++++++++++++++++++---------- 2 files changed, 187 insertions(+), 34 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index ed6cd91..b8175c8 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -18,8 +18,9 @@ mp.dps = 80 from .base import _get_design +from .glmnet import GLMNet -from dataclasses import dataclass +from dataclasses import dataclass, asdict from typing import Optional @dataclass @@ -236,6 +237,7 @@ def lasso_inference(glmnet_obj, raise NotImplementedError('upper/lower limits coming soon') active_set = np.nonzero(FL.coef_ != 0)[0] + if active_set.shape[0] != 0: inactive_set = np.nonzero(FL.coef_ == 0)[0] @@ -288,8 +290,8 @@ def lasso_inference(glmnet_obj, else: lower_limits = FL.lower_limits[active_set] - upper_limits = (D_sel.scaler_ @ np.hstack([0, upper_limits]))[1:] - lower_limits = (D_sel.scaler_ @ np.hstack([0, lower_limits]))[1:] + upper_limits = D_sel.scaling_ * upper_limits + lower_limits = D_sel.scaling_ * lower_limits P_full = D.quadratic_form(unreg_GLM._information, transformed=True) @@ -309,7 +311,7 @@ def lasso_inference(glmnet_obj, signs = np.sign(FL.coef_[active_set]) noisy_mle = D.raw_to_scaled(unreg_sel_GLM.state_) - + if FL.fit_intercept: penfac = np.hstack([0, penfac]) signs = np.hstack([0, signs]) @@ -343,7 +345,9 @@ def lasso_inference(glmnet_obj, ## the GLM's coef and intercept are on the original scale ## we transform them here to the (typically) unitless "standardized" scale + full_mle = D.raw_to_scaled(unreg_GLM.state_) + if FL.fit_intercept: full_mle = full_mle._stack else: @@ -1063,3 +1067,117 @@ def F(theta): mle = np.nan return L, U, mle, pval, sel_distr + +def score_inference(score, + cov_score, + lamval, + prop=0.8, + chol_cov=None, + perturbation=None, + rng=None): + + # perturbation should be N(0, cov_score) roughly + + # this is the X for the LASSO problem + # X.T @ X = S = cov_score + + if chol_cov is None: + chol_cov = X = np.linalg.cholesky(cov_score).T + + if rng is None: + rng = np.random.default_rng() + + # shorthand + Z = score # X.T @ Y in OLS case + X = chol_cov + p = X.shape[1] + + if perturbation is None: + perturbation = X.T @ rng.standard_normal(p) # X is square here... + + # this is the Y of the LASSO problem + + Y = scipy.linalg.solve_triangular(chol_cov.T, Z, lower=True) + Df = pd.DataFrame({'response':Y}) + + GN = GLMNet(response_id='response', + fit_intercept=False, + standardize=False) + GN.fit(X, Df) + + Z_sel = Z + np.sqrt((1 - prop) / prop) * perturbation + Y_sel = scipy.linalg.solve_triangular(chol_cov.T, Z_sel, lower=True) + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + + return lasso_inference(GN, + prop * lamval / p, + (X_sel, Df_sel, None), + (X, Df, None), + dispersion=1) + +def resampler_inference(sample, + lamval=None, + lam_frac=1, + prop=0.8, + random_idx=None, + rng=None, + estimate=None, + standardize=True): + + if estimate is None: + estimate = sample.mean(0) + + centered = sample - estimate[None,:] + B, p = centered.shape + + if random_idx is None: + if rng is None: + rng = np.random.default_rng() + random_idx = rng.choice(B, 1) + + prec_score = centered.T @ centered / B + cov_score = np.linalg.inv(prec_score) + + centered_scores = centered @ cov_score + scaling = centered_scores.std(0) + score = cov_score @ estimate + + if standardize: + centered_scores /= scaling[None,:] + cov_score /= np.multiply.outer(scaling, scaling) + score /= scaling + + # pick a lam + if lamval is None: + max_scores = np.fabs(centered_scores).max(1) + lamval = lam_frac * np.median(max_scores) + + # pick a pseudo-Gaussian perturbation + perturbation = centered_scores[random_idx].reshape((p,)) + + df = score_inference(score=score, + cov_score=cov_score, + lamval=lamval, + prop=prop, + perturbation=perturbation) + + if df is not None: + if standardize: + for col in ['mle', 'upper', 'lower']: + df[col].values[:] = df[col] / scaling[df.index] + + for j in df.index: + df.loc[j,'TG'] = None + # XX try to correct this!! + # tg = df.loc[j, 'TG'] + # tg_params = asdict(tg) + # tg_params.update(estimate=tg.estimate * scaling[j], + # sigma=tg.sigma * scaling[j], + # smoothing_sigma=tg.smoothing_sigma * scaling[j], + # lower_bound=tg.lower_bound * scaling[j], + # upper_bound=tg.upper_bound * scaling[j]) + # df.loc[j,'TG'] = TruncatedGaussian(**tg_params) + + return df + diff --git a/tests/test_inference.py b/tests/test_inference.py index 7b4902e..4b845bc 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -8,6 +8,8 @@ from glmnet import GLMNet from glmnet.inference import (lasso_inference, + score_inference, + resampler_inference, TruncatedGaussian) def test_Auto(): @@ -124,6 +126,41 @@ def sample_orthogonal(rng=None, return df +def resample_orthogonal(rng=None, + p=50, + s=5, + alt=True, + prop=0.8, + standardize=True, + B=2000): + + # when standardize is True, coverage will be fine but pivot won't be + # this is because the TruncatedGaussian instance is not corrected for scale + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + + scale = np.linspace(1, 3, p) + beta_hat = beta + rng.standard_normal(p) * scale + bootstrap_noise = rng.standard_normal((B, p)) * scale[None,:] + sample = bootstrap_noise + beta_hat[None,:] + + df = resampler_inference(sample, + prop=prop, + standardize=standardize) + if df is not None: + df['target'] = beta[df.index] + if not standardize: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j,'target']) for j in df.index] + else: + # scaling transformation not taken into account so cannot reuse TruncatedGaussian object + df['pivot'] = np.nan * df['pval'] + return df + def sample_AR1(rho=0.6, rng=None, p=100, @@ -148,7 +185,8 @@ def sample_cov(S, p=100, s=5, alt=True, - prop=0.8): + prop=0.8, + lamval=3): if rng is None: rng = np.random.default_rng(0) @@ -163,30 +201,12 @@ def sample_cov(S, mu = S @ beta Z = mu + noise - Y = scipy.linalg.solve_triangular(S_sqrt.T, Z, lower=True) - Df = pd.DataFrame({'response':Y}) - GN = GLMNet(response_id='response', - fit_intercept=False, - standardize=False) - lamval = 3 - Z_sel = Z + np.sqrt((1 - prop) / prop) * X.T @ rng.standard_normal(p) - Y_sel = scipy.linalg.solve_triangular(S_sqrt.T, Z_sel, lower=True) - test_sel = np.linalg.inv(X.T) @ Z_sel - assert np.allclose(Y_sel, test_sel) - - Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) - X_sel = np.sqrt(prop) * X - GN.fit(X, Df) - pivots = [] - - assert np.allclose(X.T @ Y_sel, Z_sel) - - df = lasso_inference(GN, - prop * lamval / p, - (X_sel, Df_sel, None), - (X, Df, None), - dispersion=1) + df = score_inference(score=Z, + cov_score=S, + lamval=lamval, + prop=prop, + chol_cov=S_sqrt) if df is not None: active = list(df.index) df['target'] = np.linalg.inv(S[active][:,active]) @ mu[active] @@ -203,6 +223,7 @@ def sample_randomX(n, rng=None, alt=False, s=10, + snr=1, prop=0.8, penalty_facs=False, cv=True, @@ -214,11 +235,12 @@ def sample_randomX(n, subs = rng.choice(p, s, replace=False) X = rng.standard_normal((n, p)) - D = np.linspace(1, p, p) / p + 0.2 + D = np.ones(p) # np.linspace(1, p, p) / p + 1 X *= D[None,:] - Y = rng.standard_normal(n) * np.fabs(1+np.random.standard_normal()) + sd = np.fabs(1+np.random.standard_normal()) + Y = rng.standard_normal(n) * sd if alt: - beta[subs] = rng.uniform(3, 5) * rng.choice([-1,1], size=s, replace=True) / np.sqrt(n) + beta[subs] = snr * (rng.uniform(3, 5) * rng.choice([-1,1], size=s, replace=True) / np.sqrt(n)) mu = X @ beta Y += mu @@ -239,13 +261,15 @@ def sample_randomX(n, Df = pd.DataFrame({'response':Y}) GN.fit(X[:m], Df.iloc[:m]) - cv = True + if cv: GN.cross_validation_path(X[:m], Df.iloc[:m]) lamval = GN.index_best_['Mean Squared Error'] else: - eps = rng.standard_normal((X.shape[0], 1000)) - lamval = 1.2 * np.fabs(X.T @ eps).max() / X.shape[0] + eps = rng.standard_normal((X.shape[0], 1000)) * sd + noise_score = X.T @ eps + lamval = 1.2 * np.median(np.fabs(noise_score).max(0)) / X.shape[0] + df = lasso_inference(GN, lamval, (X[:m], Df.iloc[:m], None), @@ -297,6 +321,17 @@ def test_randomX(n, kwargs, ntrial=1) +def test_resampler(p=50, + standardize=True): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = main(resample_orthogonal, + {'p':p}, + ntrial=1) + + def test_orthogonal(p=100): for _ in range(5): From 5e2de209ea4fa42801ee1c812cecc5ddf68a6228 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Mon, 8 Jul 2024 23:12:13 -0700 Subject: [PATCH 42/44] test for non-diagonal covariance with resample --- tests/test_inference.py | 53 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/tests/test_inference.py b/tests/test_inference.py index 4b845bc..39bf74d 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -180,6 +180,51 @@ def sample_AR1(rho=0.6, alt=alt, prop=prop) +def resample_AR1(rng=None, + p=50, + rho=0.6, + s=5, + alt=True, + prop=0.8, + standardize=True, + B=2000): + + # when standardize is True, coverage will be fine but pivot won't be + # this is because the TruncatedGaussian instance is not corrected for scale + + D = np.fabs(np.subtract.outer(np.arange(p), np.arange(p))) + dispersion = 2 + S_init = (rho**D) * dispersion + S = np.linalg.inv(S_init) + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + + S_sqrt = np.linalg.cholesky(S) + beta_hat = beta + S_sqrt @ rng.standard_normal(p) + bootstrap_noise = rng.standard_normal((B, p)) @ S_sqrt.T + sample = bootstrap_noise + beta_hat[None,:] + + S_i = np.linalg.inv(S) + mu = S_i @ beta + + df = resampler_inference(sample, + prop=prop, + standardize=standardize) + if df is not None: + prec_E = S_i[df.index][:,df.index] + df['target'] = np.linalg.inv(prec_E) @ mu[df.index] + if not standardize: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j,'target']) for j in df.index] + else: + # scaling transformation not taken into account so cannot reuse TruncatedGaussian object + df['pivot'] = np.nan * df['pval'] + return df + def sample_cov(S, rng=None, p=100, @@ -374,10 +419,10 @@ def main(sampler, ncover += ((all_df['lower'] < all_df['target']) & (all_df['upper'] > all_df['target'])).sum() nsel += all_df.shape[0] - print('cover:', - ncover / nsel, 'power:', - (all_df['pval'] < 0.05).mean(), - (all_df['pivot'] < 0.05).mean(), all_df['pivot'].std()) + print('cover:', ncover / nsel, + 'power:', (all_df['pval'] < 0.05).mean(), + 'pivot<0.05:', (all_df['pivot'] < 0.05).mean(), + 'std(pivot):', all_df['pivot'].std()) if len(dfs) > 0: return all_df From f1493e333d12c261ab4ac65a966766e582092e6f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 26 Jul 2024 13:09:42 -0700 Subject: [PATCH 43/44] making it possible to store results of inference a bit easier --- tests/test_inference.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_inference.py b/tests/test_inference.py index 39bf74d..d9430e9 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -124,7 +124,7 @@ def sample_orthogonal(rng=None, df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] assert np.allclose(df['pivot'], df['pivot_byhand']) - return df + return df, None def resample_orthogonal(rng=None, p=50, @@ -223,7 +223,7 @@ def resample_AR1(rng=None, else: # scaling transformation not taken into account so cannot reuse TruncatedGaussian object df['pivot'] = np.nan * df['pval'] - return df + return df, None def sample_cov(S, rng=None, @@ -246,12 +246,14 @@ def sample_cov(S, mu = S @ beta Z = mu + noise - + perturbation = X.T @ rng.standard_normal(p) + df = score_inference(score=Z, cov_score=S, lamval=lamval, prop=prop, - chol_cov=S_sqrt) + chol_cov=S_sqrt, + perturbation=perturbation) if df is not None: active = list(df.index) df['target'] = np.linalg.inv(S[active][:,active]) @ mu[active] @@ -259,7 +261,7 @@ def sample_cov(S, if df is not None: df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] - return df + return df, {'score':Z,'cov_score':S, 'lamval':lamval, 'prop':prop, 'chol_cov':S_sqrt, 'perturbation':perturbation} def sample_randomX(n, p, @@ -336,7 +338,7 @@ def sample_randomX(n, if df is not None: df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] - return df + return df, None @pytest.mark.parametrize('standardize', [True, False]) @pytest.mark.parametrize('penalty_facs', [True, False]) @@ -411,7 +413,9 @@ def main(sampler, kwargs.update(rng=rng) for i in range(ntrial): - df = sampler(**kwargs) + df, _ = sampler(**kwargs) + print(kwargs) + print(rng.standard_normal()) if df is not None: dfs.append(df) if len(dfs) > 0: From 21311a6c2c1882ea0bc8257a004db62feaebbd03 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Fri, 26 Jul 2024 13:10:00 -0700 Subject: [PATCH 44/44] making it possible to store results of inference a bit easier --- glmnet/inference.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/glmnet/inference.py b/glmnet/inference.py index b8175c8..8388bb6 100644 --- a/glmnet/inference.py +++ b/glmnet/inference.py @@ -420,7 +420,11 @@ def lasso_inference(glmnet_obj, if FL.fit_intercept: idx = ['intercept'] + idx - return pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us, 'TG':TGs}, index=idx) + TG_df = pd.concat([pd.Series(asdict(tg)) for tg in TGs], axis=1).T + TG_df.index = idx + df = pd.concat([pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx), TG_df], axis=1) + df['TG'] = TGs + return df else: return None @@ -1076,7 +1080,7 @@ def score_inference(score, perturbation=None, rng=None): - # perturbation should be N(0, cov_score) roughly + # perturbation should be N(0, cov_score) roughly -- shouldn't have a multiplier for the proportion! # this is the X for the LASSO problem # X.T @ X = S = cov_score