diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..8edfacf --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +glmnet/_version.py export-subst diff --git a/README.md b/README.md index 09bc0e7..26544cc 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,7 @@ -# pyglmnet -Python bindings to glmnet base source +# glmnet -## Organization - -- `src` contains `glmnet` C++ code for the two main routines - `spwls_exp` and `wls_exp`. -- `tests` contains test data from R so that we can verify we get the - same results in python. Also contains some code for regenerating - data using an instrumented version of glmnet. - - -## Steps to build - -- Assuming Xcode with command line tools is installed already. - -- Also probably a good idee to test in a new virtual env - -### Create a new virtual env, say `pyg`. - -``` -conda create -n pyg python=3.10 -y -conda activate pyg -``` - -### Install requirements - -``` -pip install -r requirements.txt -``` - -#### Specifying `Eigen` +## Specifying `Eigen` By default, build assumes the top level directory contains a directory `eigen` with the `Eigen` headers. This can be achieved with @@ -48,26 +19,6 @@ the appropriate path. pip install . ``` -Load the module: - -``` -import glmnet.glmnetpp as gpp -``` - -This provides two functions: - -- `gpp.wls_exp` for dense $x$ -- `gpp.spwls_exp` for sparse $x$ - -### Testing - -Example invocations can be seen in `tests/test_pyglmnet.py`. So one can run all the tests via: - -``` -pytest tests -``` - -No error messages mean all is well. diff --git a/glmnet/_version.py b/glmnet/_version.py index 9b01ea2..3f737f4 100644 --- a/glmnet/_version.py +++ b/glmnet/_version.py @@ -5,8 +5,9 @@ # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer) +# This file is released into the public domain. +# Generated by versioneer-0.29 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" @@ -15,10 +16,11 @@ import re import subprocess import sys -from typing import Callable, Dict +from typing import Any, Callable, Dict, List, Optional, Tuple +import functools -def get_keywords(): +def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must @@ -34,17 +36,24 @@ def get_keywords(): class VersioneerConfig: """Container for Versioneer configuration parameters.""" + VCS: str + style: str + tag_prefix: str + parentdir_prefix: str + versionfile_source: str + verbose: bool -def get_config(): + +def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" - cfg.tag_prefix = "" - cfg.parentdir_prefix = "ISLP-" - cfg.versionfile_source = "ISLP/_version.py" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "glmnet-" + cfg.versionfile_source = "glmnet/_version.py" cfg.verbose = False return cfg @@ -57,9 +66,9 @@ class NotThisMethod(Exception): HANDLERS: Dict[str, Dict[str, Callable]] = {} -def register_vcs_handler(vcs, method): # decorator +def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" - def decorate(f): + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} @@ -68,11 +77,25 @@ def decorate(f): return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command( + commands: List[str], + args: List[str], + cwd: Optional[str] = None, + verbose: bool = False, + hide_stderr: bool = False, + env: Optional[Dict[str, str]] = None, +) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None + + popen_kwargs: Dict[str, Any] = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + for command in commands: try: dispcmd = str([command] + args) @@ -80,10 +103,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, process = subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr - else None)) + else None), **popen_kwargs) break - except OSError: - e = sys.exc_info()[1] + except OSError as e: if e.errno == errno.ENOENT: continue if verbose: @@ -103,7 +125,11 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, process.returncode -def versions_from_parentdir(parentdir_prefix, root, verbose): +def versions_from_parentdir( + parentdir_prefix: str, + root: str, + verbose: bool, +) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both @@ -128,13 +154,13 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): @register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): +def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. - keywords = {} + keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: @@ -156,7 +182,11 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): +def git_versions_from_keywords( + keywords: Dict[str, str], + tag_prefix: str, + verbose: bool, +) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") @@ -220,7 +250,12 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): +def git_pieces_from_vcs( + tag_prefix: str, + root: str, + verbose: bool, + runner: Callable = run_command +) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -228,13 +263,18 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): version string, meaning we're inside a checked out source tree. """ GITS = ["git"] - TAG_PREFIX_REGEX = "*" if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - TAG_PREFIX_REGEX = r"\*" + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -242,11 +282,10 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", - "%s%s" % (tag_prefix, TAG_PREFIX_REGEX)], - cwd=root) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -256,7 +295,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() - pieces = {} + pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None @@ -335,8 +374,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() @@ -348,14 +387,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" -def render_pep440(pieces): +def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you @@ -380,7 +419,7 @@ def render_pep440(pieces): return rendered -def render_pep440_branch(pieces): +def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards @@ -410,7 +449,7 @@ def render_pep440_branch(pieces): return rendered -def pep440_split_post(ver): +def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the @@ -420,7 +459,7 @@ def pep440_split_post(ver): return vc[0], int(vc[1] or 0) if len(vc) == 2 else None -def render_pep440_pre(pieces): +def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: @@ -432,7 +471,7 @@ def render_pep440_pre(pieces): tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: - rendered += ".post%d.dev%d" % (post_version+1, pieces["distance"]) + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: @@ -444,7 +483,7 @@ def render_pep440_pre(pieces): return rendered -def render_pep440_post(pieces): +def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards @@ -471,7 +510,7 @@ def render_pep440_post(pieces): return rendered -def render_pep440_post_branch(pieces): +def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. @@ -500,7 +539,7 @@ def render_pep440_post_branch(pieces): return rendered -def render_pep440_old(pieces): +def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. @@ -522,7 +561,7 @@ def render_pep440_old(pieces): return rendered -def render_git_describe(pieces): +def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. @@ -542,7 +581,7 @@ def render_git_describe(pieces): return rendered -def render_git_describe_long(pieces): +def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. @@ -562,7 +601,7 @@ def render_git_describe_long(pieces): return rendered -def render(pieces, style): +def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", @@ -598,7 +637,7 @@ def render(pieces, style): "date": pieces.get("date")} -def get_versions(): +def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some diff --git a/glmnet/base.py b/glmnet/base.py index 257fbe0..18708c0 100644 --- a/glmnet/base.py +++ b/glmnet/base.py @@ -55,6 +55,10 @@ def __post_init__(self, standardize, intercept): self.X = (self.X - self.centers_[None,:]) / self.scaling_[None,:] self.X = np.asfortranarray(self.X) + self.unscaler_ = UnscaleOperator(centers=self.centers_, + scaling=self.scaling_) + self.scaler_ = ScaleOperator(centers=self.centers_, + scaling=self.scaling_) # LinearOperator API def _matvec(self, x): @@ -95,8 +99,15 @@ def _rmatvec(self, r): def quadratic_form(self, G=None, - columns=None): + columns=None, + transformed=False): ''' + if transformed is False: compute + + [1 X]'G[1 X[:,E]] + + if transformed is True: compute + A'GA[:,E] where A is the effective matrix @@ -104,6 +115,7 @@ def quadratic_form(self, [1, XS^{-1} - 1 (xm/xs)'] and E is a subset of columns + ''' # A is effective matrix @@ -112,6 +124,9 @@ def quadratic_form(self, # A'G1 = S^{-1}(X' - xm 1')G1 = S^{-1}X'G1 - S^{-1}xm 1'G1 # A'GA = S^{-1}X'GXS^{-1} - S^{-1}X'G1 xm/xs' - xm/xs 1'GXS^{-1} + xm/xs 1'G1 (xm/xs)' + # or, the reverse + # X'GX = SA'GAS + X'G1xm' + xm1'GX - xm1'G1xm' + n, p = self.shape[0], self.shape[1] - 1 if columns is None: @@ -122,35 +137,49 @@ def quadratic_form(self, if G is None: XX_block = self.X.T @ X_R # have to assume this is not too expensive + if scipy.sparse.issparse(self.X): + XX_block = XX_block.toarray() X1_block = self.X.sum(0) # X'1 G_sum = n - + else: GX = G @ X_R G1 = G @ np.ones(G.shape[0]) XX_block = self.X.T @ GX X1_block = self.X.T @ G1 G_sum = G1.sum() - - if scipy.sparse.issparse(XX_block): - XX_block = XX_block.toarray() - - # correct XX_block for standardize - + xm, xs = self.centers_, self.scaling_ + if scipy.sparse.issparse(self.X): # in this case X has not been transformed - if columns is not None: - XX_block -= (np.multiply.outer(X1_block, xm[columns]) + np.multiply.outer(xm, X1_block[columns])) - XX_block += np.multiply.outer(xm, xm[columns]) * G_sum - XX_block /= np.multiply.outer(xs, xs[columns]) - else: - XX_block -= (np.multiply.outer(X1_block, xm) + np.multiply.outer(xm, X1_block)) - XX_block += np.multiply.outer(xm, xm) * G_sum - XX_block /= np.multiply.outer(xs, xs) + # correct XX_block for standardize + + if transformed: + if columns is not None: + XX_block -= (np.multiply.outer(X1_block, xm[columns]) + np.multiply.outer(xm, X1_block[columns])) + XX_block += np.multiply.outer(xm, xm[columns]) * G_sum + XX_block /= np.multiply.outer(xs, xs[columns]) + else: + XX_block -= (np.multiply.outer(X1_block, xm) + np.multiply.outer(xm, X1_block)) + XX_block += np.multiply.outer(xm, xm) * G_sum + XX_block /= np.multiply.outer(xs, xs) + + X1_block -= G_sum * xm + X1_block /= xs + + else: # X will already have been transformed, so + # we only have to undo it if transformed=False + + # or, the reverse + # X'GX = SA'GAS + X'G1xm' + xm1'GX - xm1'G1xm' + + if not transformed: + XX_block *= np.multiply.outer(xs, xs[columns]) + X1_block *= xs + XX_block += (np.multiply.outer(X1_block, xm[columns]) + np.multiply.outer(xm, X1_block[columns])) + XX_block += np.multiply.outer(xm, xm[columns]) * G_sum + X1_block += G_sum * xm - X1_block -= G_sum * xm - X1_block /= xs - Q = np.zeros((XX_block.shape[0] + 1, XX_block.shape[1] + 1)) Q[1:,1:] = XX_block @@ -160,9 +189,103 @@ def quadratic_form(self, else: Q[0,1:] = X1_block Q[0,0] = G_sum - + return Q + def scaled_to_raw(self, state): + """ + Take a "scaled" (intercept, coef) (these are the params + used by glmnet) and return a (intercept, coef) on "raw" scale. + """ + unscaled = self.unscaler_ @ state._stack + coef = unscaled[1:] + intercept = unscaled[0] + klass = state.__class__ + return klass(coef=coef, + intercept=intercept) + + def raw_to_scaled(self, state): + """ + Take a "raw" (intercept, coef) and return a (intercept, coef) + on "scaled" scale (i.e. the scale GLMnet uses in its objective). + """ + unscaled = self.scaler_ @ state._stack + coef = unscaled[1:] + intercept = unscaled[0] + klass = state.__class__ + return klass(coef=coef, + intercept=intercept) + +@dataclass +class UnscaleOperator(LinearOperator): + + scaling : np.ndarray + centers: np.ndarray + + def __post_init__(self): + ncoef = self.scaling.shape[0] + self.shape = (ncoef+1,)*2 + self.dtype = float + + # LinearOperator API + def _matvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[1:] = coef / self.scaling + result[0] = intercept - (result[1:] * self.centers).sum() + + return result + + def _rmatvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[0] = intercept + result[1:] = coef / self.scaling + result[1:] -= intercept * self.centers / self.scaling + + return result + +@dataclass +class ScaleOperator(LinearOperator): + + scaling : np.ndarray + centers: np.ndarray + + def __post_init__(self): + ncoef = self.scaling.shape[0] + self.shape = (ncoef+1,)*2 + self.dtype = float + + # LinearOperator API + def _matvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[1:] = coef * self.scaling + result[0] = intercept + (coef * self.centers).sum() + + return result + + def _rmatvec(self, stacked): + + intercept = float(stacked[0]) + coef = np.squeeze(stacked[1:]) + + result = np.zeros(stacked.shape[0]) + result[0] = intercept + result[1:] = coef * self.scaling + result[1:] += intercept * self.centers + + return result + @add_dataclass_docstring @dataclass class DiagonalOperator(LinearOperator): diff --git a/glmnet/cox.py b/glmnet/cox.py index c2e1f1d..4256850 100644 --- a/glmnet/cox.py +++ b/glmnet/cox.py @@ -57,11 +57,12 @@ def logl_score(self, family, y, sample_weight): + link_parameter = self.link_parameter family._result = family._coxdev(link_parameter, sample_weight) # the gradient is the gradient of the deviance - # we want deviance of the log-likelihood + # we want gradient of the log-likelihood return - family._result.gradient / 2 @dataclass @@ -190,10 +191,10 @@ def _get_family_spec(self, status_id=self.family.status_id, start_id=self.family.start_id) - def _check(self, - X, - y, - check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -212,7 +213,8 @@ def _summarize(self, # IRLS used normalized weights, # this unnormalizes them... - unscaled_precision_ = self.design_.quadratic_form(self._information) + unscaled_precision_ = self.design_.quadratic_form(self._information, + transformed=False) keep = np.ones(unscaled_precision_.shape[0]-1, bool) if exclude is not []: @@ -243,7 +245,10 @@ class RegCoxLM(RegGLM): fit_intercept: Literal[False] = False - def _check(self, X, y, check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -268,10 +273,10 @@ class CoxNet(GLMNet): fit_intercept: Literal[False] = False regularized_estimator: BaseEstimator = RegCoxLM - def _check(self, - X, - y, - check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -290,6 +295,12 @@ def _get_family_spec(self, status_id=self.family.status_id, start_id=self.family.start_id) + def get_LM(self): + return CoxLM(family=self.family, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id) + def _get_initial_state(self, X, y, @@ -305,10 +316,7 @@ def _get_initial_state(self, if keep.sum() > 0: X_keep = X[:,keep] - coxlm = CoxLM(family=self.family, - offset_id=self.offset_id, - weight_id=self.weight_id, - response_id=self.response_id) + coxlm = self.get_LM() coxlm.fit(X_keep, y) coef_[keep] = coxlm.coef_ diff --git a/glmnet/docstrings.py b/glmnet/docstrings.py index 95346ef..31ffed0 100644 --- a/glmnet/docstrings.py +++ b/glmnet/docstrings.py @@ -12,6 +12,11 @@ # Data class containing GLMNet control parameters. # ''', + 'intercept':''' +intercept: + For a Design, is there an intercept? + ''', + 'nlambda':''' nlambda: int Number of values on data-dependent grid of lambda values. diff --git a/glmnet/glm.py b/glmnet/glm.py index 6476cbf..e2ee28c 100644 --- a/glmnet/glm.py +++ b/glmnet/glm.py @@ -28,7 +28,6 @@ _get_data) from .base import (Design, - DiagonalOperator, _get_design) from .scoring import (Scorer, @@ -54,44 +53,23 @@ class GLMFamilySpec(object): base: sm_family.Family = field(default_factory=sm_family.Gaussian) def link(self, - mean_parameter): - mu = mean_parameter # shorthand + mu): return self.base.link(mu) - link.__doc__ = """ -Parameters ----------- -{mean_parameter} - -Returns -------- -{linear_predictor} -""".format(**_docstrings).strip() def deviance(self, y, - mean_parameter, - sample_weight=None): + mu, + sample_weight): if sample_weight is not None: - return self.base.deviance(y, mean_parameter, freq_weights=sample_weight) + return self.base.deviance(y, mu, freq_weights=sample_weight) else: - return self.base.deviance(y, mean_parameter) - deviance.__doc__ = ''' -Parameters ----------- - -{mean_parameter} -{sample_weight} - -Returns -------- -{deviance} - '''.format(**_docstrings).strip() + return self.base.deviance(y, mu) def null_fit(self, y, - fit_intercept=True, - sample_weight=None, - offset=None): + sample_weight, + offset, + fit_intercept): sample_weight = np.asarray(sample_weight) y = np.asarray(y) @@ -131,63 +109,41 @@ def null_fit(self, self, offset, None) + else: state = GLMState(np.zeros(1), 0) state.link_parameter = offset state.mean_parameter = self.base.link.inverse(state.link_parameter) return state - null_fit.__doc__ = ''' -Parameters ----------- - -{mean_parameter} -{fit_intercept} -{sample_weight} -{offset} - -Returns -------- -null_state: GLMState - Fitted null state of GLM. - '''.format(**_docstrings).strip() def get_null_deviance(self, y, - fit_intercept=True, - sample_weight=None, - offset=None): + sample_weight, + offset, + fit_intercept): state0 = self.null_fit(y, - fit_intercept=fit_intercept, - sample_weight=sample_weight, - offset=offset) - D = self.deviance(y, - state0.mean_parameter, - sample_weight=sample_weight) + sample_weight, + offset, + fit_intercept) + D = self.deviance(y, state0.mean_parameter, sample_weight) return state0, D - get_null_deviance.__doc__ = ''' -Parameters ----------- - -{mean_parameter} -{fit_intercept} -{sample_weight} -{offset} + def get_null_state(self, + null_fit, + nvars): + coefold = np.zeros(nvars) # initial coefs = 0 + state = GLMState(coef=coefold, + intercept=null_fit.intercept) + state.mean_parameter = null_fit.mean_parameter + state.link_parameter = null_fit.link_parameter + return state -Returns -------- -null_state: GLMState - Fitted null state of GLM. -{deviance} - '''.format(**_docstrings).strip() - def get_response_and_weights(self, state, - response, + y, offset, sample_weight): - y = response # shorthand family = self.base # some checks for NAs/zeros @@ -205,24 +161,28 @@ def get_response_and_weights(self, return pseudo_response, newton_weights - get_response_and_weights.__doc__ = ''' -Parameters ----------- + def _default_scorers(self): -state: GLMState - State of GLM. -{response} -{offset} -{sample_weight} - -Returns -------- -pseudo_response: np.ndarray - Pseudo-response for (quasi) Newton step. -newton_weights: np.ndarray - Weights to be used for diagonal in Newton step. - - '''.format(**_docstrings).strip() + fam_name = self.base.__class__.__name__ + + def _dev(y, yhat, sample_weight): + return self.deviance(y, yhat, sample_weight) / y.shape[0] + dev_scorer = Scorer(name=f'{fam_name} Deviance', + score=_dev, + maximize=False) + + scorers_ = [dev_scorer, + mse_scorer, + mae_scorer, + ungrouped_mse_scorer, + ungrouped_mae_scorer] + + if isinstance(self.base, sm_family.Binomial): + scorers_.extend([accuracy_scorer, + auc_scorer, + aucpr_scorer]) + + return scorers_ def information(self, state, @@ -246,59 +206,24 @@ def information(self, n = W.shape[0] W = W.reshape(-1) return DiagonalOperator(W) - information.__doc__ = ''' -Parameters ----------- - -state: GLMState - State of GLM. -{sample_weight} - -Returns -------- -information: DiagonalOperator - Diagonal information matrix of the response vector for - `state.mean_pararmeter`. - '''.format(**_docstrings).strip() - - # Private methods - - def _default_scorers(self): - """ - Construct default scorers for GLM. - """ - fam_name = self.base.__class__.__name__ +@dataclass +class DiagonalOperator(LinearOperator): - def _dev(y, yhat, sample_weight): - return self.deviance(y, yhat, sample_weight) / y.shape[0] - dev_scorer = Scorer(name=f'{fam_name} Deviance', - score=_dev, - maximize=False) - - scorers_ = [dev_scorer, - mse_scorer, - mae_scorer, - ungrouped_mse_scorer, - ungrouped_mae_scorer] + weights: np.ndarray - if isinstance(self.base, sm_family.Binomial): - scorers_.extend([accuracy_scorer, - auc_scorer, - aucpr_scorer]) + def __post_init__(self): + self.weights = np.asarray(self.weights).reshape(-1) + n = self.weights.shape[0] + self.shape = (n, n) - return scorers_ + def _matvec(self, arg): + return self.weights * arg.reshape(-1) - def _get_null_state(self, - null_fit, - nvars): - coefold = np.zeros(nvars) # initial coefs = 0 - state = GLMState(coef=coefold, - intercept=null_fit.intercept) - state.mean_parameter = null_fit.mean_parameter - state.link_parameter = null_fit.link_parameter - return state + def _adjoint(self, arg): + return self._matvec(arg) + @add_dataclass_docstring @dataclass class GLMControl(object): @@ -314,9 +239,9 @@ class GLMBaseSpec(object): family: sm_family.Family = field(default_factory=sm_family.Gaussian) fit_intercept: bool = True control: GLMControl = field(default_factory=GLMControl) - response_id: Union[str,int] = None - weight_id: Union[str,int] = None offset_id: Union[str,int] = None + weight_id: Union[str,int] = None + response_id: Union[str,int] = None exclude: list = field(default_factory=list) add_dataclass_docstring(GLMBaseSpec, subs={'control':'control_glm'}) @@ -380,6 +305,7 @@ def logl_score(self, dmu_deta = family.link.inverse_deriv(self.link_parameter) # compute working residual + y = np.asarray(y).reshape(-1) r = (y - self.mu) return sample_weight * r * dmu_deta / varmu @@ -459,7 +385,7 @@ class GLMBase(BaseEstimator, GLMBaseSpec): def _get_regularizer(self, - X): + nvars=None): return GLMRegularizer(fit_intercept=self.fit_intercept) # no standardization for GLM @@ -478,7 +404,7 @@ def _get_family_spec(self, elif isinstance(self.family, GLMFamilySpec): return self.family - def _check(self, X, y, check=True): + def get_data_arrays(self, X, y, check=True): return _get_data(self, X, y, @@ -491,7 +417,8 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 4 options non sklearn API - dispersion=1, + warm_state=None, + dispersion=None, check=True, fit_null=True): @@ -505,7 +432,7 @@ def fit(self, if not hasattr(self, "_family"): self._family = self._get_family_spec(y) - X, y, response, offset, weight = self._check(X, y, check=check) + X, y, response, offset, weight = self.get_data_arrays(X, y, check=check) sample_weight = weight self.sample_weight_ = normed_sample_weight = sample_weight / sample_weight.sum() @@ -530,9 +457,12 @@ def fit(self, # the regularizer stores the warm start if regularizer is None: - regularizer = self._get_regularizer(X) + regularizer = self._get_regularizer(nvars=design.X.shape[1]) self.regularizer_ = regularizer + if warm_state is not None: + self.regularizer_.warm_state = warm_state + if fit_null or not hasattr(self.regularizer_, 'warm_state'): (null_state, self.null_deviance_) = self._family.get_null_deviance( @@ -606,13 +536,14 @@ def obj_function(response, self._set_coef_intercept(state) - if (hasattr(self._family, "base") and + if (dispersion is None and hasattr(self._family, "base") and isinstance(self._family.base, sm_family.Gaussian)): # GLM specific # usual estimate of sigma^2 self.dispersion_ = self.deviance_ / (nobs-nvar-self.fit_intercept) else: self.dispersion_ = dispersion + self.state_ = state return self fit.__doc__ = ''' Fit a GLM. @@ -622,18 +553,14 @@ def obj_function(response, {X} {y} -{sample_weight} -{regularizer} -{dispersion} -{check} -fit_null: bool - Fit a null model if no warm state present in the regularizer. - +{weights} +{summarize} + Returns ------- -self: GLMBase - Fitted GLM. +self: object + GLM class instance. '''.format(**_docstrings) def predict(self, X, prediction_type='response'): @@ -684,39 +611,32 @@ def score(self, X, y, sample_weight=None): '''.format(**_docstrings).strip() def _set_coef_intercept(self, state): - self.coef_ = state.coef.copy() # this makes a copy -- important to make this copy because `state.coef` is persistent - if hasattr(self, 'standardize') and self.standardize: - self.scaling_ = self.design_.scaling_ - self.coef_ /= self.scaling_ - self.intercept_ = state.intercept - (self.coef_ * self.design_.centers_).sum() + raw_state = self.design_.scaled_to_raw(state) + self.coef_ = raw_state.coef + self.intercept_ = raw_state.intercept GLMBase.__doc__ = ''' Base class to fit a Generalized Linear Model (GLM). Base class for `GLMNet`. Parameters ---------- -{family} {fit_intercept} +{summarize} +{family} {control_glm} -{response_id} -{weight_id} -{offset_id} -{exclude} Attributes ----------- - +__________ {coef_} {intercept_} -{regularizer_} +{summary_} +{covariance_} {null_deviance_} {deviance_} {dispersion_} - +{regularizer_} '''.format(**_docstrings) - - @dataclass class GLM(GLMBase): @@ -727,6 +647,7 @@ def fit(self, y, sample_weight=None, regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True): @@ -734,10 +655,11 @@ def fit(self, y, sample_weight=sample_weight, regularizer=regularizer, + warm_state=warm_state, dispersion=dispersion, check=check) - weight = self._check(X, y, check=False)[-1] + weight = self.get_data_arrays(X, y, check=False)[-1] if self.summarize: self.covariance_, self.summary_ = self._summarize(self.exclude, @@ -745,7 +667,6 @@ def fit(self, weight, X.shape) - else: self.summary_ = self.covariance_ = None @@ -760,7 +681,8 @@ def _summarize(self, # IRLS used normalized weights, # this unnormalizes them... - unscaled_precision_ = self.design_.quadratic_form(self._information) + unscaled_precision_ = self.design_.quadratic_form(self._information, + transformed=False) keep = np.ones(unscaled_precision_.shape[0]-1, bool) if exclude is not []: @@ -796,36 +718,6 @@ def _summarize(self, index=index) return covariance_, summary_ -GLM.__doc__ = ''' -Base class to fit a Generalized Linear Model (GLM). Base class for `GLMNet`. - -Parameters ----------- -{family} -{fit_intercept} -{control_glm} -{response_id} -{weight_id} -{offset_id} -{exclude} - -Notes ------ - -This is a test - -Attributes ----------- - -{coef_} -{intercept_} -{regularizer_} -{null_deviance_} -{deviance_} -{dispersion_} -{summary_} -{covariance_} -'''.format(**_docstrings) @dataclass class GaussianGLM(RegressorMixin, GLM): @@ -836,9 +728,9 @@ class BinomialGLM(ClassifierMixin, GLM): family: sm_family.Family = field(default_factory=sm_family.Binomial) - def _check(self, X, y, check=True): + def get_data_arrays(self, X, y, check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) encoder = LabelEncoder() labels = np.asfortranarray(encoder.fit_transform(response)) self.classes_ = encoder.classes_ @@ -851,6 +743,7 @@ def fit(self, y, sample_weight=None, regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True): @@ -865,6 +758,7 @@ def fit(self, y, sample_weight=sample_weight, regularizer=regularizer, # last 4 options non sklearn API + warm_state=warm_state, dispersion=dispersion, check=check) diff --git a/glmnet/glmnet.py b/glmnet/glmnet.py index e8c9953..fdd3ff1 100644 --- a/glmnet/glmnet.py +++ b/glmnet/glmnet.py @@ -66,10 +66,10 @@ class GLMNetSpec(object): class GLMNet(BaseEstimator, GLMNetSpec): - def _check(self, - X, - y, - check=True): + def get_data_arrays(self, + X, + y, + check=True): return _get_data(self, X, y, @@ -89,12 +89,13 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 3 options non sklearn API + warm_state=None, interpolation_grid=None): if not hasattr(self, "_family"): self._family = self._get_family_spec(y) - X, y, response, offset, weight = self._check(X, y) + X, y, response, offset, weight = self.get_data_arrays(X, y) if isinstance(X, pd.DataFrame): self.feature_names_in_ = list(X.columns) @@ -124,7 +125,11 @@ def fit(self, exclude=self.exclude ) - self.reg_glm_est_.fit(X, y, None, fit_null=False) + self.reg_glm_est_.fit(X, + y, + None, + fit_null=False, + warm_state=warm_state) regularizer_ = self.reg_glm_est_.regularizer_ state, keep_ = self._get_initial_state(X, @@ -184,6 +189,8 @@ def fit(self, check=False, fit_null=False) + self.state_ = self.reg_glm_est_.state_ + coefs_.append(self.reg_glm_est_.coef_.copy()) intercepts_.append(self.reg_glm_est_.intercept_) dev_ratios_.append(1 - self.reg_glm_est_.deviance_ / self.null_deviance_) @@ -308,7 +315,7 @@ def cross_validation_path(self, # truncate to the size we got predictions = predictions[:,:self.lambda_values_.shape[0]] - response, offset, weight = self._check(X, y, check=False)[2:] + response, offset, weight = self.get_data_arrays(X, y, check=False)[2:] # adjust for offset # because predictions are just X\beta @@ -457,7 +464,7 @@ def _get_initial_state(self, intercept_ = glm.intercept_ else: if self.fit_intercept: - response, offset, weight = self._check(X, y, check=False)[2:] + response, offset, weight = self.get_data_arrays(X, y, check=False)[2:] state0 = self._family.null_fit(response, weight, offset, @@ -468,5 +475,35 @@ def _get_initial_state(self, intercept_ = 0 return GLMState(coef_, intercept_), keep.astype(float) + def get_GLM(self): + return GLM(family=self.family, + fit_intercept=self.fit_intercept, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id) + + def get_fixed_lambda(self, + lambda_val): + + check_is_fitted(self, ["coefs_", "feature_names_in_"]) + estimator = self.regularized_estimator( + lambda_val=lambda_val, + family=self.family, + alpha=self.alpha, + penalty_factor=self.penalty_factor, + lower_limits=self.lower_limits, + upper_limits=self.upper_limits, + fit_intercept=self.fit_intercept, + standardize=self.standardize, + control=self.control, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id, + exclude=self.exclude + ) + coefs, intercepts = self.interpolate_coefs([lambda_val]) + cls = self.state_.__class__ + state = cls(coefs[0], intercepts[0]) + return estimator, state diff --git a/glmnet/inference.py b/glmnet/inference.py new file mode 100644 index 0000000..8388bb6 --- /dev/null +++ b/glmnet/inference.py @@ -0,0 +1,1187 @@ +""" +This module contains the core code needed for post selection +after LASSO. + +""" + +import warnings +from warnings import warn +from copy import copy + +import numpy as np +import scipy.sparse +import pandas as pd +from scipy.stats import norm as normal_dbn +import statsmodels.api as sm + +import mpmath as mp +mp.dps = 80 + +from .base import _get_design +from .glmnet import GLMNet + +from dataclasses import dataclass, asdict +from typing import Optional + +@dataclass +class TruncatedGaussian(object): + + estimate: float + sigma: float + smoothing_sigma: float + lower_bound: float + upper_bound: float + level: Optional[float] = 0.9 + num_sd: Optional[float] = 10 + num_grid: Optional[float] = 4000 + use_sample: Optional[bool] = False + seed: Optional[int] = 0 + + def _get_family(self, + basept=None): + + if basept is None: + basept = self.estimate + + self._rng = np.random.default_rng(self.seed) + + if not self.use_sample: + grid = np.linspace(basept - self.num_sd * self.sigma, + basept + self.num_sd * self.sigma, self.num_grid) + + weight = (normal_dbn.cdf((self.upper_bound - grid) / self.smoothing_sigma) + - normal_dbn.cdf((self.lower_bound - grid) / self.smoothing_sigma)) + + weight *= normal_dbn.pdf((grid - basept) / self.sigma) + + return discrete_family(grid, weight) + + else: + sample = self._rng.standard_normal(self.num_grid) * self.sigma + basept + weight = (normal_dbn.cdf((self.upper_bound - sample) / self.smoothing_sigma) + - normal_dbn.cdf((self.lower_bound - sample) / self.smoothing_sigma)) + return discrete_family(sample, weight) + + def pvalue(self, + null_value=0, + alternative='twosided', + basept=None): + + if basept is None: + basept = null_value + + if alternative not in ['twosided', 'greater', 'less']: + raise ValueError("alternative should be one of ['twosided', 'greater', 'less']") + _family = self._get_family(basept=basept) + tilt = (null_value - basept) / self.sigma**2 + if alternative in ['less', 'twosided']: + _cdf = _family.cdf(tilt, x=self.estimate) + if alternative == 'less': + pvalue = _cdf + else: + pvalue = 2 * min(_cdf, 1 - _cdf) + else: + pvalue = _family.sf(tilt, x=self.estimate) + return pvalue + + def interval(self, + basept=None): + + if basept is None: + basept = self.estimate + + _family = self._get_family(basept=basept) + + L, U = _family.equal_tailed_interval(self.estimate, + alpha=1-self.level) + L *= self.sigma**2; L += basept + U *= self.sigma**2; U += basept + + return L, U + + def MLE(self, + basept=None): + + if basept is None: + basept = self.estimate + + _family = self._get_family(basept=basept) + + mle, _, _ = _family.MLE(self.estimate) + mle *= self.sigma**2; mle += basept + return mle + +@dataclass +class AffineConstraint(object): + + linear: np.ndarray + offset: np.ndarray + observed: np.ndarray + + def __post_init__(self): + + if not all (self.linear @ self.observed - self.offset <= 0): + raise ValueError('constraint not satisfied for observed data') + + def interval_constraints(self, + target, + gamma, + tol = 1.e-4): + r""" + Given an affine constraint $\{z:Az \leq b \leq \}$ (elementwise) + specified with $A$ as `support_directions` and $b$ as + `support_offset`, a new direction of interest $\eta$, and + an `observed_data` is Gaussian vector $Z \sim N(\mu,\Sigma)$ + with `covariance` matrix $\Sigma$, this + function returns $\eta^TZ$ as well as an interval + bounding this value. + + The interval constructed is such that the endpoints are + independent of $\eta^TZ$, hence the $p$-value + of `Kac Rice`_ + can be used to form an exact pivot. + + Parameters + ---------- + + support_directions : np.float + Matrix specifying constraint, $A$. + + support_offsets : np.float + Offset in constraint, $b$. + + covariance : np.float + Covariance matrix of `observed_data`. + + observed_data : np.float + Observations. + + direction_of_interest : np.float + Direction in which we're interested for the + contrast. + + tol : float + Relative tolerance parameter for deciding + sign of $Az-b$. + + Returns + ------- + + lower_bound : float + + observed : float + + upper_bound : float + + sigma : float + + """ + + # shorthand + A, b, D = (self.linear, + self.offset, + self.observed) + + N = D - gamma * target + + U = A.dot(N) - b + V = A @ gamma + + # Inequalities are now U + V @ target <= 0 + + # adding the zero_coords in the denominator ensures that + # there are no divide-by-zero errors in RHS + # these coords are never used in upper_bound or lower_bound + + zero_coords = V == 0 + RHS = -U / (V + zero_coords) + RHS[zero_coords] = np.nan + pos_coords = V > tol * np.fabs(V).max() + neg_coords = V < -tol * np.fabs(V).max() + + if np.any(pos_coords): + upper_bound = RHS[pos_coords].min() + else: + upper_bound = np.inf + + if np.any(neg_coords): + lower_bound = RHS[neg_coords].max() + else: + lower_bound = -np.inf + + return lower_bound, upper_bound + +def lasso_inference(glmnet_obj, + lambda_val, + selection_data, + full_data, + level=.9, + dispersion=None): + + fixed_lambda, warm_state = glmnet_obj.get_fixed_lambda(lambda_val) + X_sel, Y_sel, weight_sel = selection_data + X_sel, Y_sel, _, _, weight_sel = fixed_lambda.get_data_arrays(X_sel, + Y_sel) + if weight_sel is None: + weight_sel = np.ones(X_sel.shape[0]) + + fixed_lambda.fit(X_sel, + Y_sel, + sample_weight=weight_sel, + warm_state=warm_state) + + FL = fixed_lambda # shorthand + + if (not np.all(FL.upper_limits >= FL.control.big) or + not np.all(FL.lower_limits <= -FL.control.big)): + raise NotImplementedError('upper/lower limits coming soon') + + active_set = np.nonzero(FL.coef_ != 0)[0] + + if active_set.shape[0] != 0: + inactive_set = np.nonzero(FL.coef_ == 0)[0] + + # fit unpenalized model on selection data + + unreg_sel_GLM = glmnet_obj.get_GLM() + + unreg_sel_GLM.summarize = True + unreg_sel_GLM.fit(X_sel[:,active_set], + Y_sel, + sample_weight=weight_sel, + dispersion=dispersion) + + # # quadratic approximation up to scaling and a factor of weight_sel.sum() + + D_sel = _get_design(X_sel[:,active_set], + weight_sel, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + P_noisy = D_sel.quadratic_form(unreg_sel_GLM._information, + transformed=True) + + # fit unpenalized model on full data + + X_full, Y_full, weight_full = full_data + if weight_full is None: + weight_full = np.ones(X_full.shape[0]) + + unreg_GLM = glmnet_obj.get_GLM() + unreg_GLM.summarize = True + unreg_GLM.fit(X_full[:,active_set], + Y_full, + sample_weight=weight_full, + dispersion=dispersion) + + # quadratic approximation + + D = _get_design(X_full[:,active_set], + weight_full, + standardize=glmnet_obj.standardize, + intercept=glmnet_obj.fit_intercept) + + if np.asarray(FL.upper_limits).shape == (): + upper_limits = np.ones(D_sel.shape[1] - 1) * FL.upper_limits + else: + upper_limits = FL.upper_limits[active_set] + + if np.asarray(FL.lower_limits).shape == (): + lower_limits = np.ones(D_sel.shape[1] - 1) * FL.lower_limits + else: + lower_limits = FL.lower_limits[active_set] + + upper_limits = D_sel.scaling_ * upper_limits + lower_limits = D_sel.scaling_ * lower_limits + + P_full = D.quadratic_form(unreg_GLM._information, + transformed=True) + if not FL.fit_intercept: + if FL.penalty_factor is not None: + penfac = FL.penalty_factor[active_set] + else: + penfac = np.ones_like(active_set) + P_full = P_full[1:,1:] + P_noisy = P_noisy[1:,1:] + else: + penfac = np.ones(active_set.shape[0]) + + Q_full = np.linalg.inv(P_full) + Q_noisy = np.linalg.inv(P_noisy) + + signs = np.sign(FL.coef_[active_set]) + + noisy_mle = D.raw_to_scaled(unreg_sel_GLM.state_) + + if FL.fit_intercept: + penfac = np.hstack([0, penfac]) + signs = np.hstack([0, signs]) + stacked = np.hstack([FL.state_.intercept, + FL.state_.coef[active_set]]) + noisy_mle = noisy_mle._stack + else: + stacked = FL.state_.coef[active_set] + noisy_mle = noisy_mle.coef + + # delta = lambda_val * penfac * signs + + # # remember loss of glmnet is normalized by sum of weights + # # when taking newton step adjust by weight_sel.sum() + + # delta = Q_sel @ delta * weight_sel.sum() + # noisy_mle = stacked + delta # unitless scale + + penalized = penfac > 0 + n_penalized = penalized.sum() + n_coef = penalized.shape[0] + row_idx = np.arange(n_penalized) + col_idx = np.nonzero(penalized)[0] + data = -signs[penalized] + sel_P = scipy.sparse.coo_matrix((data, (row_idx, col_idx)), shape=(n_penalized, n_coef)) + + if FL.fit_intercept: + sel_U = sel_L = scipy.sparse.dia_array((np.ones((1, n_coef)), [1]), shape=(n_coef-1, n_coef)) + else: + sel_U = sel_L = scipy.sparse.eye(n_coef) + + ## the GLM's coef and intercept are on the original scale + ## we transform them here to the (typically) unitless "standardized" scale + + full_mle = D.raw_to_scaled(unreg_GLM.state_) + + if FL.fit_intercept: + full_mle = full_mle._stack + else: + full_mle = full_mle.coef + + ## iterate over coordinates + Ls = np.zeros_like(stacked) + Us = np.zeros_like(stacked) + mles = np.zeros_like(stacked) + pvals = np.zeros_like(stacked) + TGs = [] + + transform_to_raw = (D.unscaler_ @ + np.identity(D.shape[1])) + if not FL.fit_intercept: + transform_to_raw = transform_to_raw[1:,1:] + + linear = scipy.sparse.vstack([sel_P, sel_U, -sel_L]) + offset = np.hstack([np.zeros(sel_P.shape[0]), upper_limits, -lower_limits]) + active_con = AffineConstraint(linear=linear, + offset=offset, + observed=stacked) + inactive = True + if inactive: + pf = FL.regularizer_.penalty_factor + + logl_score = FL.state_.logl_score(FL._family, + Y_sel, + weight_sel / weight_sel.sum()) + + # X_{-E}'(Y-X\hat{\beta}_E) -- \hat{\beta}_E is the LASSO soln, not GLM ! + # this is (2nd order Taylor series sense) equivalent to + # X_{-E}'(Y-X\bar{\beta}_E) + X_{-E}'WX_E(X_E'WX_E)^{-1}\lambda_E s_E + # with \bar{\beta}_E the GLM soln + + score_ = (FL.design_.T @ logl_score)[1:] + score_ = score_[inactive_set] + # we now know that `score_` is bounded by \pm lambda_val + + I = scipy.sparse.eye(score_.shape[0]) + L = scipy.sparse.vstack([I, -I]) + O = (np.ones(L.shape[0]) * lambda_val * + np.hstack([pf[inactive_set], + pf[inactive_set]])) + + fudge_factor = 0.02 # allow 2% relative error on inactive gradient + inactive_con = AffineConstraint(linear=L, + offset=O * (1 + fudge_factor), + observed=score_) + + for i in range(transform_to_raw.shape[0]): + ## call selection_interval and return + TG = _split_interval(active_con=active_con, + Q_noisy=Q_noisy * unreg_GLM.dispersion_, + Q_full=Q_full * unreg_GLM.dispersion_, + noisy_observation=noisy_mle, # these must be same scale / shift as glmnet + observation=full_mle, # these must be same scale / shift as glmnet + direction_of_interest=transform_to_raw[i], # this matrix describes the map from glmnet("transform") coords to raw("original") scale + level=level) + + (L, U), mle, pval = (TG.interval(), TG.MLE(), TG.pvalue()) + + Ls[i] = L + Us[i] = U + mles[i] = mle + pvals[i] = pval + TGs.append(TG) + + idx = active_set.tolist() + if FL.fit_intercept: + idx = ['intercept'] + idx + + TG_df = pd.concat([pd.Series(asdict(tg)) for tg in TGs], axis=1).T + TG_df.index = idx + df = pd.concat([pd.DataFrame({'mle': mles, 'pval': pvals, 'lower': Ls, 'upper': Us}, index=idx), TG_df], axis=1) + df['TG'] = TGs + return df + else: + return None + +def _split_interval(active_con, + Q_noisy, + Q_full, + noisy_observation, + observation, + direction_of_interest, + tol = 1.e-4, + level = 0.90, + dispersion=1): + """ + Given an affine in cone constraint $\{z:Az+b \leq 0\}$ (elementwise) + specified with $A$ as `support_directions` and $b$ as + `support_offset`, a new direction of interest $\eta$, and + `noisy_observation` is Gaussian vector $Z \sim N(\mu,\Sigma)$ + with `covariance` matrix $\Sigma$, this + function returns a confidence interval + for $\eta^T\mu$. + + Parameters + ---------- + + support_directions : np.float + Matrix specifying constraint, $A$. + + support_offset : np.float + Offset in constraint, $b$. + + covariance : np.float + Covariance matrix of `observed_data`. + + noisy_observation : np.float + Observations. + + observation : np.float + Observations. + + direction_of_interest : np.float + Direction in which we're interested for the + contrast. + + tol : float + Relative tolerance parameter for deciding + sign of $Az-b$. + + Returns + ------- + + confidence_interval : (float, float) + + """ + + noisy_var = direction_of_interest.T @ Q_noisy @ direction_of_interest + full_var = direction_of_interest.T @ Q_full @ direction_of_interest + noisy_estimate = (direction_of_interest * noisy_observation).sum() + estimate = (direction_of_interest * observation).sum() + + slice_dir = Q_full @ direction_of_interest / full_var + (lower_bound, + upper_bound) = active_con.interval_constraints(noisy_estimate, + slice_dir, + tol=tol) + + sigma = np.sqrt(full_var) + smoothing_sigma = np.sqrt(max(noisy_var - full_var, 0)) + + return TruncatedGaussian(estimate=estimate, + sigma=sigma, + smoothing_sigma=smoothing_sigma, + lower_bound=lower_bound, + upper_bound=upper_bound, + level=level) + +def _norm_interval(lower, upper): + r""" + A multiprecision evaluation of + + .. math:: + + \Phi(U) - \Phi(L) + + Parameters + ---------- + + lower : float + The lower limit $L$ + + upper : float + The upper limit $U$ + + """ + #cdf = normal_dbn.cdf + cdf = mp.ncdf + if lower > 0: + if lower < 4: + return cdf(-lower) - cdf(-upper) + else: + return np.exp(-np.fabs(lower*(upper-lower))) + elif upper < 0: + if upper < -4: + return np.exp(-np.fabs(upper*(upper-lower))) + else: + return cdf(-lower) - cdf(-upper) + else: + return cdf(upper) - cdf(lower) + +def find_root(f, y, lb, ub, tol=1e-6): + """ + searches for solution to f(x) = y in (lb, ub), where + f is a monotone decreasing function + """ + + # make sure solution is in range + a, b = lb, ub + fa, fb = f(a), f(b) + + # assume a < b + if fa > y and fb > y: + while fb > y : + b, fb = b + (b-a), f(b + (b-a)) + elif fa < y and fb < y: + while fa < y : + a, fa = a - (b-a), f(a - (b-a)) + + + # determine the necessary number of iterations + try: + max_iter = int( np.ceil( ( np.log(tol) - np.log(b-a) ) / np.log(0.5) ) ) + except OverflowError: + warnings.warn('root finding failed, returning np.nan') + return np.nan + + + # bisect (slow but sure) until solution is obtained + for _ in range(max_iter): + try: + c, fc = (a+b)/2, f((a+b)/2) + if fc > y: a = c + elif fc < y: b = c + except OverflowError: + warnings.warn('root finding failed, returning np.nan') + return np.nan + + return c + +class discrete_family(object): + + def __init__(self, sufficient_stat, weights, theta=0.): + r""" + A discrete 1-dimensional + exponential family with reference measure $\sum_j w_j \delta_{X_j}$ + and sufficient statistic `sufficient_stat`. For any $\theta$, the distribution + is + + .. math:: + + P_{\theta} = \sum_{j} e^{\theta X_j - \Lambda(\theta)} w_j \delta_{X_j} + + where + + .. math:: + + \Lambda(\theta) = \log \left(\sum_j w_j e^{\theta X_j} \right). + + Parameters + ---------- + + sufficient_stat : `np.float((n))` + + weights : `np.float(n)` + + Notes + ----- + + The weights are normalized to sum to 1. + """ + xw = np.array(sorted(zip(sufficient_stat, weights)), float) + xw[:,1] = np.maximum(xw[:,1], 1e-40) + self._x = xw[:,0] + self._w = xw[:,1] + self._lw = np.log(xw[:,1]) + + self._w /= self._w.sum() # make sure they are a pmf + self.n = len(xw) + self._theta = np.nan + self.theta = theta + + @property + def theta(self): + """ + The natural parameter of the family. + """ + return self._theta + + @theta.setter + def theta(self, _theta): + if _theta != self._theta: + _thetaX = _theta * self.sufficient_stat + self._lw + _largest = _thetaX.max() - 15 # try to avoid over/under flow, 5 seems arbitrary + _exp_thetaX = np.exp(_thetaX - _largest) + _prod = _exp_thetaX + self._partition = np.sum(_prod) + self._pdf = _prod / self._partition + self._partition *= np.exp(_largest) + self._theta = _theta + + @property + def partition(self): + r""" + Partition function at `self.theta`: + + .. math:: + + \sum_j e^{\theta X_j} w_j + """ + if hasattr(self, "_partition"): + return self._partition + + @property + def sufficient_stat(self): + """ + Sufficient statistics of the exponential family. + """ + return self._x + + @property + def weights(self): + """ + Weights of the exponential family. + """ + return self._w + + def pdf(self, theta): + r""" + Density of $P_{\theta}$ with respect to $P_0$. + + Parameters + ---------- + + theta : float + Natural parameter. + + Returns + ------- + + pdf : np.float + + """ + self.theta = theta # compute partition if necessary + return self._pdf + + def cdf(self, theta, x=None, gamma=1): + r""" + The cumulative distribution function of $P_{\theta}$ with + weight `gamma` at `x` + + .. math:: + + P_{\theta}(X < x) + \gamma * P_{\theta}(X = x) + + Parameters + ---------- + + theta : float + Natural parameter. + + x : float (optional) + Where to evaluate CDF. + + gamma : float(optional) + Weight given at `x`. + + Returns + ------- + + cdf : np.float + + """ + pdf = self.pdf(theta) + if x is None: + return np.cumsum(pdf) - pdf * (1 - gamma) + else: + tr = np.sum(pdf * (self.sufficient_stat < x)) + if x in self.sufficient_stat: + tr += gamma * np.sum(pdf[np.where(self.sufficient_stat == x)]) + return tr + + def sf(self, theta, x=None, gamma=0, return_unnorm=False): + r""" + The complementary cumulative distribution function + (i.e. survival function) of $P_{\theta}$ with + weight `gamma` at `x` + + .. math:: + + P_{\theta}(X > x) + \gamma * P_{\theta}(X = x) + + Parameters + ---------- + + theta : float + Natural parameter. + + x : float (optional) + Where to evaluate SF. + + gamma : float(optional) + Weight given at `x`. + + Returns + ------- + + sf : np.float + + """ + pdf = self.pdf(theta) + if x is None: + return np.cumsum(pdf[::-1])[::-1] - pdf * (1 - gamma) + else: + tr = np.sum(pdf * (self.sufficient_stat > x)) + if x in self.sufficient_stat: + tr += gamma * np.sum(pdf[np.where(self.sufficient_stat == x)]) + return tr + + def E(self, theta, func): + r""" + Expectation of `func` under $P_{\theta}$ + + Parameters + ---------- + + theta : float + Natural parameter. + + func : callable + Assumed to be vectorized. + + gamma : float(optional) + Weight given at `x`. + + Returns + ------- + + E : np.float + + """ + T = np.asarray(func(self.sufficient_stat)) + pdf_ = self.pdf(theta) + + if T.ndim == 1: + return (T * pdf_).sum() + else: + val = (T * pdf_[:,None]).sum(0) + return val + + + def Var(self, theta, func): + r""" + Variance of `func` under $P_{\theta}$ + + Parameters + ---------- + + theta : float + Natural parameter. + + func : callable + Assumed to be vectorized. + + Returns + ------- + + var : np.float + + """ + + mu = self.E(theta, func) + return self.E(theta, lambda x: (func(x)-mu)**2) + + def Cov(self, theta, func1, func2): + r""" + Covariance of `func1` and `func2` under $P_{\theta}$ + + Parameters + ---------- + + theta : float + Natural parameter. + + func1, func2 : callable + Assumed to be vectorized. + + Returns + ------- + + cov : np.float + + """ + + mu1 = self.E(theta, func1) + mu2 = self.E(theta, func2) + return self.E(theta, lambda x: (func1(x)-mu1)*(func2(x)-mu2)) + + def equal_tailed_interval(self, + observed, + alpha=0.05, + randomize=True, + auxVar=None, + tol=1e-6): + """ + Form interval by inverting + equal-tailed test with $\alpha/2$ in each tail. + + Parameters + ---------- + + observed : float + Observed sufficient statistic. + + alpha : float (optional) + Size of two-sided test. + + randomize : bool + Perform the randomized test (or conservative test). + + auxVar : [None, float] + If randomizing and not None, use this + as the random uniform variate. + + Returns + ------- + + lower, upper : float + Limits of confidence interval. + + """ + + mu = self.E(self.theta, lambda x: x) + sigma = np.sqrt(self.Var(self.theta, lambda x: x)) + lb = mu - 20 * sigma + ub = mu + 20 * sigma + F = lambda th : self.cdf(th, observed) + L = find_root(F, 1.0 - 0.5 * alpha, lb, ub) + U = find_root(F, 0.5 * alpha, lb, ub) + return L, U + + def equal_tailed_test(self, + theta0, + observed, + alpha=0.05): + r""" + Equal tailed test of H_0:theta=theta_0 + + Parameters + ---------- + + theta0 : float + Natural parameter under null hypothesis. + + observed : float + Observed sufficient statistic. + + alpha : float (optional) + Size of two-sided test. + + randomize : bool + Perform the randomized test (or conservative test). + + Returns + ------- + + decision : np.bool + Is the null hypothesis $H_0:\theta=\theta_0$ rejected? + + Notes + ----- + + We need an auxiliary uniform variable to carry out the randomized test. + Larger auxVar corresponds to x being slightly "larger." It can be passed in, + or chosen at random. If randomize=False, we get a conservative test. + """ + + pval = self.cdf(theta0, observed, gamma=0.5) + return min(pval, 1-pval) < alpha + + def MLE(self, + observed, + initial=0, + max_iter=20, + tol=1.e-4): + + r""" + Compute the maximum likelihood estimator + based on observed sufficient statistic `observed`. + + Parameters + ---------- + + observed : float + Observed value of sufficient statistic + + initial : float + Starting point for Newton-Raphson + + max_iter : int (optional) + Maximum number of Newton-Raphson iterations + + tol : float (optional) + Tolerance parameter for stopping, based + on relative change in parameter estimate. + Iteration stops when the change is smaller + than `tol * max(1, np.fabs(cur_estimate))`. + + Returns + ------- + + theta_hat : float + Maximum likelihood estimator. + + std_err : float + Estimated variance of `theta_hat` based + on inverse of variance of sufficient + statistic at `theta_hat`, i.e. the + observed Fisher information. + + """ + + cur_est = initial + + def first_two_moments(x): + return np.array([x, x**2]).T + + for i in range(max_iter): + cur_moments = self.E(cur_est, first_two_moments) # gradient and + # Hessian of CGF + # (almost) + grad, hessian = (cur_moments[0] - observed, + cur_moments[1] - cur_moments[0]**2) + next_est = cur_est - grad / hessian # newton step + + if np.fabs(next_est - cur_est) < tol * max(1, np.fabs(cur_est)): + break + cur_est = next_est + + if i == max_iter - 1: + warnings.warn('Newton-Raphson failed to converge after %d iterations' % max_iter) + + cur_moments = self.E(cur_est, first_two_moments) # gradient and + # Hessian of CGF + # (almost) + grad, hessian = (cur_moments[0] - observed, + cur_moments[1] - cur_moments[0]**2) + + return cur_est, 1. / hessian, grad + +def _truncated_inference(estimate, + sigma, + smoothing_sigma, + lower_bound, + upper_bound, + basept=None, + level=0.9): + + if basept is None: + basept = estimate + noisy_sigma = np.sqrt(smoothing_sigma**2 + sigma**2) + + if smoothing_sigma > 1e-6 * sigma: + + grid = np.linspace(basept - 10 * sigma, basept + 10 * sigma, 2001) + + weight = (normal_dbn.cdf((upper_bound - grid) / smoothing_sigma) + - normal_dbn.cdf((lower_bound - grid) / smoothing_sigma)) + + weight *= normal_dbn.pdf((grid - basept) / sigma) + + sel_distr = discrete_family(grid, weight) + + use_sample = False + if use_sample: + rng = np.random.default_rng(0) + sample = rng.standard_normal(2000) * sigma + basept + weight_s = (normal_dbn.cdf((upper_bound - sample) / smoothing_sigma) + - normal_dbn.cdf((lower_bound - sample) / smoothing_sigma)) + sel_distr = discrete_family(sample, weight_s) + + L, U = sel_distr.equal_tailed_interval(estimate, + alpha=1-level) + mle, _, _ = sel_distr.MLE(estimate) + mle *= sigma**2; mle += estimate + pval = sel_distr.cdf(-basept / sigma**2, estimate) + pval = 2 * min(pval, 1-pval) + L *= sigma**2; L += basept + U *= sigma**2; U += basept + + else: + warnings.warn('assuming data for selection is same as for inference -- using hard selection') + + lb = estimate - 20 * sigma + ub = estimate + 20 * sigma + + if estimate < lower_bound or estimate > upper_bound: + warn('Constraints not satisfied: returning [-np.inf, np.inf]') + return -np.inf, np.inf, np.nan, np.nan + + def F(theta): + + Z = (estimate - theta) / sigma + Z_L = (lower_bound - theta) / sigma + Z_U = (upper_bound - theta) / sigma + num = _norm_interval(Z_L, Z) + den = _norm_interval(Z_L, Z_U) + if Z_L > 0 and den < 1e-10: + C = np.fabs(Z_L) + D = Z-Z_L + cdf = 1-np.exp(-C*D) + elif Z_U < 0 and den < 1e-10: + C = np.fabs(Z_U) + D = Z_U-Z + if C*D < 0: + raise ValueError + cdf = np.exp(-C*D) + else: + cdf = num / den + return cdf + + + pval = F(0) + pval = 2 * min(pval, 1-pval) + + lb = lower_bound - 20 * sigma + ub = upper_bound + 20 * sigma + + alpha = 0.5 * (1 - level) + L = find_root(F, 1.0 - 0.5 * alpha, lb, ub) + if np.isnan(L): + L = -np.inf + U = find_root(F, 0.5 * alpha, lb, ub) + if np.isnan(U): + U = np.inf + + mle = np.nan + + return L, U, mle, pval, sel_distr + +def score_inference(score, + cov_score, + lamval, + prop=0.8, + chol_cov=None, + perturbation=None, + rng=None): + + # perturbation should be N(0, cov_score) roughly -- shouldn't have a multiplier for the proportion! + + # this is the X for the LASSO problem + # X.T @ X = S = cov_score + + if chol_cov is None: + chol_cov = X = np.linalg.cholesky(cov_score).T + + if rng is None: + rng = np.random.default_rng() + + # shorthand + Z = score # X.T @ Y in OLS case + X = chol_cov + p = X.shape[1] + + if perturbation is None: + perturbation = X.T @ rng.standard_normal(p) # X is square here... + + # this is the Y of the LASSO problem + + Y = scipy.linalg.solve_triangular(chol_cov.T, Z, lower=True) + Df = pd.DataFrame({'response':Y}) + + GN = GLMNet(response_id='response', + fit_intercept=False, + standardize=False) + GN.fit(X, Df) + + Z_sel = Z + np.sqrt((1 - prop) / prop) * perturbation + Y_sel = scipy.linalg.solve_triangular(chol_cov.T, Z_sel, lower=True) + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + + return lasso_inference(GN, + prop * lamval / p, + (X_sel, Df_sel, None), + (X, Df, None), + dispersion=1) + +def resampler_inference(sample, + lamval=None, + lam_frac=1, + prop=0.8, + random_idx=None, + rng=None, + estimate=None, + standardize=True): + + if estimate is None: + estimate = sample.mean(0) + + centered = sample - estimate[None,:] + B, p = centered.shape + + if random_idx is None: + if rng is None: + rng = np.random.default_rng() + random_idx = rng.choice(B, 1) + + prec_score = centered.T @ centered / B + cov_score = np.linalg.inv(prec_score) + + centered_scores = centered @ cov_score + scaling = centered_scores.std(0) + score = cov_score @ estimate + + if standardize: + centered_scores /= scaling[None,:] + cov_score /= np.multiply.outer(scaling, scaling) + score /= scaling + + # pick a lam + if lamval is None: + max_scores = np.fabs(centered_scores).max(1) + lamval = lam_frac * np.median(max_scores) + + # pick a pseudo-Gaussian perturbation + perturbation = centered_scores[random_idx].reshape((p,)) + + df = score_inference(score=score, + cov_score=cov_score, + lamval=lamval, + prop=prop, + perturbation=perturbation) + + if df is not None: + if standardize: + for col in ['mle', 'upper', 'lower']: + df[col].values[:] = df[col] / scaling[df.index] + + for j in df.index: + df.loc[j,'TG'] = None + # XX try to correct this!! + # tg = df.loc[j, 'TG'] + # tg_params = asdict(tg) + # tg_params.update(estimate=tg.estimate * scaling[j], + # sigma=tg.sigma * scaling[j], + # smoothing_sigma=tg.smoothing_sigma * scaling[j], + # lower_bound=tg.lower_bound * scaling[j], + # upper_bound=tg.upper_bound * scaling[j]) + # df.loc[j,'TG'] = TruncatedGaussian(**tg_params) + + return df + diff --git a/glmnet/paths/fastnet.py b/glmnet/paths/fastnet.py index 5246565..248336f 100644 --- a/glmnet/paths/fastnet.py +++ b/glmnet/paths/fastnet.py @@ -11,6 +11,7 @@ from ..base import _get_design from ..glmnet import GLMNet +from ..glm import GLMState from ..elnet import (_check_and_set_limits, _check_and_set_vp, _design_wrapper_args) @@ -61,7 +62,7 @@ def fit(self, else: self.feature_names_in_ = ['X{}'.format(i) for i in range(X.shape[1])] - X, y, response, offset, weight = self._check(X, y) + X, y, response, offset, weight = self.get_data_arrays(X, y) if not scipy.sparse.issparse(X): X = np.asfortranarray(X) @@ -128,6 +129,10 @@ def fit(self, self.coefs_ = result['coefs'] self.intercepts_ = result['intercepts'] + if self.coefs_.ndim == 1: + self.state_ = GLMState(self.coefs_[-1], + self.intercepts_[-1]) + self.lambda_values_ = result['lambda_values'] nfits = self.lambda_values_.shape[0] dev_ratios_ = self._fit['dev'][:nfits] diff --git a/glmnet/paths/fishnet.py b/glmnet/paths/fishnet.py index e638c6f..da28123 100644 --- a/glmnet/paths/fishnet.py +++ b/glmnet/paths/fishnet.py @@ -28,9 +28,12 @@ class FishNet(FastNetMixin): def __post_init__(self): self._family = GLMFamilySpec(base=sm_family.Poisson()) - def _check(self, X, y, check=True): + def get_data_arrays(self, + X, + y, + check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) if np.any(response < 0): raise ValueError("negative responses encountered; not permitted for Poisson family") response = np.asarray(response, float).copy() diff --git a/glmnet/paths/lognet.py b/glmnet/paths/lognet.py index bdd4dfd..b783ec1 100644 --- a/glmnet/paths/lognet.py +++ b/glmnet/paths/lognet.py @@ -41,9 +41,12 @@ def _extract_fits(self, V = super()._extract_fits(X_shape, response_shape) return V - def _check(self, X, y, check=True): + def get_data_arrays(self, + X, + y, + check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) encoder = LabelEncoder() labels = np.asfortranarray(encoder.fit_transform(response)) self.classes_ = encoder.classes_ diff --git a/glmnet/paths/multiclassnet.py b/glmnet/paths/multiclassnet.py index 676831c..05486e4 100644 --- a/glmnet/paths/multiclassnet.py +++ b/glmnet/paths/multiclassnet.py @@ -26,7 +26,7 @@ @dataclass class MultiClassFamily(object): - def default_scorers(self): + def _default_scorers(self): return [accuracy_scorer, misclass_scorer, @@ -68,9 +68,9 @@ def _offset_predictions(self, value = exp_value / exp_value.sum(-1)[:,:,None] return value - def _check(self, X, y, check=True): + def get_data_arrays(self, X, y, check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + X, y, response, offset, weight = super().get_data_arrays(X, y, check=check) encoder = OneHotEncoder(sparse_output=False) y_onehot = np.asfortranarray(encoder.fit_transform(response.reshape((-1,1)))) self.categories_ = encoder.categories_[0] diff --git a/glmnet/paths/multigaussnet.py b/glmnet/paths/multigaussnet.py index 7f30c38..b2a3943 100644 --- a/glmnet/paths/multigaussnet.py +++ b/glmnet/paths/multigaussnet.py @@ -25,7 +25,7 @@ @dataclass class MultiClassFamily(object): - def default_scorers(self): + def _default_scorers(self): return [mse_scorer, mae_scorer] @dataclass @@ -49,8 +49,13 @@ def _extract_fits(self, self._fit['dev'] = self._fit['rsq'] return super()._extract_fits(X_shape, response_shape) - def _check(self, X, y, check=True): - X, y, response, offset, weight = super()._check(X, y, check=check) + def get_data_arrays(self, + X, + y, + check=True): + X, y, response, offset, weight = super().get_data_arrays(X, + y, + check=check) return X, y, np.asfortranarray(response), offset, weight def _wrapper_args(self, diff --git a/glmnet/regularized_glm.py b/glmnet/regularized_glm.py index 9d1e294..3cda87a 100644 --- a/glmnet/regularized_glm.py +++ b/glmnet/regularized_glm.py @@ -113,7 +113,7 @@ def newton_step(self, warm = (cur_state.coef, cur_state.intercept, - cur_state.eta) # just X\beta -- doesn't include offset + cur_state.linear_predictor) # just X\beta -- doesn't include offset elnet_fit = self.elnet_estimator.fit(design, z, @@ -141,18 +141,29 @@ class RegGLM(GLM, control: RegGLMControl = field(default_factory=RegGLMControl) + def get_LM(self): + return GLM(family=self.family, + fit_intercept=self.fit_intercept, + offset_id=self.offset_id, + weight_id=self.weight_id, + response_id=self.response_id) + def _get_regularizer(self, - X): + nvars=None): # self.design_ will have been set by now + if nvars is None: + check_is_fitted(self, ["design_"]) + nvars = self.design_.X.shape[1] + return ElNetRegularizer(lambda_val=self.lambda_val, alpha=self.alpha, penalty_factor=self.penalty_factor, lower_limits=self.lower_limits * self.design_.scaling_, upper_limits=self.upper_limits * self.design_.scaling_, fit_intercept=self.fit_intercept, - nvars=X.shape[1], + nvars=nvars, control=self.control, exclude=self.exclude) @@ -170,6 +181,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 3 options non sklearn API + warm_state=None, check=True, fit_null=True): @@ -177,6 +189,7 @@ def fit(self, y, sample_weight=sample_weight, regularizer=regularizer, + warm_state=warm_state, dispersion=1, check=check) @@ -213,6 +226,7 @@ def fit(self, y, sample_weight=None, # ignored regularizer=None, # last 4 options non sklearn API + warm_state=None, dispersion=1, check=True): diff --git a/pyproject.toml b/pyproject.toml index e819950..a6be16e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ requires = ["setuptools>=42", "matplotlib>=3.3.3", "versioneer", "statsmodels", + "mpmath", "coxdev"] build-backend = "setuptools.build_meta" @@ -41,3 +42,11 @@ dynamic = ['version'] [project.license] file = 'LICENSE.txt' +[tool.versioneer] +VCS = "git" +style = "pep440" +versionfile_source = "glmnet/_version.py" +versionfile_build = "glmnet/_version.py" +tag_prefix = "v" +parentdir_prefix = "glmnet-" + diff --git a/setup.py b/setup.py index e4ca64a..2f42b2c 100644 --- a/setup.py +++ b/setup.py @@ -43,9 +43,10 @@ 'gaussnet', 'multigaussnet']] +long_description = open('README.md', 'rt', encoding='utf-8').read() + def main(**extra_args): setup(name='glmnet', - url="http://github.org/intro-stat-learning/glmnetpy", version=versioneer.get_version(), packages = ['glmnet', 'glmnet.paths'], @@ -54,7 +55,7 @@ def main(**extra_args): include_package_data=True, data_files=[], scripts=[], - #long_description=long_description, + long_description=long_description, long_description_content_type=long_description_content_type, cmdclass = cmdclass, **extra_args diff --git a/tests/flex/__init__.py b/tests/flex/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/paths/__init__.py b/tests/paths/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_design.py b/tests/test_design.py index af6c895..2c7dada 100644 --- a/tests/test_design.py +++ b/tests/test_design.py @@ -4,6 +4,7 @@ import statsmodels.api as sm from glmnet.base import Design +from glmnet.glm import GLMState rng = np.random.default_rng(0) n, p, q = 100, 5, 3 @@ -86,55 +87,116 @@ def test_design(X, weights, standardize, intercept): assert np.allclose(design.T @ V_l, design_s.T @ V_l) -# @pytest.mark.parametrize('X', [rng.standard_normal((n, p)), -# scipy.sparse.csc_array(rng.standard_normal((n, p)))] ) -# @pytest.mark.parametrize('weights', [np.ones(n), rng.uniform(1, 2, size=(n,))]) -# @pytest.mark.parametrize('standardize', [True, False]) -# @pytest.mark.parametrize('intercept', [True, False]) -# @pytest.mark.parametrize('gls', [None, -# rng.uniform(1, 2, size=(n,)), -# rng.uniform(1, 2, size=(n,n))]) -# def test_quadratic_form(X, weights, standardize, intercept, gls): -# """ -# test the linear / adjoint maps of Design from an np.ndarray and a scipy.sparse.csc_array -# """ +@pytest.mark.parametrize('X', [rng.standard_normal((n, p)), + scipy.sparse.csc_array(rng.standard_normal((n, p)))] ) +@pytest.mark.parametrize('weights', [np.ones(n), rng.uniform(1, 2, size=(n,))]) +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +@pytest.mark.parametrize('gls', [None, + np.diag(rng.uniform(1, 2, size=(n,))), + rng.uniform(1, 2, size=(n,n))]) +def test_quadratic_form(X, weights, standardize, intercept, gls): + """ + test the linear / adjoint maps of Design from an np.ndarray and a scipy.sparse.csc_array + """ + + if gls is not None and gls.ndim == 2: + gls = (gls + gls.T) / 2 + + X_s = scipy.sparse.csc_array(X) + W = weights + design = Design(X, W, standardize=standardize) + design_s = Design(X_s, W, standardize=standardize) + + columns = [0,1,3] + Q_s = design_s.quadratic_form(G=gls, columns=columns, transformed=standardize) + Q = design.quadratic_form(G=gls, columns=columns, transformed=standardize) + + X_eff = scipy.sparse.csc_array(X).toarray() + + if standardize: + xm = (X_eff * W[:,None]).sum(0) / W.sum() + x2 = (X_eff**2 * W[:,None]).sum(0) / W.sum() + xs = np.sqrt(x2 - xm**2) + else: + xm = np.zeros(p) + xs = np.ones(p) + + X_eff = X_eff / xs[None,:] - np.multiply.outer(np.ones(n), xm / xs) + X_eff = np.concatenate([np.ones((n,1)), X_eff], axis=1) + + if columns is not None: + columns = np.array(columns) + columns += 1 + columns = np.hstack([[0], columns]) + + if gls is None: + Q_eff = X_eff.T @ X_eff + elif gls.ndim == 1: + Q_eff = X_eff.T @ (gls[:, None] * X_eff) + else: + Q_eff = X_eff.T @ gls @ X_eff + if columns is not None: + Q_eff = Q_eff[:, columns] + assert np.allclose(Q, Q_eff) # calculation is done correctly + + assert np.allclose(Q_s, Q) # sparse and dense agree + + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_unscaler(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + M = D.unscaler_ @ np.identity(6) + MT = D.unscaler_.T @ np.identity(6) + assert np.allclose(MT, M.T) + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_scaler(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + M = D.scaler_ @ np.identity(6) + MT = D.scaler_.T @ np.identity(6) + assert np.allclose(MT, M.T) + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_scaler_unscaler_inv(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + M = D.unscaler_ @ np.identity(6) + MI = D.scaler_ @ np.identity(6) + assert np.allclose(M @ MI, np.identity(6)) + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('intercept', [True, False]) +def test_scaler_to_raw_conversion(intercept, + standardize): + + X = np.random.standard_normal((20,5)) + D = Design(X, intercept=True, standardize=True) + raw_state = GLMState(coef=np.random.standard_normal(5), + intercept=np.random.standard_normal()) + scaled_state = D.raw_to_scaled(raw_state) + raw_state_roundtrip = D.scaled_to_raw(scaled_state) + + assert np.allclose(raw_state._stack, + raw_state_roundtrip._stack) + + scaled_state = GLMState(coef=np.random.standard_normal(5), + intercept=np.random.standard_normal()) + raw_state = D.scaled_to_raw(scaled_state) + scaled_state_roundtrip = D.raw_to_scaled(raw_state) + + assert np.allclose(scaled_state._stack, + scaled_state_roundtrip._stack) + -# if gls is not None and gls.ndim == 2: -# gls = (gls + gls.T) / 2 -# X_s = scipy.sparse.csc_array(X) -# design = Design(X, W, standardize=stand) -# design_s = Design(X_s, W, standardize=stand) - -# Q_s = design_s.quadratic_form(G=gls, columns=columns) -# Q = design.quadratic_form(G=gls, columns=columns) - -# X_eff = scipy.sparse.csc_array(X).toarray() -# if stand: -# xm = (X_eff * W[:,None]).sum(0) / W.sum() -# x2 = (X_eff**2 * W[:,None]).sum(0) / W.sum() -# xs = np.sqrt(x2 - xm**2) -# else: -# xm = np.zeros(p) -# xs = np.ones(p) - -# X_eff = X_eff / xs[None,:] - np.multiply.outer(np.ones(n), xm / xs) -# X_eff = np.concatenate([np.ones((n,1)), X_eff], axis=1) - -# if columns is not None: -# columns = np.array(columns) -# columns += 1 -# columns = np.hstack([[0], columns]) - -# if gls is None: -# Q_eff = X_eff.T @ X_eff -# elif gls.ndim == 1: -# Q_eff = X_eff.T @ (gls[:, None] * X_eff) -# else: -# Q_eff = X_eff.T @ gls @ X_eff -# if columns is not None: -# Q_eff = Q_eff[:, columns] -# assert np.allclose(Q, Q_eff) # calculation is done correctly - -# assert np.allclose(Q_s, Q) # sparse and dense agree - - diff --git a/tests/test_inference.py b/tests/test_inference.py new file mode 100644 index 0000000..d9430e9 --- /dev/null +++ b/tests/test_inference.py @@ -0,0 +1,472 @@ +import pytest + +import numpy as np +import pandas as pd +import statsmodels.api as sm + +import scipy.sparse + +from glmnet import GLMNet +from glmnet.inference import (lasso_inference, + score_inference, + resampler_inference, + TruncatedGaussian) + +def test_Auto(): + + df = sm.datasets.get_rdataset('Auto', package='ISLR').data + df = df.set_index('name') + X = np.asarray(df.drop(columns=['mpg'])) + y = np.asarray(df['mpg']) + + GN = GLMNet(response_id='response', + fit_intercept=True, + standardize=True) + + Df = pd.DataFrame({'response':y}) + + GN.fit(X, Df) + prop = 0.8 + n = df.shape[0] + m = int(prop*n) + + df = lasso_inference(GN, + 2 / n, # GN.lambda_values_[min(10, GN.lambda_values_.shape[0]-1)], + (X[:m], Df.iloc[:m], None), + (X, Df, None)) + +def sample_orthogonal(rng=None, + p=50, + s=5, + penalty_facs=False, + alt=True, + prop=0.8): + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + + X = np.identity(p) + Y = rng.standard_normal(p) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + mu = beta + Y += mu + Df = pd.DataFrame({'response':Y}) + + penalty_factor = np.ones(p) + if penalty_facs: + penalty_factor[:p//10] = 0.5 + penalty_factor[p//10:2*p//10] = 0 + + penalty_factor *= p / penalty_factor.sum() + + GN = GLMNet(response_id='response', + fit_intercept=False, + standardize=False, + penalty_factor=penalty_factor) + + lamval = 2 + Y_sel = Y + np.sqrt((1 - prop) / prop) * rng.standard_normal(Y.shape) + active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] + + Df_sel = pd.DataFrame({'response':Y_sel * np.sqrt(prop)}) + X_sel = np.sqrt(prop) * X + GN.fit(X, Df) + active_naive = np.nonzero(np.fabs(Y_sel) > lamval)[0] + + pivots = [] + test_pvals = [] + + if active_naive.shape[0] > 0: + df = lasso_inference(GN, + prop * lamval / p, + (X_sel, Df_sel, None), + (X, Df, None), + dispersion=1) + df['target'] = mu[df.index] + + signs = np.sign(Y_sel[df.index]) + assert np.all(Y_sel[df.index] * signs >= lamval * GN.penalty_factor[df.index]) + + if df is not None: + for j, s in zip(df.index, signs): + if s == 1: + upper_bound = np.inf + if penalty_factor is not None: + lower_bound = lamval * penalty_factor[j] + else: + lower_bound = lamval + else: + if penalty_factor is not None: + upper_bound = -lamval * penalty_factor[j] + else: + upper_bound = -lamval + lower_bound = -np.inf + + tg = df.loc[j,'TG'] + + TG = TruncatedGaussian(estimate=Y[j], + sigma=1, + smoothing_sigma=np.sqrt((1 - prop) / prop), + lower_bound=lower_bound, + upper_bound=upper_bound, + level=0.90) + pval = TG.pvalue() + pivots.append(TG.pvalue(null_value=df.loc[j,'target'])) + test_pvals.append(TG.pvalue(null_value=0)) + assert np.allclose(pval, df.loc[j]['pval']) + df['pivot_byhand'] = pivots + df['pval_byhand'] = test_pvals + assert np.allclose(df['pval'], df['pval_byhand']) + + if df is not None: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] + assert np.allclose(df['pivot'], df['pivot_byhand']) + + return df, None + +def resample_orthogonal(rng=None, + p=50, + s=5, + alt=True, + prop=0.8, + standardize=True, + B=2000): + + # when standardize is True, coverage will be fine but pivot won't be + # this is because the TruncatedGaussian instance is not corrected for scale + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + + scale = np.linspace(1, 3, p) + beta_hat = beta + rng.standard_normal(p) * scale + bootstrap_noise = rng.standard_normal((B, p)) * scale[None,:] + sample = bootstrap_noise + beta_hat[None,:] + + df = resampler_inference(sample, + prop=prop, + standardize=standardize) + if df is not None: + df['target'] = beta[df.index] + if not standardize: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j,'target']) for j in df.index] + else: + # scaling transformation not taken into account so cannot reuse TruncatedGaussian object + df['pivot'] = np.nan * df['pval'] + return df + +def sample_AR1(rho=0.6, + rng=None, + p=100, + s=5, + alt=True, + prop=0.8, + dispersion=2): + + D = np.fabs(np.subtract.outer(np.arange(p), np.arange(p))) + dispersion = 2 + S = (rho**D) * dispersion + + return sample_cov(S, + rng=rng, + p=p, + s=s, + alt=alt, + prop=prop) + +def resample_AR1(rng=None, + p=50, + rho=0.6, + s=5, + alt=True, + prop=0.8, + standardize=True, + B=2000): + + # when standardize is True, coverage will be fine but pivot won't be + # this is because the TruncatedGaussian instance is not corrected for scale + + D = np.fabs(np.subtract.outer(np.arange(p), np.arange(p))) + dispersion = 2 + S_init = (rho**D) * dispersion + S = np.linalg.inv(S_init) + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + + S_sqrt = np.linalg.cholesky(S) + beta_hat = beta + S_sqrt @ rng.standard_normal(p) + bootstrap_noise = rng.standard_normal((B, p)) @ S_sqrt.T + sample = bootstrap_noise + beta_hat[None,:] + + S_i = np.linalg.inv(S) + mu = S_i @ beta + + df = resampler_inference(sample, + prop=prop, + standardize=standardize) + if df is not None: + prec_E = S_i[df.index][:,df.index] + df['target'] = np.linalg.inv(prec_E) @ mu[df.index] + if not standardize: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j,'target']) for j in df.index] + else: + # scaling transformation not taken into account so cannot reuse TruncatedGaussian object + df['pivot'] = np.nan * df['pval'] + return df, None + +def sample_cov(S, + rng=None, + p=100, + s=5, + alt=True, + prop=0.8, + lamval=3): + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + + S_sqrt = X = np.linalg.cholesky(S).T # X.T @ X = S + + noise = X.T @ rng.standard_normal(p) + if alt: + beta[subs] = rng.standard_normal(s) * 2 + 3 * rng.choice([1,-1]) + mu = S @ beta + + Z = mu + noise + perturbation = X.T @ rng.standard_normal(p) + + df = score_inference(score=Z, + cov_score=S, + lamval=lamval, + prop=prop, + chol_cov=S_sqrt, + perturbation=perturbation) + if df is not None: + active = list(df.index) + df['target'] = np.linalg.inv(S[active][:,active]) @ mu[active] + + if df is not None: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] + + return df, {'score':Z,'cov_score':S, 'lamval':lamval, 'prop':prop, 'chol_cov':S_sqrt, 'perturbation':perturbation} + +def sample_randomX(n, + p, + fit_intercept, + standardize, + rng=None, + alt=False, + s=10, + snr=1, + prop=0.8, + penalty_facs=False, + cv=True, + upper_limits=np.inf): + + if rng is None: + rng = np.random.default_rng(0) + beta = np.zeros(p) + subs = rng.choice(p, s, replace=False) + + X = rng.standard_normal((n, p)) + D = np.ones(p) # np.linspace(1, p, p) / p + 1 + X *= D[None,:] + sd = np.fabs(1+np.random.standard_normal()) + Y = rng.standard_normal(n) * sd + if alt: + beta[subs] = snr * (rng.uniform(3, 5) * rng.choice([-1,1], size=s, replace=True) / np.sqrt(n)) + + mu = X @ beta + Y += mu + + if penalty_facs: + penalty_factor = np.ones(p) + penalty_factor[:10] = 0.5 + else: + penalty_factor = None + + GN = GLMNet(response_id='response', + fit_intercept=fit_intercept, + standardize=standardize, + penalty_factor=penalty_factor, + upper_limits=upper_limits) + + m = int(prop*n) + + Df = pd.DataFrame({'response':Y}) + GN.fit(X[:m], Df.iloc[:m]) + + if cv: + GN.cross_validation_path(X[:m], Df.iloc[:m]) + lamval = GN.index_best_['Mean Squared Error'] + else: + eps = rng.standard_normal((X.shape[0], 1000)) * sd + noise_score = X.T @ eps + lamval = 1.2 * np.median(np.fabs(noise_score).max(0)) / X.shape[0] + + df = lasso_inference(GN, + lamval, + (X[:m], Df.iloc[:m], None), + (X, Df, None)) + + if df is not None: + if fit_intercept: + active_set = np.array(df.index[1:]).astype(int) + else: + active_set = np.array(df.index).astype(int) + X_sel = X[:,active_set] + + if fit_intercept: + X_sel = np.column_stack([np.ones(X_sel.shape[0]), X_sel]) + targets = np.linalg.pinv(X_sel) @ mu + + df['target'] = targets + + if df is not None: + df['pivot'] = [df.loc[j,'TG'].pvalue(df.loc[j, 'target']) for j in df.index] + + return df, None + +@pytest.mark.parametrize('standardize', [True, False]) +@pytest.mark.parametrize('penalty_facs', [True, False]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +#@pytest.mark.parametrize('upper_limits', [np.inf, 0.1]) +@pytest.mark.parametrize('p', [103]) +@pytest.mark.parametrize('n', [500]) +def test_randomX(n, + p, + standardize, + fit_intercept, + penalty_facs, + upper_limits=np.inf): + + for _ in range(5): + df = None + while df is None: # make sure it is run + kwargs = dict(n=n, + p=p, + standardize=standardize, + fit_intercept=fit_intercept, + penalty_facs=penalty_facs, + cv=False, + upper_limits=upper_limits) + + df = main(sample_randomX, + kwargs, + ntrial=1) + +def test_resampler(p=50, + standardize=True): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = main(resample_orthogonal, + {'p':p}, + ntrial=1) + + +def test_orthogonal(p=100): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = main(sample_orthogonal, + {'p':p}, + ntrial=1) + + +def test_AR1(p=100, + rho=0.6): + + for _ in range(5): + df = None + while df is None: # make sure it is run + df = sample_AR1(p=p, + rho=rho) + + + +def main(sampler, + kwargs, + ntrial=500): + + ncover = 0 + nsel = 0 + + dfs = [] + + rng = np.random.default_rng(0) + kwargs.update(rng=rng) + + for i in range(ntrial): + df, _ = sampler(**kwargs) + print(kwargs) + print(rng.standard_normal()) + if df is not None: + dfs.append(df) + if len(dfs) > 0: + all_df = pd.concat(dfs) + ncover += ((all_df['lower'] < all_df['target']) & (all_df['upper'] > all_df['target'])).sum() + nsel += all_df.shape[0] + + print('cover:', ncover / nsel, + 'power:', (all_df['pval'] < 0.05).mean(), + 'pivot<0.05:', (all_df['pivot'] < 0.05).mean(), + 'std(pivot):', all_df['pivot'].std()) + + if len(dfs) > 0: + return all_df + +def test_truncated_inference(B=1000, + smoothing_sigma=np.sqrt(1/3), + sigma=1, + upper_bound=None, + lower_bound=None, + alt=False, + level=0.9): + + pvals = [] + cover = [] + + rng = np.random.default_rng(0) + + for _ in range(B): + mu = rng.standard_normal() + 3 + if not alt: + mu *= 0 + if lower_bound is None or upper_bound is None: + lower_bound = rng.uniform(0, 2) + upper_bound = lower_bound + rng.uniform(3, 5) + while True: + Z = rng.standard_normal() * sigma + Z += mu + Z_noisy = Z + rng.standard_normal() * smoothing_sigma + if (Z_noisy > lower_bound) and (Z_noisy < upper_bound): + TG = TruncatedGaussian(estimate=Z, + sigma=sigma, + smoothing_sigma=smoothing_sigma, + lower_bound=lower_bound, + upper_bound=upper_bound, + level=level) + + (L, U), mle, pval = (TG.interval(), TG.MLE(), TG.pvalue()) + + cover.append((Lmu)) + pvals.append(pval) + print(np.mean(cover), np.mean(np.array(pvals) < 0.05)) + break +