diff --git a/doc/examples/compute_coverages.py b/doc/examples/compute_coverages.py new file mode 100644 index 000000000..e82c66792 --- /dev/null +++ b/doc/examples/compute_coverages.py @@ -0,0 +1,51 @@ +import numpy as np +import matplotlib.pyplot as plt + +from statsmodels.distributions import ECDF +from selection.randomized.tests.test_opt_weighted_intervals import test_opt_weighted_intervals + + +def compute_coverage(sel_ci, true_vec): + nactive = true_vec.shape[0] + coverage = np.zeros(nactive) + for i in range(nactive): + if true_vec[i]>=sel_ci[i,0] and true_vec[i]<=sel_ci[i,1]: + coverage[i]=1 + return coverage + + +def main(ndraw=5000, burnin=1000, nsim=20): + np.random.seed(1) + + sel_pivots_all = list() + sel_ci_all = list() + rand_all = [] + for i in range(nsim): + for idx, results in enumerate(test_opt_weighted_intervals(ndraw=ndraw, burnin=burnin)): + if results is not None: + (rand, sel_pivots, sel_ci, true_vec) = results + if i==0: + sel_pivots_all.append([]) + rand_all.append(rand) + sel_ci_all.append([]) + sel_pivots_all[idx]=np.concatenate((sel_pivots_all[idx],sel_pivots), axis=0) + sel_ci_all[idx] = np.concatenate((sel_ci_all[idx], compute_coverage(sel_ci, true_vec)), axis=0) + + xval = np.linspace(0, 1, 200) + + for idx in range(len(rand_all)): + fig = plt.figure(num=idx, figsize=(8,8)) + plt.clf() + #sel_pivots_all[idx] = [item for sublist in sel_pivots_all[idx] for item in sublist] + plt.plot(xval, ECDF(sel_pivots_all[idx])(xval), label='selective') + plt.plot(xval, xval, 'k-', lw=1) + plt.legend(loc='lower right') + + #sel_ci_all[idx] = [item for sublist in sel_ci_all[idx] for item in sublist] + #sel_ci_all[idx] = np.vstack(sel_ci_all[idx]) + print("covered", sel_ci_all[idx]) + plt.title(''.join(["coverage ", str(np.mean(sel_ci_all[idx]))])) + plt.savefig(''.join(["fig", rand_all[idx], '.pdf'])) + + for idx in range(len(rand_all)): + print("coverage", np.mean(sel_ci_all[idx])) \ No newline at end of file diff --git a/selection/randomized/M_estimator.py b/selection/randomized/M_estimator.py index e4c3dba86..60ebccd38 100644 --- a/selection/randomized/M_estimator.py +++ b/selection/randomized/M_estimator.py @@ -61,7 +61,7 @@ def __init__(self, loss, epsilon, penalty, randomization, solve_args={'min_its': def solve(self, scaling=1, solve_args={'min_its':20, 'tol':1.e-10}, nboot=2000): - self.randomize() + #self.randomize() (loss, randomized_loss, diff --git a/selection/randomized/glm.py b/selection/randomized/glm.py index 77225441b..2a2f31eca 100644 --- a/selection/randomized/glm.py +++ b/selection/randomized/glm.py @@ -671,21 +671,25 @@ def glm_parametric_covariance(glm_loss, solve_args={'min_its':50, 'tol':1.e-10}) return functools.partial(parametric_cov, glm_loss, solve_args=solve_args) -def standard_split_ci(glm_loss, X, y, active, leftout_indices, alpha=0.1): +def standard_split_ci(glm_loss, X, y, active, leftout_indices, parametric=False, alpha=0.1): """ Data plitting confidence intervals via bootstrap. """ loss = glm_loss(X[leftout_indices,], y[leftout_indices]) - boot_target, target_observed = pairs_bootstrap_glm(loss, active) + observed = restricted_Mest(loss, active) nactive = np.sum(active) - size= np.sum(leftout_indices) - observed = target_observed[:nactive] - boot_target_restricted = lambda indices: boot_target(indices)[:nactive] - sampler = lambda: np.random.choice(size, size=(size,), replace=True) - target_cov = bootstrap_cov(sampler, boot_target_restricted) + if parametric==False: + boot_target, target_observed = pairs_bootstrap_glm(loss, active) + size= np.sum(leftout_indices) + print("size", size) + boot_target_restricted = lambda indices: boot_target(indices)[:nactive] + sampler = lambda: np.random.choice(size, size=(size,), replace=True) + target_cov = bootstrap_cov(sampler, boot_target_restricted) + else: + target_cov=_parametric_cov_glm(loss, active)[:nactive,:nactive] quantile = - ndist.ppf(alpha / float(2)) - LU = np.zeros((2, target_observed.shape[0])) + LU = np.zeros((2, observed.shape[0])) for j in range(observed.shape[0]): sigma = np.sqrt(target_cov[j, j]) LU[0, j] = observed[j] - sigma * quantile diff --git a/selection/randomized/tests/test_opt_weighted_intervals.py b/selection/randomized/tests/test_opt_weighted_intervals.py index 6e45cdaea..a666e0a9c 100644 --- a/selection/randomized/tests/test_opt_weighted_intervals.py +++ b/selection/randomized/tests/test_opt_weighted_intervals.py @@ -21,24 +21,24 @@ def test_opt_weighted_intervals(ndraw=20000, burnin=2000): results = [] cls = lasso - for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['laplace', 'gaussian']): + for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['gaussian']): inst, const = const_info - X, Y, beta = inst(n=100, p=20, s=0, signal=5., sigma=5.)[:3] + X, Y, beta = inst(n=200, p=50, s=0, signal=5., sigma=1., rho=0.)[:3] n, p = X.shape - W = np.ones(X.shape[1]) * 8 - conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=True) + W = np.ones(X.shape[1]) * 1.3 + conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=False) signs = conv.fit() print("signs", signs) - marginalizing_groups = np.ones(p, np.bool) + #marginalizing_groups = np.ones(p, np.bool) #marginalizing_groups[:int(p/2)] = True - conditioning_groups = ~marginalizing_groups + #conditioning_groups = ~marginalizing_groups #conditioning_groups[-int(p/4):] = False - conv.decompose_subgradient(marginalizing_groups=marginalizing_groups, - conditioning_groups=conditioning_groups) + #conv.decompose_subgradient(marginalizing_groups=marginalizing_groups, + # conditioning_groups=conditioning_groups) selected_features = conv._view.selection_variable['variables'] nactive=selected_features.sum() @@ -51,7 +51,7 @@ def test_opt_weighted_intervals(ndraw=20000, burnin=2000): ndraw=ndraw, burnin=burnin, compute_intervals=True) - print(sel_pivots) + print("pivots",sel_pivots) results.append((rand, sel_pivots, sel_ci, beta[selected_features])) return results diff --git a/selection/randomized/tests/test_split_compare.py b/selection/randomized/tests/test_split_compare.py index 875c99058..9c8f31b7f 100644 --- a/selection/randomized/tests/test_split_compare.py +++ b/selection/randomized/tests/test_split_compare.py @@ -9,7 +9,7 @@ from selection.api import (randomization, split_glm_group_lasso) -from ...tests.instance import logistic_instance +from ...tests.instance import logistic_instance, gaussian_instance from ...tests.decorators import (wait_for_return_value, register_report, set_sampling_params_iftrue) @@ -19,7 +19,7 @@ pairs_bootstrap_glm) from ..M_estimator import restricted_Mest -from ..query import naive_confidence_intervals +from ..query import naive_confidence_intervals, multiple_queries @register_report(['pivots_clt', 'covered_clt', @@ -32,31 +32,49 @@ @set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, burnin=10) @wait_for_return_value() def test_split_compare(s=3, - n=200, - p=20, + n=100, + p=50, signal=7, - rho=0.1, + rho=0., split_frac=0.8, lam_frac=0.7, - ndraw=10000, burnin=2000, - solve_args={'min_its':50, 'tol':1.e-10}, check_screen =True): + ndraw=10000, + burnin=2000, + solve_args={'min_its':50, 'tol':1.e-10}, + check_screen =False, + instance = "gaussian"): - X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal) + m = int(split_frac * n) + if instance=="logistic": + X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1.) + loss = rr.glm.logistic(X, y) + lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0)) + loss_rr = rr.glm.logistic + elif instance=="gaussian": + X, y, beta, _,_ = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1.) + loss = rr.glm.gaussian(X,y) + #lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.normal(0, 1. / 2, (n, 10000)))).max(0)) + #print("lam", lam) + lam = lam_frac * np.sqrt(np.log(p)*m/n) + print("lam", lam) + loss_rr = rr.glm.gaussian + else: + raise ValueError("invalid instance") nonzero = np.where(beta)[0] - - loss = rr.glm.logistic(X, y) epsilon = 1. - lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0)) W = np.ones(p)*lam - W[0] = 0 # use at least some unpenalized - penalty = rr.group_lasso(np.arange(p), - weights=dict(zip(np.arange(p), W)), lagrange=1.) - - m = int(split_frac * n) + penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.) M_est = split_glm_group_lasso(loss, epsilon, m, penalty) + M_est.randomize() + leftout_indices = M_est.randomized_loss.saturated_loss.case_weights == 0 + X_used = X[~leftout_indices,:] + lam = lam_frac * np.mean(np.fabs(np.dot(X_used.T, np.random.normal(0, 1., (m, 10000)))).max(0)) + print("lam", lam) + M_est.penalty =rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), np.ones(p)*lam)), lagrange=1.) + M_est.solve() active_union = M_est.selection_variable['variables'] @@ -65,10 +83,7 @@ def test_split_compare(s=3, if nactive==0: return None - leftout_indices = M_est.randomized_loss.saturated_loss.case_weights == 0 - screen = set(nonzero).issubset(np.nonzero(active_union)[0]) - if check_screen and not screen: return None @@ -76,13 +91,10 @@ def test_split_compare(s=3, active_set = np.nonzero(active_union)[0] true_vec = beta[active_union] - selected_features = np.zeros(p, np.bool) - selected_features[active_set] = True - - unpenalized_mle = restricted_Mest(loss, selected_features) + unpenalized_mle = restricted_Mest(loss, active_union) form_covariances = glm_nonparametric_bootstrap(n, n) - target_info, target_observed = pairs_bootstrap_glm(M_est.loss, selected_features, inactive=None) + target_info, target_observed = pairs_bootstrap_glm(M_est.loss, active_union, inactive=None) cov_info = M_est.setup_sampler() target_cov, score_cov = form_covariances(target_info, @@ -92,17 +104,17 @@ def test_split_compare(s=3, opt_sample = M_est.sampler.sample(ndraw, burnin) - pivots = M_est.sampler.coefficient_pvalues(unpenalized_mle, + pivots = M_est.sampler.coefficient_pvalues(unpenalized_mle, target_cov, score_cov, parameter=true_vec, sample=opt_sample) - LU = intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample) + LU = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample) LU_naive = naive_confidence_intervals(np.diag(target_cov), target_observed) if X.shape[0] - leftout_indices.sum() > nactive: - LU_split = standard_split_ci(rr.glm.logistic, X, y, active_union, leftout_indices) + LU_split = standard_split_ci(loss_rr, X, y, active_union, leftout_indices, parametric=False) else: LU_split = np.ones((nactive, 2)) * np.nan @@ -138,9 +150,9 @@ def coverage(LU): ci_length_naive) -def report(niter=3, **kwargs): - kwargs = {'s': 0, 'n': 300, 'p': 20, 'signal': 7, 'split_frac': 0.8} +def report(niter=2, **kwargs): + split_report = reports.reports['test_split_compare'] screened_results = reports.collect_multiple_runs(split_report['test'], split_report['columns'], @@ -148,9 +160,11 @@ def report(niter=3, **kwargs): reports.summarize_all, **kwargs) - fig = reports.boot_clt_plot(screened_results, inactive=True, active=False) - fig.savefig('split_compare_pivots.pdf') # will have both bootstrap and CLT on plot - + #fig = reports.custom_plot(screened_results, labels=['pivots_clt'], colors=['r']) + #fig.savefig('split_compare_pivots.pdf') + return (screened_results) -if __name__=='__main__': - report() +def main(): + kwargs = {'s': 0, 'n': 200, 'p': 50, 'signal': 7, 'lam_frac':2.5, 'split_frac': 0.8, + 'check_screen':True, 'instance':"gaussian"} + report(**kwargs) diff --git a/selection/tests/reports.py b/selection/tests/reports.py index 5b7d047bc..0e2f030e7 100644 --- a/selection/tests/reports.py +++ b/selection/tests/reports.py @@ -50,6 +50,27 @@ def collect_multiple_runs(test_fn, columns, nrun, summary_fn, *args, **kwargs): summary_fn(pd.concat(dfs)) return pd.concat(dfs) + +def custom_plot(multiple_results, labels, colors, screening=False, fig=None): + + if fig is None: + fig = plt.figure() + ax = fig.gca() + + fig.suptitle('Pivots', fontsize=20) + + grid = np.linspace(0, 1, 51) + + for i in range(len(labels)): + ecdf = sm.distributions.ECDF(multiple_results[labels[i]]) + F = ecdf(grid) + ax.plot(grid, F, '--o', c=colors[i], lw=2, label=labels[i]) + + ax.plot([0, 1], [0, 1], 'k-', lw=2) + ax.legend(loc='lower right') + return fig + + def pvalue_plot(multiple_results, screening=False, fig=None, label = '$H_0$', colors=['b','r']): """ Extract pvalues and group by @@ -139,7 +160,7 @@ def split_pvalue_plot(multiple_results, screening=False, fig=None): PA_s = multiple_results['split_pvalue'][multiple_results['active']] # presumes we also have a pvalue - P0 = multiple_results['pvalue'][~multiple_results['active']] + P0 = multiple_results['pivot_clt'][~multiple_results['active']] PA = multiple_results['pvalue'][multiple_results['active']] if fig is None: @@ -465,13 +486,13 @@ def compute_lengths(multiple_results): def compute_length_frac(multiple_results): result = {} - if 'ci_length_clt' and 'ci_length_split' in multiple_results.columns: + if 'ci_length_clt'in multiple_results.columns and 'ci_length_split' in multiple_results.columns: split = multiple_results['ci_length_split'] clt = multiple_results['ci_length_clt'] split = split[~np.isnan(clt)] clt = clt[~np.isnan(clt)] result['split/clt'] = np.median(np.divide(split, clt)) - if 'ci_length_boot' and 'ci_length_split' in multiple_results.columns: + if 'ci_length_boot' in multiple_results.columns and 'ci_length_split' in multiple_results.columns: split = multiple_results['ci_length_split'] boot = multiple_results['ci_length_boot'] split = split[~np.isnan(boot)]