Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions doc/examples/compute_coverages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.distributions import ECDF
from selection.randomized.tests.test_opt_weighted_intervals import test_opt_weighted_intervals


def compute_coverage(sel_ci, true_vec):
nactive = true_vec.shape[0]
coverage = np.zeros(nactive)
for i in range(nactive):
if true_vec[i]>=sel_ci[i,0] and true_vec[i]<=sel_ci[i,1]:
coverage[i]=1
return coverage


def main(ndraw=5000, burnin=1000, nsim=20):
np.random.seed(1)

sel_pivots_all = list()
sel_ci_all = list()
rand_all = []
for i in range(nsim):
for idx, results in enumerate(test_opt_weighted_intervals(ndraw=ndraw, burnin=burnin)):
if results is not None:
(rand, sel_pivots, sel_ci, true_vec) = results
if i==0:
sel_pivots_all.append([])
rand_all.append(rand)
sel_ci_all.append([])
sel_pivots_all[idx]=np.concatenate((sel_pivots_all[idx],sel_pivots), axis=0)
sel_ci_all[idx] = np.concatenate((sel_ci_all[idx], compute_coverage(sel_ci, true_vec)), axis=0)

xval = np.linspace(0, 1, 200)

for idx in range(len(rand_all)):
fig = plt.figure(num=idx, figsize=(8,8))
plt.clf()
#sel_pivots_all[idx] = [item for sublist in sel_pivots_all[idx] for item in sublist]
plt.plot(xval, ECDF(sel_pivots_all[idx])(xval), label='selective')
plt.plot(xval, xval, 'k-', lw=1)
plt.legend(loc='lower right')

#sel_ci_all[idx] = [item for sublist in sel_ci_all[idx] for item in sublist]
#sel_ci_all[idx] = np.vstack(sel_ci_all[idx])
print("covered", sel_ci_all[idx])
plt.title(''.join(["coverage ", str(np.mean(sel_ci_all[idx]))]))
plt.savefig(''.join(["fig", rand_all[idx], '.pdf']))

for idx in range(len(rand_all)):
print("coverage", np.mean(sel_ci_all[idx]))
2 changes: 1 addition & 1 deletion selection/randomized/M_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def __init__(self, loss, epsilon, penalty, randomization, solve_args={'min_its':

def solve(self, scaling=1, solve_args={'min_its':20, 'tol':1.e-10}, nboot=2000):

self.randomize()
#self.randomize()

(loss,
randomized_loss,
Expand Down
20 changes: 12 additions & 8 deletions selection/randomized/glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,21 +671,25 @@ def glm_parametric_covariance(glm_loss, solve_args={'min_its':50, 'tol':1.e-10})
return functools.partial(parametric_cov, glm_loss, solve_args=solve_args)


def standard_split_ci(glm_loss, X, y, active, leftout_indices, alpha=0.1):
def standard_split_ci(glm_loss, X, y, active, leftout_indices, parametric=False, alpha=0.1):
"""
Data plitting confidence intervals via bootstrap.
"""
loss = glm_loss(X[leftout_indices,], y[leftout_indices])
boot_target, target_observed = pairs_bootstrap_glm(loss, active)
observed = restricted_Mest(loss, active)
nactive = np.sum(active)
size= np.sum(leftout_indices)
observed = target_observed[:nactive]
boot_target_restricted = lambda indices: boot_target(indices)[:nactive]
sampler = lambda: np.random.choice(size, size=(size,), replace=True)
target_cov = bootstrap_cov(sampler, boot_target_restricted)
if parametric==False:
boot_target, target_observed = pairs_bootstrap_glm(loss, active)
size= np.sum(leftout_indices)
print("size", size)
boot_target_restricted = lambda indices: boot_target(indices)[:nactive]
sampler = lambda: np.random.choice(size, size=(size,), replace=True)
target_cov = bootstrap_cov(sampler, boot_target_restricted)
else:
target_cov=_parametric_cov_glm(loss, active)[:nactive,:nactive]

quantile = - ndist.ppf(alpha / float(2))
LU = np.zeros((2, target_observed.shape[0]))
LU = np.zeros((2, observed.shape[0]))
for j in range(observed.shape[0]):
sigma = np.sqrt(target_cov[j, j])
LU[0, j] = observed[j] - sigma * quantile
Expand Down
18 changes: 9 additions & 9 deletions selection/randomized/tests/test_opt_weighted_intervals.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,24 @@ def test_opt_weighted_intervals(ndraw=20000, burnin=2000):

results = []
cls = lasso
for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['laplace', 'gaussian']):
for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['gaussian']):

inst, const = const_info

X, Y, beta = inst(n=100, p=20, s=0, signal=5., sigma=5.)[:3]
X, Y, beta = inst(n=200, p=50, s=0, signal=5., sigma=1., rho=0.)[:3]
n, p = X.shape

W = np.ones(X.shape[1]) * 8
conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=True)
W = np.ones(X.shape[1]) * 1.3
conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=False)
signs = conv.fit()
print("signs", signs)

marginalizing_groups = np.ones(p, np.bool)
#marginalizing_groups = np.ones(p, np.bool)
#marginalizing_groups[:int(p/2)] = True
conditioning_groups = ~marginalizing_groups
#conditioning_groups = ~marginalizing_groups
#conditioning_groups[-int(p/4):] = False
conv.decompose_subgradient(marginalizing_groups=marginalizing_groups,
conditioning_groups=conditioning_groups)
#conv.decompose_subgradient(marginalizing_groups=marginalizing_groups,
# conditioning_groups=conditioning_groups)

selected_features = conv._view.selection_variable['variables']
nactive=selected_features.sum()
Expand All @@ -51,7 +51,7 @@ def test_opt_weighted_intervals(ndraw=20000, burnin=2000):
ndraw=ndraw,
burnin=burnin,
compute_intervals=True)
print(sel_pivots)
print("pivots",sel_pivots)
results.append((rand, sel_pivots, sel_ci, beta[selected_features]))

return results
Expand Down
82 changes: 48 additions & 34 deletions selection/randomized/tests/test_split_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from selection.api import (randomization,
split_glm_group_lasso)

from ...tests.instance import logistic_instance
from ...tests.instance import logistic_instance, gaussian_instance
from ...tests.decorators import (wait_for_return_value,
register_report,
set_sampling_params_iftrue)
Expand All @@ -19,7 +19,7 @@
pairs_bootstrap_glm)

from ..M_estimator import restricted_Mest
from ..query import naive_confidence_intervals
from ..query import naive_confidence_intervals, multiple_queries

@register_report(['pivots_clt',
'covered_clt',
Expand All @@ -32,31 +32,49 @@
@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, burnin=10)
@wait_for_return_value()
def test_split_compare(s=3,
n=200,
p=20,
n=100,
p=50,
signal=7,
rho=0.1,
rho=0.,
split_frac=0.8,
lam_frac=0.7,
ndraw=10000, burnin=2000,
solve_args={'min_its':50, 'tol':1.e-10}, check_screen =True):
ndraw=10000,
burnin=2000,
solve_args={'min_its':50, 'tol':1.e-10},
check_screen =False,
instance = "gaussian"):

X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
m = int(split_frac * n)
if instance=="logistic":
X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1.)
loss = rr.glm.logistic(X, y)
lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
loss_rr = rr.glm.logistic
elif instance=="gaussian":
X, y, beta, _,_ = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1.)
loss = rr.glm.gaussian(X,y)
#lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.normal(0, 1. / 2, (n, 10000)))).max(0))
#print("lam", lam)
lam = lam_frac * np.sqrt(np.log(p)*m/n)
print("lam", lam)
loss_rr = rr.glm.gaussian
else:
raise ValueError("invalid instance")

nonzero = np.where(beta)[0]

loss = rr.glm.logistic(X, y)
epsilon = 1.

lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
W = np.ones(p)*lam
W[0] = 0 # use at least some unpenalized
penalty = rr.group_lasso(np.arange(p),
weights=dict(zip(np.arange(p), W)), lagrange=1.)

m = int(split_frac * n)
penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)

M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
M_est.randomize()
leftout_indices = M_est.randomized_loss.saturated_loss.case_weights == 0
X_used = X[~leftout_indices,:]
lam = lam_frac * np.mean(np.fabs(np.dot(X_used.T, np.random.normal(0, 1., (m, 10000)))).max(0))
print("lam", lam)
M_est.penalty =rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), np.ones(p)*lam)), lagrange=1.)

M_est.solve()

active_union = M_est.selection_variable['variables']
Expand All @@ -65,24 +83,18 @@ def test_split_compare(s=3,
if nactive==0:
return None

leftout_indices = M_est.randomized_loss.saturated_loss.case_weights == 0

screen = set(nonzero).issubset(np.nonzero(active_union)[0])

if check_screen and not screen:
return None

if True:
active_set = np.nonzero(active_union)[0]
true_vec = beta[active_union]

selected_features = np.zeros(p, np.bool)
selected_features[active_set] = True

unpenalized_mle = restricted_Mest(loss, selected_features)
unpenalized_mle = restricted_Mest(loss, active_union)

form_covariances = glm_nonparametric_bootstrap(n, n)
target_info, target_observed = pairs_bootstrap_glm(M_est.loss, selected_features, inactive=None)
target_info, target_observed = pairs_bootstrap_glm(M_est.loss, active_union, inactive=None)

cov_info = M_est.setup_sampler()
target_cov, score_cov = form_covariances(target_info,
Expand All @@ -92,17 +104,17 @@ def test_split_compare(s=3,
opt_sample = M_est.sampler.sample(ndraw,
burnin)

pivots = M_est.sampler.coefficient_pvalues(unpenalized_mle,
pivots = M_est.sampler.coefficient_pvalues(unpenalized_mle,
target_cov,
score_cov,
parameter=true_vec,
sample=opt_sample)
LU = intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample)
LU = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample)

LU_naive = naive_confidence_intervals(np.diag(target_cov), target_observed)

if X.shape[0] - leftout_indices.sum() > nactive:
LU_split = standard_split_ci(rr.glm.logistic, X, y, active_union, leftout_indices)
LU_split = standard_split_ci(loss_rr, X, y, active_union, leftout_indices, parametric=False)
else:
LU_split = np.ones((nactive, 2)) * np.nan

Expand Down Expand Up @@ -138,19 +150,21 @@ def coverage(LU):
ci_length_naive)


def report(niter=3, **kwargs):

kwargs = {'s': 0, 'n': 300, 'p': 20, 'signal': 7, 'split_frac': 0.8}
def report(niter=2, **kwargs):

split_report = reports.reports['test_split_compare']
screened_results = reports.collect_multiple_runs(split_report['test'],
split_report['columns'],
niter,
reports.summarize_all,
**kwargs)

fig = reports.boot_clt_plot(screened_results, inactive=True, active=False)
fig.savefig('split_compare_pivots.pdf') # will have both bootstrap and CLT on plot

#fig = reports.custom_plot(screened_results, labels=['pivots_clt'], colors=['r'])
#fig.savefig('split_compare_pivots.pdf')
return (screened_results)

if __name__=='__main__':
report()
def main():
kwargs = {'s': 0, 'n': 200, 'p': 50, 'signal': 7, 'lam_frac':2.5, 'split_frac': 0.8,
'check_screen':True, 'instance':"gaussian"}
report(**kwargs)
27 changes: 24 additions & 3 deletions selection/tests/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,27 @@ def collect_multiple_runs(test_fn, columns, nrun, summary_fn, *args, **kwargs):
summary_fn(pd.concat(dfs))
return pd.concat(dfs)


def custom_plot(multiple_results, labels, colors, screening=False, fig=None):

if fig is None:
fig = plt.figure()
ax = fig.gca()

fig.suptitle('Pivots', fontsize=20)

grid = np.linspace(0, 1, 51)

for i in range(len(labels)):
ecdf = sm.distributions.ECDF(multiple_results[labels[i]])
F = ecdf(grid)
ax.plot(grid, F, '--o', c=colors[i], lw=2, label=labels[i])

ax.plot([0, 1], [0, 1], 'k-', lw=2)
ax.legend(loc='lower right')
return fig


def pvalue_plot(multiple_results, screening=False, fig=None, label = '$H_0$', colors=['b','r']):
"""
Extract pvalues and group by
Expand Down Expand Up @@ -139,7 +160,7 @@ def split_pvalue_plot(multiple_results, screening=False, fig=None):
PA_s = multiple_results['split_pvalue'][multiple_results['active']]

# presumes we also have a pvalue
P0 = multiple_results['pvalue'][~multiple_results['active']]
P0 = multiple_results['pivot_clt'][~multiple_results['active']]
PA = multiple_results['pvalue'][multiple_results['active']]

if fig is None:
Expand Down Expand Up @@ -465,13 +486,13 @@ def compute_lengths(multiple_results):

def compute_length_frac(multiple_results):
result = {}
if 'ci_length_clt' and 'ci_length_split' in multiple_results.columns:
if 'ci_length_clt'in multiple_results.columns and 'ci_length_split' in multiple_results.columns:
split = multiple_results['ci_length_split']
clt = multiple_results['ci_length_clt']
split = split[~np.isnan(clt)]
clt = clt[~np.isnan(clt)]
result['split/clt'] = np.median(np.divide(split, clt))
if 'ci_length_boot' and 'ci_length_split' in multiple_results.columns:
if 'ci_length_boot' in multiple_results.columns and 'ci_length_split' in multiple_results.columns:
split = multiple_results['ci_length_split']
boot = multiple_results['ci_length_boot']
split = split[~np.isnan(boot)]
Expand Down