jonathan-taylor · jelena-markovic · Nov 2, 2017 · Nov 2, 2017 · Nov 3, 2017 · Nov 6, 2017
diff --git a/doc/examples/compute_coverages.py b/doc/examples/compute_coverages.py
@@ -0,0 +1,51 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+from statsmodels.distributions import ECDF
+from selection.randomized.tests.test_opt_weighted_intervals import test_opt_weighted_intervals
+
+
+def compute_coverage(sel_ci, true_vec):
+    nactive = true_vec.shape[0]
+    coverage = np.zeros(nactive)
+    for i in range(nactive):
+        if true_vec[i]>=sel_ci[i,0] and true_vec[i]<=sel_ci[i,1]:
+            coverage[i]=1
+    return coverage
+
+
+def main(ndraw=5000, burnin=1000, nsim=20):
+    np.random.seed(1)
+
+    sel_pivots_all = list()
+    sel_ci_all = list()
+    rand_all = []
+    for i in range(nsim):
+        for idx, results in enumerate(test_opt_weighted_intervals(ndraw=ndraw, burnin=burnin)):
+            if results is not None:
+                (rand, sel_pivots, sel_ci, true_vec) = results
+                if i==0:
+                    sel_pivots_all.append([])
+                    rand_all.append(rand)
+                    sel_ci_all.append([])
+                sel_pivots_all[idx]=np.concatenate((sel_pivots_all[idx],sel_pivots), axis=0)
+                sel_ci_all[idx] = np.concatenate((sel_ci_all[idx], compute_coverage(sel_ci, true_vec)), axis=0)
+
+    xval = np.linspace(0, 1, 200)
+
+    for idx in range(len(rand_all)):
+        fig = plt.figure(num=idx, figsize=(8,8))
+        plt.clf()
+        #sel_pivots_all[idx] = [item for sublist in sel_pivots_all[idx] for item in sublist]
+        plt.plot(xval, ECDF(sel_pivots_all[idx])(xval), label='selective')
+        plt.plot(xval, xval, 'k-', lw=1)
+        plt.legend(loc='lower right')
+
+        #sel_ci_all[idx] = [item for sublist in sel_ci_all[idx] for item in sublist]
+        #sel_ci_all[idx] = np.vstack(sel_ci_all[idx])
+        print("covered", sel_ci_all[idx])
+        plt.title(''.join(["coverage ", str(np.mean(sel_ci_all[idx]))]))
+        plt.savefig(''.join(["fig", rand_all[idx], '.pdf']))
+
+    for idx in range(len(rand_all)):
+        print("coverage", np.mean(sel_ci_all[idx]))
diff --git a/selection/randomized/M_estimator.py b/selection/randomized/M_estimator.py
@@ -61,7 +61,7 @@ def __init__(self, loss, epsilon, penalty, randomization, solve_args={'min_its':
 
     def solve(self, scaling=1, solve_args={'min_its':20, 'tol':1.e-10}, nboot=2000):
 
-        self.randomize()
+        #self.randomize()
 
         (loss,
          randomized_loss,

diff --git a/selection/randomized/glm.py b/selection/randomized/glm.py
@@ -671,21 +671,25 @@ def glm_parametric_covariance(glm_loss, solve_args={'min_its':50, 'tol':1.e-10})
     return functools.partial(parametric_cov, glm_loss, solve_args=solve_args)
 
 
-def standard_split_ci(glm_loss, X, y, active, leftout_indices, alpha=0.1):
+def standard_split_ci(glm_loss, X, y, active, leftout_indices, parametric=False, alpha=0.1):
     """
     Data plitting confidence intervals via bootstrap.
     """
     loss = glm_loss(X[leftout_indices,], y[leftout_indices])
-    boot_target, target_observed = pairs_bootstrap_glm(loss, active)
+    observed = restricted_Mest(loss, active)
     nactive = np.sum(active)
-    size= np.sum(leftout_indices)
-    observed = target_observed[:nactive]
-    boot_target_restricted = lambda indices: boot_target(indices)[:nactive]
-    sampler = lambda: np.random.choice(size, size=(size,), replace=True)
-    target_cov = bootstrap_cov(sampler, boot_target_restricted)
+    if parametric==False:
+        boot_target, target_observed = pairs_bootstrap_glm(loss, active)
+        size= np.sum(leftout_indices)
+        print("size", size)
+        boot_target_restricted = lambda indices: boot_target(indices)[:nactive]
+        sampler = lambda: np.random.choice(size, size=(size,), replace=True)
+        target_cov = bootstrap_cov(sampler, boot_target_restricted)
+    else:
+        target_cov=_parametric_cov_glm(loss, active)[:nactive,:nactive]
 
     quantile = - ndist.ppf(alpha / float(2))
-    LU = np.zeros((2, target_observed.shape[0]))
+    LU = np.zeros((2, observed.shape[0]))
     for j in range(observed.shape[0]):
         sigma = np.sqrt(target_cov[j, j])
         LU[0, j] = observed[j] - sigma * quantile

diff --git a/selection/randomized/tests/test_opt_weighted_intervals.py b/selection/randomized/tests/test_opt_weighted_intervals.py
@@ -21,24 +21,24 @@ def test_opt_weighted_intervals(ndraw=20000, burnin=2000):
 
     results = []
     cls = lasso
-    for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['laplace', 'gaussian']):
+    for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['gaussian']):
 
         inst, const = const_info
 
-        X, Y, beta = inst(n=100, p=20, s=0, signal=5., sigma=5.)[:3]
+        X, Y, beta = inst(n=200, p=50, s=0, signal=5., sigma=1., rho=0.)[:3]
         n, p = X.shape
 
-        W = np.ones(X.shape[1]) * 8
-        conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=True)
+        W = np.ones(X.shape[1]) * 1.3
+        conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=False)
         signs = conv.fit()
         print("signs", signs)
 
-        marginalizing_groups = np.ones(p, np.bool)
+        #marginalizing_groups = np.ones(p, np.bool)
         #marginalizing_groups[:int(p/2)] = True
-        conditioning_groups = ~marginalizing_groups
+        #conditioning_groups = ~marginalizing_groups
         #conditioning_groups[-int(p/4):] = False
-        conv.decompose_subgradient(marginalizing_groups=marginalizing_groups,
-                                   conditioning_groups=conditioning_groups)
+        #conv.decompose_subgradient(marginalizing_groups=marginalizing_groups,
+        #                           conditioning_groups=conditioning_groups)
 
         selected_features = conv._view.selection_variable['variables']
         nactive=selected_features.sum()
@@ -51,7 +51,7 @@ def test_opt_weighted_intervals(ndraw=20000, burnin=2000):
                                                         ndraw=ndraw,
                                                         burnin=burnin,
                                                         compute_intervals=True)
-            print(sel_pivots)
+            print("pivots",sel_pivots)
             results.append((rand, sel_pivots, sel_ci, beta[selected_features]))
 
     return results

diff --git a/selection/randomized/tests/test_split_compare.py b/selection/randomized/tests/test_split_compare.py
@@ -9,7 +9,7 @@
 from selection.api import (randomization, 
                            split_glm_group_lasso)
 
-from ...tests.instance import logistic_instance
+from ...tests.instance import logistic_instance, gaussian_instance
 from ...tests.decorators import (wait_for_return_value, 
                                  register_report, 
                                  set_sampling_params_iftrue)
@@ -19,7 +19,7 @@
                    pairs_bootstrap_glm)
 
 from ..M_estimator import restricted_Mest
-from ..query import naive_confidence_intervals
+from ..query import naive_confidence_intervals, multiple_queries
 
 @register_report(['pivots_clt', 
                   'covered_clt', 
@@ -32,31 +32,49 @@
 @set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, burnin=10)
 @wait_for_return_value()
 def test_split_compare(s=3,
-                       n=200,
-                       p=20,
+                       n=100,
+                       p=50,
                        signal=7,
-                       rho=0.1,
+                       rho=0.,
                        split_frac=0.8,
                        lam_frac=0.7,
-                       ndraw=10000, burnin=2000,
-                       solve_args={'min_its':50, 'tol':1.e-10}, check_screen =True):
+                       ndraw=10000,
+                       burnin=2000,
+                       solve_args={'min_its':50, 'tol':1.e-10},
+                       check_screen =False,
+                       instance = "gaussian"):
 
-    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
+    m = int(split_frac * n)
+    if instance=="logistic":
+        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1.)
+        loss = rr.glm.logistic(X, y)
+        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
+        loss_rr = rr.glm.logistic
+    elif instance=="gaussian":
+        X, y, beta, _,_ = gaussian_instance(n=n, p=p, s=s, rho=rho, signal=signal, sigma=1.)
+        loss = rr.glm.gaussian(X,y)
+        #lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.normal(0, 1. / 2, (n, 10000)))).max(0))
+        #print("lam", lam)
+        lam = lam_frac * np.sqrt(np.log(p)*m/n)
+        print("lam", lam)
+        loss_rr = rr.glm.gaussian
+    else:
+        raise ValueError("invalid instance")
 
     nonzero = np.where(beta)[0]
-
-    loss = rr.glm.logistic(X, y)
     epsilon = 1.
 
-    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
     W = np.ones(p)*lam
-    W[0] = 0 # use at least some unpenalized
-    penalty = rr.group_lasso(np.arange(p),
-                             weights=dict(zip(np.arange(p), W)), lagrange=1.)
-
-    m = int(split_frac * n)
+    penalty = rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), W)), lagrange=1.)
 
     M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
+    M_est.randomize()
+    leftout_indices = M_est.randomized_loss.saturated_loss.case_weights == 0
+    X_used = X[~leftout_indices,:]
+    lam = lam_frac * np.mean(np.fabs(np.dot(X_used.T, np.random.normal(0, 1., (m, 10000)))).max(0))
+    print("lam", lam)
+    M_est.penalty =rr.group_lasso(np.arange(p), weights=dict(zip(np.arange(p), np.ones(p)*lam)), lagrange=1.)
+
     M_est.solve()
 
     active_union = M_est.selection_variable['variables']
@@ -65,24 +83,18 @@ def test_split_compare(s=3,
     if nactive==0:
         return None
 
-    leftout_indices = M_est.randomized_loss.saturated_loss.case_weights == 0
-
     screen = set(nonzero).issubset(np.nonzero(active_union)[0])
-
     if check_screen and not screen:
         return None
 
     if True:
         active_set = np.nonzero(active_union)[0]
         true_vec = beta[active_union]
 
-        selected_features = np.zeros(p, np.bool)
-        selected_features[active_set] = True
-
-        unpenalized_mle = restricted_Mest(loss, selected_features)
+        unpenalized_mle = restricted_Mest(loss, active_union)
 
         form_covariances = glm_nonparametric_bootstrap(n, n)
-        target_info, target_observed = pairs_bootstrap_glm(M_est.loss, selected_features, inactive=None)
+        target_info, target_observed = pairs_bootstrap_glm(M_est.loss, active_union, inactive=None)
 
         cov_info = M_est.setup_sampler()
         target_cov, score_cov = form_covariances(target_info,  
@@ -92,17 +104,17 @@ def test_split_compare(s=3,
         opt_sample = M_est.sampler.sample(ndraw,
                                           burnin)
 
-        pivots = M_est.sampler.coefficient_pvalues(unpenalized_mle, 
+        pivots = M_est.sampler.coefficient_pvalues(unpenalized_mle,
                                                    target_cov, 
                                                    score_cov, 
                                                    parameter=true_vec,
                                                    sample=opt_sample)
-        LU = intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample)
+        LU = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample)
 
         LU_naive = naive_confidence_intervals(np.diag(target_cov), target_observed)
 
         if X.shape[0] - leftout_indices.sum() > nactive:
-            LU_split = standard_split_ci(rr.glm.logistic, X, y, active_union, leftout_indices)
+            LU_split = standard_split_ci(loss_rr, X, y, active_union, leftout_indices, parametric=False)
         else:
             LU_split = np.ones((nactive, 2)) * np.nan
 
@@ -138,19 +150,21 @@ def coverage(LU):
                 ci_length_naive)
 
 
-def report(niter=3, **kwargs):
 
-    kwargs = {'s': 0, 'n': 300, 'p': 20, 'signal': 7, 'split_frac': 0.8}
+def report(niter=2, **kwargs):
+
     split_report = reports.reports['test_split_compare']
     screened_results = reports.collect_multiple_runs(split_report['test'],
                                                      split_report['columns'],
                                                      niter,
                                                      reports.summarize_all,
                                                      **kwargs)
 
-    fig = reports.boot_clt_plot(screened_results, inactive=True, active=False)
-    fig.savefig('split_compare_pivots.pdf') # will have both bootstrap and CLT on plot
-
+    #fig = reports.custom_plot(screened_results, labels=['pivots_clt'], colors=['r'])
+    #fig.savefig('split_compare_pivots.pdf')
+    return (screened_results)
 
-if __name__=='__main__':
-    report()
+def main():
+    kwargs = {'s': 0, 'n': 200, 'p': 50, 'signal': 7, 'lam_frac':2.5, 'split_frac': 0.8,
+              'check_screen':True, 'instance':"gaussian"}
+    report(**kwargs)
diff --git a/selection/tests/reports.py b/selection/tests/reports.py
@@ -50,6 +50,27 @@ def collect_multiple_runs(test_fn, columns, nrun, summary_fn, *args, **kwargs):
             summary_fn(pd.concat(dfs))
     return pd.concat(dfs)
 
+
+def custom_plot(multiple_results, labels, colors, screening=False, fig=None):
+
+    if fig is None:
+        fig = plt.figure()
+    ax = fig.gca()
+
+    fig.suptitle('Pivots', fontsize=20)
+
+    grid = np.linspace(0, 1, 51)
+
+    for i in range(len(labels)):
+        ecdf = sm.distributions.ECDF(multiple_results[labels[i]])
+        F = ecdf(grid)
+        ax.plot(grid, F, '--o', c=colors[i], lw=2, label=labels[i])
+
+    ax.plot([0, 1], [0, 1], 'k-', lw=2)
+    ax.legend(loc='lower right')
+    return fig
+
+
 def pvalue_plot(multiple_results, screening=False, fig=None, label = '$H_0$', colors=['b','r']):
     """
     Extract pvalues and group by 
@@ -139,7 +160,7 @@ def split_pvalue_plot(multiple_results, screening=False, fig=None):
     PA_s = multiple_results['split_pvalue'][multiple_results['active']]
 
     # presumes we also have a pvalue
-    P0 = multiple_results['pvalue'][~multiple_results['active']]
+    P0 = multiple_results['pivot_clt'][~multiple_results['active']]
     PA = multiple_results['pvalue'][multiple_results['active']]
 
     if fig is None:
@@ -465,13 +486,13 @@ def compute_lengths(multiple_results):
 
 def compute_length_frac(multiple_results):
     result = {}
-    if 'ci_length_clt' and 'ci_length_split' in multiple_results.columns:
+    if 'ci_length_clt'in multiple_results.columns and 'ci_length_split' in multiple_results.columns:
         split = multiple_results['ci_length_split']
         clt = multiple_results['ci_length_clt']
         split = split[~np.isnan(clt)]
         clt = clt[~np.isnan(clt)]
         result['split/clt'] = np.median(np.divide(split, clt))
-    if 'ci_length_boot' and 'ci_length_split' in multiple_results.columns:
+    if 'ci_length_boot' in multiple_results.columns and 'ci_length_split' in multiple_results.columns:
         split = multiple_results['ci_length_split']
         boot = multiple_results['ci_length_boot']
         split = split[~np.isnan(boot)]