NGO-Algorithm-Audit · fholstege · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 6, 2024
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,9 @@
 *.ipynb_checkpoints
 *.DS_Store
 *__pycache__
+*.env
+*fpr_experiment
+*.csv
+*DUO_CBS
+*paper_analysis
+*sim
diff --git a/paper_analysis/analysis.ipynb b/paper_analysis/analysis.ipynb
diff --git a/paper_analysis/analysis_check.ipynb b/paper_analysis/analysis_check.ipynb
diff --git a/paper_analysis/analysis_utils.py b/paper_analysis/analysis_utils.py
@@ -0,0 +1,63 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+def plot_diff_in_bias(df, target_col_map={'y': r'$y$', 'y_pred': r'$\hat{y}$', 'err': r'Error'}, color_set='Reds', ax=None, show_legend='auto'):
+    """Draw a grouped bar plot of per-cluster bias differences on one axis.
+
+    df: DataFrame with 'target_col', 'diff_clust' and 'cluster_nr' columns.
+    target_col_map: maps raw 'target_col' values to x-axis labels (read-only).
+    color_set: seaborn palette name; one colour is drawn per distinct cluster.
+    ax: axis to draw on; a new figure and axis are created when None.
+    show_legend: forwarded unchanged to seaborn's ``legend`` argument.
+    Returns the axis that was drawn on.
+    """
+    if ax is None:
+        _, ax = plt.subplots()  # no axis supplied: draw on a fresh figure
+
+    # work on a copy so the caller's frame is untouched: relabel the bias
+    # metrics and shift the cluster numbers by +1 for display
+    df_p = df.copy()
+    df_p['target_col'] = df_p['target_col'].map(target_col_map)
+    df_p['cluster_nr'] = df_p['cluster_nr'] + 1
+
+    # one colour per cluster; error bars are 95% confidence intervals
+    color_palette = sns.color_palette(color_set, n_colors=df_p['cluster_nr'].nunique())
+    sns.barplot(data=df_p, x='target_col', y='diff_clust', hue='cluster_nr', palette=color_palette, errorbar=('ci', 95), ax=ax, legend=show_legend)
+    ax.set_xlabel('Bias metric')
+    ax.set_ylabel('Difference in Bias')
+
+    # title the legend whenever one was requested, not only for 'auto'
+    if show_legend:
+        ax.legend(title='Cluster')
+
+    return ax
+
+def plot_grid_of_bias_diffs(df, K_values, N_values, target_col_map={'y': r'$y$', 'y_pred': r'$\hat{y}$', 'err': r'Error'}, color_set='Reds'):
+    """Plot a grid of bias-difference bar plots: one row per N, one column per K.
+
+    df must also carry 'K' and 'N' columns, which select the rows of each panel.
+    All panels share the y-axis. Returns the matplotlib figure.
+    """
+    # squeeze=False keeps `axes` 2-D even when only one K or one N is given,
+    # so axes[i, j] is always a valid index
+    fig, axes = plt.subplots(len(N_values), len(K_values), figsize=(5 * len(K_values), 5 * len(N_values)), sharey=True, squeeze=False)
+
+    for i, N in enumerate(N_values):
+        for j, K in enumerate(K_values):
+            # only the panels in the first row get a legend
+            show_legend = 'auto' if i == 0 else False
+
+            # select the rows belonging to this (K, N) combination
+            df_mod = df[(df['K'] == K) & (df['N'] == N)].copy()
+
+            # draw the panel with the single-axis helper, then title it
+            plot_diff_in_bias(df_mod, target_col_map, color_set, ax=axes[i, j], show_legend=show_legend)
+            axes[i, j].set_title(r'$K={}$, $N={}$'.format(K, N))
+
+    plt.tight_layout()
+    return fig
+
+
+
+
+
diff --git a/paper_analysis/hbac_dgp.png b/paper_analysis/hbac_dgp.png
diff --git a/paper_analysis/params.py b/paper_analysis/params.py
@@ -0,0 +1,29 @@
+from itertools import product
+
+
+
+
+
+
+# Experiment settings. Every list below is one axis of the parameter grid;
+# n_sims is a plain scalar and is not part of the grid.
+n_sims = 1000
+
+# method and problem size
+method = ['hbac']
+K, N, d = [5], [1000], [2]
+
+# data-generating processes and outcome type
+y_dgp, x_dgp = ['constant', 'linear'], ['random']
+binary_y, target_col = [False], ['y', 'y_pred']
+
+# fitting and testing options
+fit_train, val_frac = [True, False], [0.5]
+n_iter_hbac, min_cluster_size = ['known_clusters'], [5]
+bonf_correct, bootstrap_perm, n_perm = [True, False], [True, False], [1000]
+
+# Cartesian product of all axes: one tuple per configuration, with the
+# fields in exactly the order of the arguments passed to product().
+params = list(product(
+    method, K, N, y_dgp, x_dgp, d, binary_y, fit_train, n_iter_hbac,
+    min_cluster_size, val_frac, bonf_correct, target_col, bootstrap_perm, n_perm))
diff --git a/paper_analysis/requirements.txt b/paper_analysis/requirements.txt
@@ -0,0 +1,54 @@
+appnope==0.1.4
+asttokens==2.4.1
+comm==0.2.2
+contourpy==1.3.0
+cramjam==2.9.0
+cycler==0.12.1
+debugpy==1.8.7
+decorator==5.1.1
+executing==2.1.0
+fastparquet==2024.5.0
+fonttools==4.54.1
+fsspec==2024.10.0
+ipykernel==6.29.5
+ipython==8.29.0
+jedi==0.19.1
+joblib==1.4.2
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+kiwisolver==1.4.7
+kmodes==0.12.2
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+nest-asyncio==1.6.0
+numpy==1.26.4
+packaging==24.1
+pandas==2.2.3
+parso==0.8.4
+patsy==0.5.6
+pexpect==4.9.0
+pillow==11.0.0
+platformdirs==4.3.6
+prompt_toolkit==3.0.48
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+Pygments==2.18.0
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+pyzmq==26.2.0
+scikit-learn==1.5.2
+scipy==1.14.1
+seaborn==0.13.2
+six==1.16.0
+stack-data==0.6.3
+statsmodels==0.14.4
+threadpoolctl==3.5.0
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzdata==2024.2
+wcwidth==0.2.13