Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion benchmarks/liang/README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,39 @@
# Benchmarks
This folder contains code to replicate the benchmarks from the paper.
This folder contains code to replicate the benchmarks from the paper.


```
# requires the dataset to be created beforehand
mkdir plot data
NB_FEATURE=100
# for parallel execution use the script: sim_parallel.py 0 100 100 100 --reset --nthreads 2
for trial in {0..100}; do python sim_liang.py $trial $NB_FEATURE; done
for trial in {0..100}; do python sim_liang.py $trial $NB_FEATURE --cv 10; done
for trial in {0..100}; do python sim_liang.py $trial $NB_FEATURE --robust 10; done
for trial in {0..100}; do python sim_liang.py $trial $NB_FEATURE --cv 10 --robust 10; done
python sim_liang_agg.py # aggregate all the simulated data

# for adding results with Shapley values
for trial in {0..100}; do python sim_shapley.py $trial $NB_FEATURE; done

# create robust: sweep_robust / requires a normal sim run to provide X, Y, truth
for trial in {0..100}; do python sim_robust.py $trial $NB_FEATURE --reset-models; done
python sim_robust_agg.py # aggregate the result


# require result cv, sweep_robust and normal
python sim_agg.py

# Predictor:
python sim_predictors.py 0 100 0 100 --reset --nthreads 8 # create data
python sim_predictors_agg.py # aggregate data
python sim_predictors_importance.py
python sim_predictors_order.py

# requires predictor results
python sim_knockoffs.py

# helper function: sim_model.py


```
6 changes: 3 additions & 3 deletions benchmarks/liang/sim_knockoffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ def run(trial):
ModelInfo(trial, 'Random Forest', None, 'rf')
]

folds = get_model(infos[0], X, y, None, False).folds
folds = get_model(infos[0], X, y, [], False).folds
models = [get_model(info, X, y, folds, False) for info in infos]

# Get the knockoffs for the OLS and neural net models
LINEAR_PATH = 'data/{}/cv_linear.pt'.format(trial)
NONLINEAR_PATH = 'data/{}/cv_nonlinear.pt'.format(trial)
ols_model = torch.load(LINEAR_PATH)
nn_model = torch.load(NONLINEAR_PATH)
ols_model = torch.load(LINEAR_PATH, weights_only=False)
nn_model = torch.load(NONLINEAR_PATH, weights_only=False)
models.append(ols_model)
models.append(nn_model)
infos.append(ModelInfo(trial, 'OLS', None, 'linear'))
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/liang/sim_liang.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ def run(trial, feature, reset, cv, robust):

# Load the checkpoint if available
if not reset and os.path.exists(LINEAR_PATH):
linear_model = torch.load(LINEAR_PATH)
nonlinear_model = torch.load(NONLINEAR_PATH)
linear_model = torch.load(LINEAR_PATH, weights_only=False)
nonlinear_model = torch.load(NONLINEAR_PATH, weights_only=False)
else:
# Train the model
print('Fitting models with N={} P={} S={} T={} nperms={}'.format(N, P, S, T, nperms))
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/liang/sim_liang_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def bounds_plot(bounds):
plt.rc('axes', lw=2)
lower = bounds[:,:,0][~np.isnan(bounds[:,:,0])].flatten()
upper = bounds[:,:,1][~np.isnan(bounds[:,:,1])].flatten()
plt.hist(lower, label='Lower band', color='blue', bins=np.linspace(0,50,51), normed=True)
plt.hist(upper, label='Upper band', color='orange', bins=np.linspace(50,100,51), normed=True)
plt.hist(lower, label='Lower band', color='blue', bins=np.linspace(0,50,51), density=True)
plt.hist(upper, label='Upper band', color='orange', bins=np.linspace(50,100,51), density=True)
plt.xlabel('Band value', fontsize=18, weight='bold')
plt.ylabel('Proportion', fontsize=18, weight='bold')
plt.legend(loc='upper right')
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/liang/sim_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def fit_nn(X, y, nepochs=100, batch_size=10, val_pct=0.1,
if verbose:
print('Validation loss: {} Best: {}'.format(val_losses[epoch], best_loss))

model = torch.load(tmp_file)
model = torch.load(tmp_file, weights_only=False)
os.remove(tmp_file)
return model

Expand Down
4 changes: 2 additions & 2 deletions benchmarks/liang/sim_predictors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
import torch
from sim_liang import load_or_create_dataset
from sklearn.externals import joblib
import joblib
from pyhrt.utils import create_folds
from pyhrt.hrt import hrt

Expand Down Expand Up @@ -125,7 +125,7 @@ def get_r2(trial, info):
return np.load(r2_path + '.npy')
from sklearn.metrics import r2_score
X, y, truth = load_or_create_dataset(trial, None, None, None)
model = get_model(info, X, y, None, False)
model = get_model(info, X, y, [], False)
y_pred = model.predict(X)
score = r2_score(y, y_pred)
np.save(r2_path, score)
Expand Down
1 change: 1 addition & 0 deletions benchmarks/liang/sim_predictors_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def r2_scatter(tpr_vals, r2_vals, names):
for info in infos:
r2_scores[info.name].append(get_r2(trial, info))
all_p_filename = 'data/{}/{}.npy'.format(trial, info.prefix)
print('data/{}/{}.npy'.format(trial, info.prefix))
if not os.path.exists(all_p_filename):
np.save(all_p_filename, np.full(P, np.nan))
p_values[info.name][trial] = np.load(all_p_filename)
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/liang/sim_predictors_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def rf_importance(models):
ModelInfo(trial, 'Elastic Net', None, 'enet'),
ModelInfo(trial, 'Lasso', None, 'lasso')]

models = [get_model(info, None, None, None, False) for info in infos]
models = [get_model(info, None, None, [], False) for info in infos]

# Load the p-values for the predictor models
for info, model in zip(infos, models):
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/liang/sim_shapley.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def main():
X = np.loadtxt(X_PATH, delimiter=',')
y = np.loadtxt(Y_PATH, delimiter=',')
truth = np.loadtxt(TRUTH_PATH, delimiter=',')
nonlinear_model = torch.load(NONLINEAR_PATH)
nonlinear_model = torch.load(NONLINEAR_PATH, weights_only=False)
yhat = nonlinear_model.predict(X)

# Check if all of the results have already been generated and compiled
Expand Down
2 changes: 1 addition & 1 deletion pyhrt/continuous.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def fit_mdn(X, y, ncomponents=5,
if verbose:
print('Validation loss: {} Best: {}'.format(val_losses[epoch], best_loss))

model = torch.load(tmp_file)
model = torch.load(tmp_file, weights_only=False)
os.remove(tmp_file)
return model

Expand Down