From 14126331765757d19ee231ad7af72d3e5f0e3132 Mon Sep 17 00:00:00 2001 From: ryanhammonds Date: Fri, 30 Jun 2023 16:26:49 -0700 Subject: [PATCH] test updates --- riskslim/classifier.py | 11 +- riskslim/coefficient_set.py | 20 +-- riskslim/data.py | 6 +- riskslim/reporter.py | 30 ++-- riskslim/tests/conftest.py | 5 +- .../loss_functions/test_log_loss_weighted.py | 8 +- riskslim/tests/test_bound_tightening.py | 8 +- riskslim/tests/test_callbacks.py | 147 ++++++++++-------- riskslim/tests/test_coefficient_set.py | 29 ++-- riskslim/tests/test_data.py | 20 +-- riskslim/tests/test_fit.py | 67 +++----- riskslim/tests/test_heuristics.py | 4 +- riskslim/tests/test_risk_slim.py | 36 +++-- riskslim/tests/test_solution_pool.py | 3 +- riskslim/tests/test_utils.py | 72 +-------- riskslim/tests/test_warmstart.py | 16 +- riskslim/tests/utils.py | 7 +- 17 files changed, 191 insertions(+), 298 deletions(-) diff --git a/riskslim/classifier.py b/riskslim/classifier.py index 60678ac..420787c 100644 --- a/riskslim/classifier.py +++ b/riskslim/classifier.py @@ -55,9 +55,8 @@ class RiskSLIMClassifier(BaseEstimator, ClassifierMixin): calibrated_estimators_ : list of sklearn.calibration.CalibratedClassifierCV Calibrators trained per fold. Must use the fitcv method. """ - def __init__(self, max_coef = 5, max_size = None, coef_set = None, - variable_names = None, outcome_name = None, c0_value = 1e-6, - verbose = True, **kwargs): + def __init__(self, max_coef = 5, max_size = None, variable_names = None, + outcome_name = None, verbose = True, **kwargs): """ Parameters ---------- @@ -109,7 +108,7 @@ def __init__(self, max_coef = 5, max_size = None, coef_set = None, # internals self._data = None self._variable_names = variable_names - self._outcome_name = outcome_name + self._outcome_name = outcome_name if outcome_name is not None else "outcome" self._coef_set = None # todo: check that this @@ -275,10 +274,8 @@ def recalibrate(self, X, y, sample_weights=None, method= "sigmoid"): method : {"sigmoid", "isotonic"} Linear classifier used to recalibrate scores. """ - # todo: call check_data <- rh: check_data has to be called in .fit() prior to this call + # todo: call check_data # todo: add support for kwargs (method = 'sigmoid') should be - # <- rh: i don't think kwargs should be user facing; it leads to issues like unclear signatures - # or passing bad args in (e.g. methdo="sigmoid" typo wouldn't raise an error) if not self.fitted: raise ValueError("fit RiskSLIM before calling recalibrate") diff --git a/riskslim/coefficient_set.py b/riskslim/coefficient_set.py index 171ddac..308a58b 100644 --- a/riskslim/coefficient_set.py +++ b/riskslim/coefficient_set.py @@ -45,7 +45,7 @@ def __init__(self, variable_names, lb=-5., ub=5., c0=float('nan'), vtype='I', pr self._initialized = True ### methods ### - def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None): + def update_intercept_bounds(self, X, y, max_offset, max_size = None): """ uses data to set the lower and upper bound on the offset to a conservative value the value is guaranteed to avoid a loss in performance @@ -63,7 +63,7 @@ def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None): X y max_offset - max_L0_value + max_size Returns ------- @@ -85,14 +85,14 @@ def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None): penalized_idx = [self._coef_elements[n].penalized for n in variable_names] trivial_max_size = len(penalized_idx) - if max_L0_value is None: - max_L0_value = trivial_max_size + if max_size is None: + max_size = trivial_max_size - if max_L0_value > 0: - max_L0_value = min(trivial_max_size, max_L0_value) + if max_size > 0: + max_size = min(trivial_max_size, max_size) # update intercept bounds - Z = X * y[:, None] + Z = X * y Z_min = np.min(Z, axis = 0) Z_max = np.max(Z, axis = 0) @@ -104,15 +104,15 @@ def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None): Z_max = Z_max[variable_idx], rho_lb = self.lb[variable_idx], rho_ub = self.ub[variable_idx], - L0_reg_ind = L0_reg_ind, - max_size = max_L0_value) + L0_reg_ind = L0_reg_ind[variable_idx], + max_size = max_size) # set intercept conservative_offset = max(abs(s_min), abs(s_max)) + 1 if max_offset is None: max_offset = conservative_offset else: - max_offset = np.min(max_offset, conservative_offset) + max_offset = min(max_offset, conservative_offset) e.ub = max_offset e.lb = -max_offset diff --git a/riskslim/data.py b/riskslim/data.py index 639903e..0acacc6 100644 --- a/riskslim/data.py +++ b/riskslim/data.py @@ -47,7 +47,7 @@ def __init__(self, X, y, variable_names = None, outcome_name=None, sample_weight self._sample_weights = sample_weights assert self.__check_rep__() - self._Z = (self._X * self._y[:, None]).astype(np.float64) + self._Z = (self._X * self._y).astype(np.float64) # Infer variable types self._variable_types = np.zeros(self.X.shape[1], dtype="str") @@ -170,13 +170,13 @@ def __repr__(self): def check_data(self): if np.all(self._variable_types[1:] == "B"): - warn("X is recommended to be all binary.") + warnings.warn("X is recommended to be all binary.") # Constant warning idx = np.flatnonzero(self.X == self.X[0], axis=0) constant_variables = [self.variable_names[j] for j in idx if j > 0] if len(constant_variables): - warn("Constant variable other than intercept found in X.") + warnings.warn("Constant variable other than intercept found in X.") diff --git a/riskslim/reporter.py b/riskslim/reporter.py index 0a8c3bf..2c31b98 100644 --- a/riskslim/reporter.py +++ b/riskslim/reporter.py @@ -39,26 +39,14 @@ def __init__(self, dataset, estimator): self.variable_names = dataset.variable_names self.outcome_name = dataset.outcome_name - if hasattr(estimator, "rho"): - self.rho = estimator.rho - else: - # For scikit-learn estimators - self.rho = np.insert(np.squeeze(self.estimator.coef_), 0, self.estimator.intercept_) - self.variable_names.insert(0, "(Intercept)") + self.coefs = np.insert(self.estimator.coef_.copy(), 0, 1) + self._variable_types = estimator._variable_types - if hasattr(estimator, "_variable_types"): - self._variable_types = estimator._variable_types - else: - # For scikit-learn estimators - self._variable_types = np.zeros(self.X.shape[1], dtype="str") - self._variable_types[:] = "C" - self._variable_types[np.all(self.X == np.require(self.X, dtype=np.int_), axis=0)] = "I" - self._variable_types[np.all(self.X == np.require(self.X, dtype=np.bool_), axis=0)] = "B" # Table if np.not_equal(estimator.coef_, 0.0).any(): self.table_str = print_model( - self.rho, + self.coefs, self.variable_names, self.outcome_name, show_omitted_variables=False, @@ -69,9 +57,9 @@ def __init__(self, dataset, estimator): # Probability estimates if not hasattr(self.estimator, "calibrated_estimator") or self.estimator.calibrated_estimator is None: - self.proba = estimator.predict_proba(self.X) + self.proba = estimator.predict_proba(self.X[:, 1:]) else: - self.proba = self.estimator.calibrated_estimator.predict_proba(self.X)[:, 1] + self.proba = self.estimator.calibrated_estimator.predict_proba(self.X[:, 1:])[:, 1] @staticmethod def from_model(estimator): @@ -113,12 +101,12 @@ def _prepare_table(self): """Prepare arrays for plotly table.""" # Non-zero coefficients - inds = np.flatnonzero(self.rho[1:]) + inds = np.flatnonzero(self.coefs[1:]) if len(inds) == 0: raise ValueError('all zero coefficients') self.table["names"] = np.array(self.variable_names)[inds+1].tolist() - self.table["scores"] = self.rho[inds+1] + self.table["scores"] = self.coefs[inds+1] self.table["names"] = [str(i+1) + '. ' + n for i, n in enumerate(self.table["names"])] @@ -187,9 +175,9 @@ def create_report(self, file_name=None, show=False, replace_table=False, fig.update_layout(font_family="Source Code Pro") fig_str = fig.to_html(include_plotlyjs=False, full_html=False) - inds = np.flatnonzero(self.rho) + inds = np.flatnonzero(self.coefs) _vars = np.array(self.variable_names)[inds] - _rho = self.rho[inds] + _rho = self.coefs[inds] min_values = self.X.min(axis=0)[inds] max_values = self.X.max(axis=0)[inds] diff --git a/riskslim/tests/conftest.py b/riskslim/tests/conftest.py index e53aa0e..10f9e60 100644 --- a/riskslim/tests/conftest.py +++ b/riskslim/tests/conftest.py @@ -54,7 +54,7 @@ def generate_integer_model(n_cols=20, rho_ub=100, rho_lb=-100, sparse_pct=0.5): weights = np.ones(len(X)) # Create lookup table - min_score, max_score = get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, L0_max=n_cols) + min_score, max_score = get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, max_size=n_cols) loss_value_tbl, prob_value_tbl, loss_tbl_offset = \ lookup.get_loss_value_and_prob_tables(min_score, max_score) @@ -105,7 +105,6 @@ def generated_normal_data(): Z = X * y - names = ['var_' + str(i).zfill(2) for i in range(n_columns-1)] - names.insert(0, '(Intercept)') + names = ['var_' + str(i).zfill(2) for i in range(n_columns)] yield {'X':X, 'y':y, 'Z':Z, 'rho':rho, 'rho_true':rho_true, 'variable_names':names} diff --git a/riskslim/tests/loss_functions/test_log_loss_weighted.py b/riskslim/tests/loss_functions/test_log_loss_weighted.py index 411809e..0c674af 100644 --- a/riskslim/tests/loss_functions/test_log_loss_weighted.py +++ b/riskslim/tests/loss_functions/test_log_loss_weighted.py @@ -12,8 +12,8 @@ def test_log_loss_value(generated_normal_data): rho = generated_normal_data['rho'] # Weights of one are the same as unweighted - assert log_loss_weighted.log_loss_value(Z, np.ones(len(Z)), len(Z), rho) == \ - log_loss.log_loss_value(Z, rho) + assert log_loss_weighted.log_loss_value(Z, np.ones(len(Z)), len(Z), rho).round(6) == \ + log_loss.log_loss_value(Z, rho).round(6) # Weights of zero should give zero loss assert log_loss_weighted.log_loss_value(Z, np.zeros(len(Z)), len(Z), rho) == 0. @@ -40,7 +40,7 @@ def test_log_loss_value_and_slope(generated_normal_data): loss, slope = log_loss.log_loss_value_and_slope(Z, rho) - assert wloss == loss + assert wloss.round(6) == loss.round(6) assert np.all(slope == wslope) # Weights are all zero, assert zero @@ -75,7 +75,7 @@ def test_log_loss_value_from_scores(generated_normal_data): loss = log_loss.log_loss_value_from_scores(Z.dot(rho)) - assert wloss == loss + assert wloss.round(6) == loss.round(6) # Loss of random row in Z, selected using weights ind = np.random.choice(np.arange(len(Z))) diff --git a/riskslim/tests/test_bound_tightening.py b/riskslim/tests/test_bound_tightening.py index bb46e45..ca6954d 100644 --- a/riskslim/tests/test_bound_tightening.py +++ b/riskslim/tests/test_bound_tightening.py @@ -17,8 +17,8 @@ def test_chained_updates(update, loss, new_objvals, C_0_nnz): objval_max=10, loss_min=loss[0], loss_max=loss[1], - L0_min=1, - L0_max=10 + min_size=1, + max_size=10 ) new_objval_at_feasible, new_objval_at_relaxation = new_objvals @@ -33,5 +33,5 @@ def test_chained_updates(update, loss, new_objvals, C_0_nnz): assert new_bounds.loss_min >= bounds.loss_min assert new_bounds.loss_max <= bounds.loss_max - assert new_bounds.L0_min >= bounds.L0_min - assert new_bounds.L0_max <= bounds.L0_max + assert new_bounds.min_size >= bounds.min_size + assert new_bounds.max_size <= bounds.max_size diff --git a/riskslim/tests/test_callbacks.py b/riskslim/tests/test_callbacks.py index 9ed4284..daaec9b 100644 --- a/riskslim/tests/test_callbacks.py +++ b/riskslim/tests/test_callbacks.py @@ -2,123 +2,132 @@ import pytest import numpy as np +from riskslim.optimizer import RiskSLIMOptimizer +from riskslim.mip import create_risk_slim from riskslim.solution_pool import FastSolutionPool from riskslim.coefficient_set import CoefficientSet from riskslim.heuristics import discrete_descent, sequential_rounding -from riskslim import RiskSLIM -from riskslim.fit.callbacks import LossCallback, PolishAndRoundCallback - +from riskslim.callbacks import LossCallback, PolishAndRoundCallback +from riskslim.defaults import DEFAULT_LCPA_SETTINGS +from riskslim.data import ClassificationDataset @pytest.mark.parametrize('cut_queue', [None, FastSolutionPool(12)]) @pytest.mark.parametrize('polish_queue', [None, FastSolutionPool(12)]) def test_losscallback(generated_normal_data, cut_queue, polish_queue): + # Dataset X = generated_normal_data['X'][0] y = generated_normal_data['y'] - variable_names = generated_normal_data['variable_names'] + data = ClassificationDataset(X, y, variable_names=variable_names, outcome_name='outcome') - coef_set = CoefficientSet(variable_names) - rs = RiskSLIM(coef_set=coef_set, L0_min=0, L0_max=10) + # Create mip + coef_set = CoefficientSet(data.variable_names) - # Set data attributes - rs.X = X - rs.y = y - rs.variable_names = variable_names - rs.outcome_name = None - rs.sample_weights = None - rs.init_fit() - rs.init_mip() - rs.warmstart() + mip_settings =mip_settings = { + "C_0": 1e-6, + "coef_set": coef_set, + "tight_formulation": DEFAULT_LCPA_SETTINGS["tight_formulation"], + "drop_variables":DEFAULT_LCPA_SETTINGS["drop_variables"], + "include_auxillary_variable_for_L0_norm": DEFAULT_LCPA_SETTINGS["include_auxillary_variable_for_L0_norm"], + "include_auxillary_variable_for_objval": DEFAULT_LCPA_SETTINGS["include_auxillary_variable_for_objval"], + } - # Initialize solution queues - rs.cut_queue = cut_queue - rs.polish_queue = polish_queue + mip, indices = create_risk_slim(coef_set=coef_set, settings=mip_settings) + + # Create required attributes + opt = RiskSLIMOptimizer(data, coef_set, 5) + indices.update({"C_0_nnz": opt.C_0_nnz, "L0_reg_ind": opt.L0_reg_ind}) - loss_cb = rs.mip.register_callback(LossCallback) + loss_cb = mip.register_callback(LossCallback) loss_cb.initialize( - indices=rs.mip_indices, - stats=rs.stats, - settings=rs.settings, - compute_loss_cut=rs.compute_loss_cut, - get_alpha=rs.get_alpha, - get_L0_penalty_from_alpha=rs.get_L0_penalty_from_alpha, - initial_cuts=rs.initial_cuts, - cut_queue=rs.cut_queue, - polish_queue=rs.polish_queue, - verbose=rs.verbose, - ) + indices=indices, + stats=opt.stats, + settings=DEFAULT_LCPA_SETTINGS, + compute_loss_cut=opt.compute_loss_cut, + get_alpha=opt.get_alpha, + get_L0_penalty_from_alpha=opt.get_L0_penalty_from_alpha, + initial_cuts=None, + cut_queue=opt.cut_queue, + polish_queue=opt.polish_queue, + verbose=opt.verbose, + ) assert loss_cb.cut_queue is not None assert loss_cb.polish_queue is not None - assert loss_cb.compute_loss_cut == rs.compute_loss_cut - assert loss_cb.get_alpha == rs.get_alpha - assert loss_cb.get_L0_penalty_from_alpha == rs.get_L0_penalty_from_alpha + assert loss_cb.compute_loss_cut == opt.compute_loss_cut + assert loss_cb.get_alpha == opt.get_alpha + assert loss_cb.get_L0_penalty_from_alpha == opt.get_L0_penalty_from_alpha def test_polish_and_round_callback(generated_normal_data): + # Dataset X = generated_normal_data['X'][0] y = generated_normal_data['y'] - variable_names = generated_normal_data['variable_names'] + data = ClassificationDataset(X, y, variable_names=variable_names, outcome_name='outcome') + + # Create mip + coef_set = CoefficientSet(data.variable_names) + + mip_settings =mip_settings = { + "C_0": 1e-6, + "coef_set": coef_set, + "tight_formulation": DEFAULT_LCPA_SETTINGS["tight_formulation"], + "drop_variables":DEFAULT_LCPA_SETTINGS["drop_variables"], + "include_auxillary_variable_for_L0_norm": DEFAULT_LCPA_SETTINGS["include_auxillary_variable_for_L0_norm"], + "include_auxillary_variable_for_objval": DEFAULT_LCPA_SETTINGS["include_auxillary_variable_for_objval"], + } - coef_set = CoefficientSet(variable_names) - rs = RiskSLIM(coef_set=coef_set, L0_min=0, L0_max=10) + mip, indices = create_risk_slim(coef_set=coef_set, settings=mip_settings) - # Set data attributes - rs.X = X - rs.y = y - rs.variable_names = variable_names - rs.outcome_name = None - rs.sample_weights = None - rs.init_fit() - rs.init_mip() - rs.warmstart() + # Create required attributes + opt = RiskSLIMOptimizer(data, coef_set, 5) # Initialize solution queues - rs.cut_queue = FastSolutionPool(12) - rs.polish_queue = FastSolutionPool(12) + opt.cut_queue = FastSolutionPool(12) + opt.polish_queue = FastSolutionPool(12) polisher = lambda rho: discrete_descent( rho, - rs.Z, - rs.C_0, - rs.rho_max, - rs.rho_min, - rs.get_L0_penalty, - rs.compute_loss_from_scores, + opt.Z, + opt.C_0, + opt.rho_max, + opt.rho_min, + opt.get_L0_penalty, + opt.compute_loss_from_scores, True, ) rounder = lambda rho, cutoff: sequential_rounding( rho, - rs.Z, - rs.C_0, - rs.compute_loss_from_scores_real, - rs.get_L0_penalty, + opt.Z, + opt.C_0, + opt.compute_loss_from_scores_real, + opt.get_L0_penalty, cutoff ) - polish_cb = rs.mip.register_callback(PolishAndRoundCallback) + polish_cb = mip.register_callback(PolishAndRoundCallback) polish_cb.initialize( - indices=rs.mip_indices, - control=rs.stats, - settings=rs.settings, - cut_queue=rs.cut_queue, - polish_queue=rs.polish_queue, - get_objval=rs.get_objval, - get_L0_norm=rs.get_L0_norm, - is_feasible=rs.is_feasible, + indices=opt.mip_indices, + control=opt.stats, + settings=opt.settings, + cut_queue=opt.cut_queue, + polish_queue=opt.polish_queue, + get_objval=opt.get_objval, + get_L0_norm=opt.get_L0_norm, + is_feasible=opt.is_feasible, polishing_handle=polisher, rounding_handle=rounder, ) - assert polish_cb.get_objval == rs.get_objval - assert polish_cb.get_L0_norm == rs.get_L0_norm - assert polish_cb.is_feasible == rs.is_feasible + assert polish_cb.get_objval == opt.get_objval + assert polish_cb.get_L0_norm == opt.get_L0_norm + assert polish_cb.is_feasible == opt.is_feasible assert polish_cb.polishing_handle == polisher assert polish_cb.rounding_handle == rounder diff --git a/riskslim/tests/test_coefficient_set.py b/riskslim/tests/test_coefficient_set.py index 7495276..9cac49d 100644 --- a/riskslim/tests/test_coefficient_set.py +++ b/riskslim/tests/test_coefficient_set.py @@ -4,6 +4,7 @@ import numpy as np from riskslim.coefficient_set import CoefficientSet, _CoefficientElement from riskslim.bounds import get_score_bounds +from riskslim.data import ClassificationDataset @pytest.mark.parametrize('lb', [-5, [-5]*11]) @@ -32,26 +33,22 @@ def test_coefficientset_init(lb, ub): assert len(cs._coef_elements) == len(variable_names) -@pytest.mark.parametrize('has_intercept', [ - True, pytest.param(False, marks=pytest.mark.xfail(raises=ValueError)) -]) -@pytest.mark.parametrize('max_L0_value', [10, None]) +# @pytest.mark.parametrize('has_intercept', [ +# True, pytest.param(False, marks=pytest.mark.xfail(raises=ValueError)) +# ]) +@pytest.mark.parametrize('max_size', [10, None]) def test_coefficientset_update_intercept_bounds( - generated_normal_data, has_intercept, max_L0_value + generated_normal_data, max_size ): - - variable_names = generated_normal_data['variable_names'] - - if not has_intercept: - variable_names = variable_names.copy() - del variable_names[0] - + # Dataset X = generated_normal_data['X'][0] y = generated_normal_data['y'] + variable_names = generated_normal_data['variable_names'] + data = ClassificationDataset(X, y, variable_names=variable_names, outcome_name='outcome') - cs = CoefficientSet(variable_names) + cs = CoefficientSet(data.variable_names) - cs.update_intercept_bounds(X, y, 1, max_L0_value=max_L0_value) + cs.update_intercept_bounds(data.X, data.y, 1, max_size=max_size) def test_coefficientset_tabulate(): @@ -186,13 +183,13 @@ def test_get_score_bounds(use_L0): rho_ub = np.repeat(-5, 10) L0_reg_ind = None - L0_max = None + max_size = None if use_L0: L0_reg_ind = np.ones(10).astype(int) L0_max = 1 s_min, s_max = get_score_bounds( - Z_min, Z_max, rho_lb, rho_ub, L0_reg_ind=L0_reg_ind, L0_max=L0_max + Z_min, Z_max, rho_lb, rho_ub, L0_reg_ind=L0_reg_ind, max_size=max_size ) assert s_min <= s_max diff --git a/riskslim/tests/test_data.py b/riskslim/tests/test_data.py index b61a89b..421d0a8 100644 --- a/riskslim/tests/test_data.py +++ b/riskslim/tests/test_data.py @@ -1,6 +1,5 @@ """Test data objects.""" -import pytest import numpy as np import pandas as pd from riskslim.data import ClassificationDataset @@ -14,19 +13,16 @@ def test_ClassificationDataset(): n_variables = 100 X = np.random.rand(n_obs, n_variables) - y = np.random.choice([1, -1], n_obs) + y = np.random.choice([1, -1], n_obs).reshape(-1, 1) - variable_names = ['var_' + str(i) for i in range(n_variables-1)] + variable_names = ['var_' + str(i) for i in range(n_variables)] - variable_names.insert(0, '(Intercept)') - X[:, 0] = 1 - - ds = ClassificationDataset(X, y, variable_names) - assert np.all(ds.X == X) + ds = ClassificationDataset(X, y, variable_names, outcome_name='outcome') + assert np.all(ds.X[:, 1:] == X) assert np.all(ds.y == y) - assert ds.variable_names == variable_names + assert ds.variable_names[1:] == variable_names assert ds.sample_weights is None - assert ds.outcome_name is None + assert ds.outcome_name == 'outcome' assert isinstance(ds.df, pd.DataFrame) assert isinstance(ds.__str__(), str) assert isinstance(ds.__repr__(), str) @@ -37,11 +33,11 @@ def test_ClassificationDataset(): def test_bounds(): bounds = Bounds(objval_min=0., objval_max=1., loss_min=0., loss_max=1., - L0_min=1, L0_max=10) + min_size=1, max_size=10) assert bounds.objval_min == bounds.loss_min == 0. assert bounds.objval_max == bounds.loss_max == 1. - assert bounds.L0_min == 1 and bounds.L0_max == 10 + assert bounds.min_size == 1 and bounds.max_size == 10 bounds = bounds.asdict() assert isinstance(bounds, dict) diff --git a/riskslim/tests/test_fit.py b/riskslim/tests/test_fit.py index 6ec030c..e7c6403 100644 --- a/riskslim/tests/test_fit.py +++ b/riskslim/tests/test_fit.py @@ -8,23 +8,23 @@ from riskslim.coefficient_set import CoefficientSet from riskslim.utils import Stats from riskslim.bounds import Bounds -from riskslim.fit import RiskSLIM +from riskslim.classifier import RiskSLIMClassifier @pytest.mark.parametrize('init_coef', [True, False]) -def test_RiskSLIM_init(init_coef): - """Test RiskSLIM initialization.""" +def test_RiskSLIMClassifier_init(init_coef): + """Test RiskSLIMClassifier initialization.""" variable_names = ['variable_' + str(i) for i in range(10)] coef_set = CoefficientSet(variable_names) if init_coef else None - L0_min=0 - L0_max=10 + min_size=0 + max_size=10 - rs = RiskSLIM(coef_set=coef_set, L0_min=L0_min, L0_max=L0_max) + rs = RiskSLIMClassifier(coef_set=coef_set, min_size=min_size, max_size=max_size) - assert rs.L0_min == L0_min - assert rs.L0_max == L0_max + assert rs.min_size == min_size + assert rs.max_size == max_size assert rs.X is None assert rs.y is None @@ -46,14 +46,14 @@ def test_RiskSLIM_init(init_coef): @pytest.mark.parametrize('use_coef_set', [True, False]) -def test_RiskSLIM_init_fit(generated_normal_data, use_coef_set): - """Test RiskSLIM fit initalization.""" +def test_RiskSLIMClassifier_init_fit(generated_normal_data, use_coef_set): + """Test RiskSLIMClassifier fit initalization.""" X = generated_normal_data['X'][0] y = generated_normal_data['y'] variable_names = generated_normal_data['variable_names'] coef_set = CoefficientSet(variable_names) if use_coef_set else None - rs = RiskSLIM(coef_set=coef_set, L0_min=0, L0_max=10) + rs = RiskSLIMClassifier(coef_set=coef_set, min_size=0, max_size=10) # Load data into attribute # this is normally done in .fit @@ -80,9 +80,8 @@ def test_RiskSLIM_init_fit(generated_normal_data, use_coef_set): assert (rs.Z.shape == rs.X.shape) - @pytest.mark.parametrize('loss_computation', ['fast', 'normal', 'weighted', 'lookup']) -def test_RiskSLIM_init_loss(generated_normal_data, loss_computation): +def test_RiskSLIMClassifier_init_loss(generated_normal_data, loss_computation): """Test setting up loss functions.""" X = generated_normal_data['X'][0] @@ -98,7 +97,7 @@ def test_RiskSLIM_init_loss(generated_normal_data, loss_computation): coef_set = None settings = {'loss_computation': loss_computation} - rs = RiskSLIM(coef_set=coef_set, L0_min=0, L0_max=10, settings=settings) + rs = RiskSLIMClassifier(coef_set=coef_set, min_size=0, max_size=10, settings=settings) # Load data into attribute # this is normally done in .fit @@ -135,8 +134,8 @@ def test_RiskSLIM_init_loss(generated_normal_data, loss_computation): @pytest.mark.parametrize('use_rounding', [True, False]) @pytest.mark.parametrize('polishing_after', [True, False]) -def test_RiskSLIM_warmstart(generated_normal_data, use_rounding, polishing_after): - """Test RiskSLIM fitting.""" +def test_RiskSLIMClassifier_warmstart(generated_normal_data, use_rounding, polishing_after): + """Test RiskSLIMClassifier fitting.""" X = generated_normal_data['X'] y = generated_normal_data['y'] variable_names = generated_normal_data['variable_names'] @@ -150,7 +149,7 @@ def test_RiskSLIM_warmstart(generated_normal_data, use_rounding, polishing_after coef_set = CoefficientSet(variable_names=variable_names, lb=lb, ub=ub) - rs = RiskSLIM(coef_set=coef_set, L0_min=0, L0_max=5) + rs = RiskSLIMClassifier(coef_set=coef_set, min_size=0, max_size=5) # Load data into attribute # this is normally done in .fit @@ -182,8 +181,8 @@ def test_RiskSLIM_warmstart(generated_normal_data, use_rounding, polishing_after @pytest.mark.parametrize('polish_flag', [True, False]) -def test_RiskSLIM_fit(generated_normal_data, polish_flag): - """Test fitting RiskSLIM.""" +def test_RiskSLIMClassifier_fit(generated_normal_data, polish_flag): + """Test fitting RiskSLIMClassifier.""" X = generated_normal_data['X'].copy() y = generated_normal_data['y'].copy() variable_names = generated_normal_data['variable_names'].copy() @@ -195,8 +194,6 @@ def test_RiskSLIM_fit(generated_normal_data, polish_flag): # Settings settings = { - # Problem Parameters - 'c0_value': c0_value, # LCPA Settings 'max_runtime': 2, 'max_tolerance': np.finfo('float').eps, @@ -216,39 +213,19 @@ def test_RiskSLIM_fit(generated_normal_data, polish_flag): 'cplex_mipemphasis': 0, } - solutions = np.zeros((n_iters, n_columns), dtype=np.int8) + solutions = np.zeros((n_iters, n_columns+1), dtype=np.int8) for ind in range(n_iters): - # Constraints - ub = np.array([5.] * len(variable_names)) - lb = np.array([-5.] * len(variable_names)) - - # Fix intercept at zero - lb[0] = 0. - ub[0] = 0. - # Initalize - rs = RiskSLIM(L0_min=0, L0_max=10, rho_min=lb, rho_max=ub, c0_value=c0_value, - settings=settings) - - if ind == 0: - # Ensure printable - rs.print_model() - with pytest.raises(ValueError): - rs.print_solution() + rs = RiskSLIMClassifier(max_size=10, max_coef=5, settings=settings, variable_names=variable_names) # Fit - rs.fit(X[ind], y, variable_names=variable_names) + rs.fit(X[ind], y) assert rs.fitted - if ind == 0: - # Ensure printable - rs.print_model() - rs.print_solution() - # Get solutions - solutions[ind] = rs.solution_info['solution'] + solutions[ind] = np.insert(rs.coef_, 0, rs.intercept_) # Test accuracy between computed and persistent solution if polish_flag: diff --git a/riskslim/tests/test_heuristics.py b/riskslim/tests/test_heuristics.py index 7604a24..27badc9 100644 --- a/riskslim/tests/test_heuristics.py +++ b/riskslim/tests/test_heuristics.py @@ -55,6 +55,6 @@ def test_discrete_descent(generated_normal_data): assert base_loss < base_objval assert base_loss < log_loss_value_from_scores(Z.dot(rho)) - assert log_loss_value_from_scores(Z.dot(rho_discrete)).astype(np.float32) == \ - base_loss.astype(np.float32) + assert log_loss_value_from_scores(Z.dot(rho_discrete)).astype(np.float32).round(6) == \ + base_loss.astype(np.float32).round(6) assert len(rho) == len(rho_discrete) diff --git a/riskslim/tests/test_risk_slim.py b/riskslim/tests/test_risk_slim.py index e7a6d83..21365ec 100644 --- a/riskslim/tests/test_risk_slim.py +++ b/riskslim/tests/test_risk_slim.py @@ -4,6 +4,7 @@ import pytest import numpy as np +import pandas as pd import riskslim # Dataset Strategy @@ -17,7 +18,7 @@ # # loss_computation normal, fast, lookup # max_coefficient 0, 1, >1 -# max_L0_value 0, 1, >1 +# max_size 0, 1, >1 # max_offset 0, 1, Inf # c0_value eps, 1e-8, 0.01, C0_max # sample_weights no, yes @@ -93,48 +94,49 @@ @pytest.mark.parametrize('max_coefficient', [5]) -@pytest.mark.parametrize('max_L0_value', [0, 1, 5]) +@pytest.mark.parametrize('max_size', [0, 1, 5]) @pytest.mark.parametrize('max_offset', [0, 50]) -def test_risk_slim(max_coefficient, max_L0_value, max_offset): +def test_risk_slim(max_coefficient, max_size, max_offset): # Load data - data = riskslim.load_data_from_csv( - dataset_csv_file=data_csv_file, sample_weights_csv_file=sample_weights_csv_file - ) + df = pd.read_csv(data_csv_file) + + X = df.iloc[:, 1:].values + y = df.iloc[:, 0].values - N, P = data['X'].shape + N, P = X.shape # Offset value coef_set = riskslim.CoefficientSet( - variable_names=data['variable_names'], + variable_names=list(df.columns)[1:], lb=-max_coefficient, ub=max_coefficient ) coef_set.update_intercept_bounds( - X = data['X'], y = data['y'], max_offset=max_offset, max_L0_value = max_L0_value + X = X, y = y, max_offset=max_offset, max_size = max_size ) # Create constraint dictionary - trivial_L0_max = P - np.sum(coef_set.C_0j == 0) - max_L0_value = min(max_L0_value, trivial_L0_max) + trivial_max_size = P - np.sum(coef_set.C_0j == 0) + max_size = min(max_size, trivial_max_size) # Train model using lattice_cpa - rs = riskslim.RiskSLIM( - coef_set=coef_set, L0_min=0, L0_max=max_L0_value, settings=default_settings + rs = riskslim.RiskSLIMClassifier( + coef_set=coef_set, min_size=0, max_size=max_size, settings=default_settings ) - rs.fit(data['X'], y = data['y']) + rs.fit(X, y) # Model info contains key results pprint.pprint(rs.solution_info) - assert rs.L0_min == rs.bounds.L0_min == 0 - assert rs.L0_max == rs.bounds.L0_max == max_L0_value + assert rs.min_size == rs.bounds.min_size == 0 + assert rs.max_size == rs.bounds.max_size == max_size assert rs.coef_set == coef_set # Each column of X has a rho and alpha (except for intercept, # which doesn't have an alpha). There are 3 additional parameters: # loss, objval, L0_norm - assert (len(data['X'][0]) * 2) - 1 + 3 == rs.mip_indices['n_variables'] + assert (len(X[0]) * 2) - 1 + 3 == rs.mip_indices['n_variables'] assert True diff --git a/riskslim/tests/test_solution_pool.py b/riskslim/tests/test_solution_pool.py index 97bae22..b30c54d 100644 --- a/riskslim/tests/test_solution_pool.py +++ b/riskslim/tests/test_solution_pool.py @@ -71,6 +71,7 @@ def test_solution_pool_solution_table(): assert table == pool.__repr__() == pool.__str__() + def test_solution_pool_copy(): solutions = np.zeros(10) @@ -234,7 +235,6 @@ def test_solution_pool_remove_suboptimal(): assert len(pool) == 0 - def test_fast_solution_pool(): solution_pool = FastSolutionPool(1) @@ -244,6 +244,7 @@ def test_fast_solution_pool(): assert solution_pool.table() == solution_pool.__repr__() == solution_pool.__str__() + def test_fast_solution_pool_add(): solution_pool = FastSolutionPool(2) solution_pool.add(np.array([0, 1]), np.array([[0, 0], [1, 1]])) diff --git a/riskslim/tests/test_utils.py b/riskslim/tests/test_utils.py index 3c8f7c1..404df8e 100644 --- a/riskslim/tests/test_utils.py +++ b/riskslim/tests/test_utils.py @@ -7,79 +7,9 @@ import pytest import numpy as np from riskslim.utils import ( - load_data_from_csv, print_model, setup_logging, print_log, + print_model, setup_logging, print_log, validate_settings, is_integer, cast_to_integer ) -from riskslim.data import check_data - - -@pytest.mark.parametrize('sample_weights_csv_file', - [ - None, - pytest.param('/bad/path', marks=pytest.mark.xfail(raises=IOError)), - ] -) -def test_load_data_from_csv(sample_weights_csv_file): - - # Data - data_name = "breastcancer" # name of the data - data_dir = os.getcwd() + '/examples/data/' # directory where datasets are stored - data_csv_file = data_dir + data_name + '_data.csv' # csv file for the data - - folds = cycle(list(range(1, 6))) - folds = [next(folds) for _ in range(683)] - - with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: - for i in folds: - f.write(str(i) + '\n') - - data = load_data_from_csv( - dataset_csv_file=data_csv_file, sample_weights_csv_file=sample_weights_csv_file, - fold_csv_file=f.name, fold_num=1 - ) - - os.remove(f.name) - - assert isinstance(data['X'], np.ndarray) - assert isinstance(data['y'], np.ndarray) - assert len(data['X']) == len(data['y']) == len(data['sample_weights']) - assert len(data['X'][0]) == len(data['variable_names']) - assert isinstance(data['outcome_name'], str) - - -@pytest.mark.parametrize('y', - [ - np.random.choice([-1, 1], 100).astype(int), - np.ones(100).astype(int), - np.ones(100).astype(int) * -1 - ] -) -@pytest.mark.parametrize('variable_names', - [ - ['(Intercept)', *['var_' + str(i) for i in range(9)]], - ['var_' + str(i) for i in range(10)] - ] -) -@pytest.mark.parametrize('outcome_name', - [ - 'Outcome', - pytest.param(0, marks=pytest.mark.xfail(raises=AssertionError)) - ] -) -@pytest.mark.parametrize('sample_weights', - [ - np.ones(100), - np.random.uniform(.1, .9, 100), - pytest.param(0, marks=pytest.mark.xfail(raises=AssertionError)) - ] -) -def test_check_data(y, variable_names, outcome_name, sample_weights): - - X = np.random.rand(100, 10) - X[:, 0] = 1. - check = check_data(X, y, variable_names, outcome_name, sample_weights) - - assert check @pytest.mark.parametrize('variable_names', diff --git a/riskslim/tests/test_warmstart.py b/riskslim/tests/test_warmstart.py index 1ba2d92..f5c5b68 100644 --- a/riskslim/tests/test_warmstart.py +++ b/riskslim/tests/test_warmstart.py @@ -80,8 +80,8 @@ def test_round_solution_pool(generated_normal_data): coef_set = CoefficientSet(variable_names) constraints = { - "L0_min": 0, - "L0_max": 10, + "min_size": 0, + "max_size": 10, "coef_set": coef_set, } @@ -129,7 +129,7 @@ def test_sequential_round_solution_pool(generated_normal_data): ) sol = rounded_pool.solutions[0] - assert (sol - generated_normal_data['rho_true'][0].copy()).mean() < .1 + assert (sol - generated_normal_data['rho_true'][0].copy()).mean() < .2 assert total_runtime > 0 assert total_rounded > 0 @@ -139,19 +139,19 @@ def test_discrete_descent_solution_pool(generated_normal_data, non_integral): Z = generated_normal_data['Z'][0].copy() rho = generated_normal_data['rho_true'][0].copy() - + rho[0] += 1 # Add small amount of noise to move from ints to floats inds = np.where(rho != 0.)[0] if non_integral: - rho[inds] = rho[inds] + (np.random.rand(len(inds))) * 1 + rho[inds] = rho[inds] + (np.random.rand(len(inds))) variable_names = generated_normal_data['variable_names'].copy() coef_set = CoefficientSet(variable_names) constraints = { - "L0_min": 0, - "L0_max": 10, + "min_size": 0, + "max_size": 10, "coef_set": coef_set, } @@ -159,7 +159,7 @@ def test_discrete_descent_solution_pool(generated_normal_data, non_integral): pool = SolutionPool({'objvals': objvals, 'solutions':rho}) - C_0 = np.zeros(len(Z[0])) + .1 + C_0 = np.zeros(len(Z[0])) + .1e-16 get_L0_penalty = lambda rho: np.sum( C_0 * (rho != 0.0) diff --git a/riskslim/tests/utils.py b/riskslim/tests/utils.py index 19d2349..5e6fde0 100644 --- a/riskslim/tests/utils.py +++ b/riskslim/tests/utils.py @@ -39,7 +39,7 @@ def generate_random_normal(n_rows, n_columns, n_targets, seed): # Simulate random int normals with varying overlap stdevs = iter(np.linspace(100, 50, n_targets)) - for ind in range(1, n_columns): + for ind in range(0, n_columns): if ind in selected: # Class A @@ -50,11 +50,8 @@ def generate_random_normal(n_rows, n_columns, n_targets, seed): # Noise X[:, ind] = np.random.normal(0, 100, n_rows).astype(np.int32) - # Intercept - X[:, 0]= 1. - # Variale names - variable_names = ['var_' + str(i).zfill(2) for i in range(n_columns-1)] + variable_names = ['var_' + str(i).zfill(2) for i in range(n_columns)] variable_names.insert(0, '(Intercept)') # Data