Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions riskslim/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,8 @@ class RiskSLIMClassifier(BaseEstimator, ClassifierMixin):
calibrated_estimators_ : list of sklearn.calibration.CalibratedClassifierCV
Calibrators trained per fold. Only available when the model is fit with the fitcv method.
"""
def __init__(self, max_coef = 5, max_size = None, coef_set = None,
variable_names = None, outcome_name = None, c0_value = 1e-6,
verbose = True, **kwargs):
def __init__(self, max_coef = 5, max_size = None, variable_names = None,
outcome_name = None, verbose = True, **kwargs):
"""
Parameters
----------
Expand Down Expand Up @@ -109,7 +108,7 @@ def __init__(self, max_coef = 5, max_size = None, coef_set = None,
# internals
self._data = None
self._variable_names = variable_names
self._outcome_name = outcome_name
self._outcome_name = outcome_name if outcome_name is not None else "outcome"
self._coef_set = None

# todo: check that this
Expand Down Expand Up @@ -275,10 +274,8 @@ def recalibrate(self, X, y, sample_weights=None, method= "sigmoid"):
method : {"sigmoid", "isotonic"}
Calibration method used to recalibrate scores ("sigmoid" fits a logistic model; "isotonic" fits a non-parametric isotonic regression).
"""
# todo: call check_data <- rh: check_data has to be called in .fit() prior to this call
# todo: call check_data
# todo: add support for kwargs (method = 'sigmoid') should be
# <- rh: i don't think kwargs should be user facing; it leads to issues like unclear signatures
# or passing bad args in (e.g. methdo="sigmoid" typo wouldn't raise an error)

if not self.fitted:
raise ValueError("fit RiskSLIM before calling recalibrate")
Expand Down
20 changes: 10 additions & 10 deletions riskslim/coefficient_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, variable_names, lb=-5., ub=5., c0=float('nan'), vtype='I', pr
self._initialized = True

### methods ###
def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None):
def update_intercept_bounds(self, X, y, max_offset, max_size = None):
"""
Uses the data to set the lower and upper bounds on the offset (intercept) to a conservative value.
The value is guaranteed to avoid a loss in performance.
Expand All @@ -63,7 +63,7 @@ def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None):
X
y
max_offset
max_L0_value
max_size

Returns
-------
Expand All @@ -85,14 +85,14 @@ def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None):
penalized_idx = [self._coef_elements[n].penalized for n in variable_names]
trivial_max_size = len(penalized_idx)

if max_L0_value is None:
max_L0_value = trivial_max_size
if max_size is None:
max_size = trivial_max_size

if max_L0_value > 0:
max_L0_value = min(trivial_max_size, max_L0_value)
if max_size > 0:
max_size = min(trivial_max_size, max_size)

# update intercept bounds
Z = X * y[:, None]
Z = X * y
Z_min = np.min(Z, axis = 0)
Z_max = np.max(Z, axis = 0)

Expand All @@ -104,15 +104,15 @@ def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None):
Z_max = Z_max[variable_idx],
rho_lb = self.lb[variable_idx],
rho_ub = self.ub[variable_idx],
L0_reg_ind = L0_reg_ind,
max_size = max_L0_value)
L0_reg_ind = L0_reg_ind[variable_idx],
max_size = max_size)

# set intercept
conservative_offset = max(abs(s_min), abs(s_max)) + 1
if max_offset is None:
max_offset = conservative_offset
else:
max_offset = np.min(max_offset, conservative_offset)
max_offset = min(max_offset, conservative_offset)
e.ub = max_offset
e.lb = -max_offset

Expand Down
6 changes: 3 additions & 3 deletions riskslim/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(self, X, y, variable_names = None, outcome_name=None, sample_weight
self._sample_weights = sample_weights
assert self.__check_rep__()

self._Z = (self._X * self._y[:, None]).astype(np.float64)
self._Z = (self._X * self._y).astype(np.float64)

# Infer variable types
self._variable_types = np.zeros(self.X.shape[1], dtype="str")
Expand Down Expand Up @@ -170,13 +170,13 @@ def __repr__(self):

def check_data(self):
if np.all(self._variable_types[1:] == "B"):
warn("X is recommended to be all binary.")
warnings.warn("X is recommended to be all binary.")

# Constant warning
idx = np.flatnonzero(self.X == self.X[0], axis=0)
constant_variables = [self.variable_names[j] for j in idx if j > 0]
if len(constant_variables):
warn("Constant variable other than intercept found in X.")
warnings.warn("Constant variable other than intercept found in X.")



Expand Down
30 changes: 9 additions & 21 deletions riskslim/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,14 @@ def __init__(self, dataset, estimator):
self.variable_names = dataset.variable_names
self.outcome_name = dataset.outcome_name

if hasattr(estimator, "rho"):
self.rho = estimator.rho
else:
# For scikit-learn estimators
self.rho = np.insert(np.squeeze(self.estimator.coef_), 0, self.estimator.intercept_)
self.variable_names.insert(0, "(Intercept)")
self.coefs = np.insert(self.estimator.coef_.copy(), 0, 1)
self._variable_types = estimator._variable_types

if hasattr(estimator, "_variable_types"):
self._variable_types = estimator._variable_types
else:
# For scikit-learn estimators
self._variable_types = np.zeros(self.X.shape[1], dtype="str")
self._variable_types[:] = "C"
self._variable_types[np.all(self.X == np.require(self.X, dtype=np.int_), axis=0)] = "I"
self._variable_types[np.all(self.X == np.require(self.X, dtype=np.bool_), axis=0)] = "B"

# Table
if np.not_equal(estimator.coef_, 0.0).any():
self.table_str = print_model(
self.rho,
self.coefs,
self.variable_names,
self.outcome_name,
show_omitted_variables=False,
Expand All @@ -69,9 +57,9 @@ def __init__(self, dataset, estimator):

# Probability estimates
if not hasattr(self.estimator, "calibrated_estimator") or self.estimator.calibrated_estimator is None:
self.proba = estimator.predict_proba(self.X)
self.proba = estimator.predict_proba(self.X[:, 1:])
else:
self.proba = self.estimator.calibrated_estimator.predict_proba(self.X)[:, 1]
self.proba = self.estimator.calibrated_estimator.predict_proba(self.X[:, 1:])[:, 1]

@staticmethod
def from_model(estimator):
Expand Down Expand Up @@ -113,12 +101,12 @@ def _prepare_table(self):
"""Prepare arrays for plotly table."""

# Non-zero coefficients
inds = np.flatnonzero(self.rho[1:])
inds = np.flatnonzero(self.coefs[1:])
if len(inds) == 0:
raise ValueError('all zero coefficients')

self.table["names"] = np.array(self.variable_names)[inds+1].tolist()
self.table["scores"] = self.rho[inds+1]
self.table["scores"] = self.coefs[inds+1]

self.table["names"] = [str(i+1) + '. ' + n
for i, n in enumerate(self.table["names"])]
Expand Down Expand Up @@ -187,9 +175,9 @@ def create_report(self, file_name=None, show=False, replace_table=False,
fig.update_layout(font_family="Source Code Pro")
fig_str = fig.to_html(include_plotlyjs=False, full_html=False)

inds = np.flatnonzero(self.rho)
inds = np.flatnonzero(self.coefs)
_vars = np.array(self.variable_names)[inds]
_rho = self.rho[inds]
_rho = self.coefs[inds]

min_values = self.X.min(axis=0)[inds]
max_values = self.X.max(axis=0)[inds]
Expand Down
5 changes: 2 additions & 3 deletions riskslim/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def generate_integer_model(n_cols=20, rho_ub=100, rho_lb=-100, sparse_pct=0.5):
weights = np.ones(len(X))

# Create lookup table
min_score, max_score = get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, L0_max=n_cols)
min_score, max_score = get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, max_size=n_cols)

loss_value_tbl, prob_value_tbl, loss_tbl_offset = \
lookup.get_loss_value_and_prob_tables(min_score, max_score)
Expand Down Expand Up @@ -105,7 +105,6 @@ def generated_normal_data():

Z = X * y

names = ['var_' + str(i).zfill(2) for i in range(n_columns-1)]
names.insert(0, '(Intercept)')
names = ['var_' + str(i).zfill(2) for i in range(n_columns)]

yield {'X':X, 'y':y, 'Z':Z, 'rho':rho, 'rho_true':rho_true, 'variable_names':names}
8 changes: 4 additions & 4 deletions riskslim/tests/loss_functions/test_log_loss_weighted.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def test_log_loss_value(generated_normal_data):
rho = generated_normal_data['rho']

# Weights of one are the same as unweighted
assert log_loss_weighted.log_loss_value(Z, np.ones(len(Z)), len(Z), rho) == \
log_loss.log_loss_value(Z, rho)
assert log_loss_weighted.log_loss_value(Z, np.ones(len(Z)), len(Z), rho).round(6) == \
log_loss.log_loss_value(Z, rho).round(6)

# Weights of zero should give zero loss
assert log_loss_weighted.log_loss_value(Z, np.zeros(len(Z)), len(Z), rho) == 0.
Expand All @@ -40,7 +40,7 @@ def test_log_loss_value_and_slope(generated_normal_data):

loss, slope = log_loss.log_loss_value_and_slope(Z, rho)

assert wloss == loss
assert wloss.round(6) == loss.round(6)
assert np.all(slope == wslope)

# Weights are all zero, assert zero
Expand Down Expand Up @@ -75,7 +75,7 @@ def test_log_loss_value_from_scores(generated_normal_data):

loss = log_loss.log_loss_value_from_scores(Z.dot(rho))

assert wloss == loss
assert wloss.round(6) == loss.round(6)

# Loss of random row in Z, selected using weights
ind = np.random.choice(np.arange(len(Z)))
Expand Down
8 changes: 4 additions & 4 deletions riskslim/tests/test_bound_tightening.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def test_chained_updates(update, loss, new_objvals, C_0_nnz):
objval_max=10,
loss_min=loss[0],
loss_max=loss[1],
L0_min=1,
L0_max=10
min_size=1,
max_size=10
)

new_objval_at_feasible, new_objval_at_relaxation = new_objvals
Expand All @@ -33,5 +33,5 @@ def test_chained_updates(update, loss, new_objvals, C_0_nnz):
assert new_bounds.loss_min >= bounds.loss_min
assert new_bounds.loss_max <= bounds.loss_max

assert new_bounds.L0_min >= bounds.L0_min
assert new_bounds.L0_max <= bounds.L0_max
assert new_bounds.min_size >= bounds.min_size
assert new_bounds.max_size <= bounds.max_size
Loading