From 5f283c26d5f1868edc5dbf06f3b720fdb5fc3188 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 10 Jan 2026 00:30:40 +0300 Subject: [PATCH 01/14] Native categorical support part 1 --- flexml/config/ml_models.py | 4 +- flexml/structures/supervised_base.py | 189 ++++++++++++++++++++++++--- 2 files changed, 172 insertions(+), 21 deletions(-) diff --git a/flexml/config/ml_models.py b/flexml/config/ml_models.py index 9e4147e..752572e 100644 --- a/flexml/config/ml_models.py +++ b/flexml/config/ml_models.py @@ -308,8 +308,8 @@ def get_ml_models( # Quick Classification Models LOGISTIC_REGRESSION = LogisticRegression(max_iter=1000, random_state=random_state, n_jobs=n_jobs) - XGBOOST_CLASSIFIER = XGBClassifier(objective=xgb_objective, random_state=random_state, n_jobs=n_jobs) - LIGHTGBM_CLASSIFIER = LGBMClassifier(verbose=-1, random_state=random_state, n_jobs=n_jobs) + XGBOOST_CLASSIFIER = XGBClassifier(enable_categorical=True, objective=xgb_objective, random_state=random_state, n_jobs=n_jobs) + LIGHTGBM_CLASSIFIER = LGBMClassifier(enable_categorical=True, verbose=-1, random_state=random_state, n_jobs=n_jobs) CATBOOST_CLASSIFIER = CatBoostClassifier(allow_writing_files=False, silent=True, random_seed=random_state, thread_count=n_jobs) DECISION_TREE_CLASSIFIER = DecisionTreeClassifier(random_state=random_state) RANDOM_FOREST_CLASSIFIER = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs) diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 78e7c82..9c797f4 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -41,6 +41,33 @@ pd.set_option('display.max_columns', None) +# Models that support native categorical features (no encoding needed) +NATIVE_CATEGORICAL_MODELS = { + 'CatBoostRegressor', 'CatBoostClassifier', + 'LGBMRegressor', 'LGBMClassifier', + 'XGBRegressor', 'XGBClassifier', + 'HistGradientBoostingRegressor', 'HistGradientBoostingClassifier' # sklearn also supports! +} + +def _get_encoded_categorical_columns(encoded_columns, original_cat_cols): + """ + Identifies which columns in the encoded dataframe came from categorical encoding. + Handles both label encoding (same name) and one-hot encoding (prefix_value format). + """ + encoded_cat_cols = [] + for col in encoded_columns: + # Check if it's the original column name (label/ordinal encoding) + if col in original_cat_cols: + encoded_cat_cols.append(col) + else: + # Check if it's a one-hot encoded column (format: original_value) + for orig_col in original_cat_cols: + if col.startswith(f"{orig_col}_"): + encoded_cat_cols.append(col) + break + return encoded_cat_cols + + class SupervisedBase: """ Base class for Supervised tasks (regression & classification) @@ -325,9 +352,15 @@ def __prepare_holdout_data(self, test_size: Optional[float] = None): ], axis=1) self.feature_engineer.setup(data=train_data) + self.categorical_columns = self.feature_engineer.categorical_columns + + # Store raw categorical columns for holdout data (for native-categorical models) + self.X_train_cat_raw = self._get_raw_categorical_data(train_data) + self.X_test_cat_raw = self._get_raw_categorical_data(test_data) self.X_train, self.y_train = self.feature_engineer.fit_transform() self.X_test, self.y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) + self.encoded_categorical_columns = _get_encoded_categorical_columns(self.X_train.columns, self.categorical_columns) self.feature_names = list(self.X_train.columns) self.y_class_mapping = self.feature_engineer.y_class_mapping @@ -380,6 +413,90 @@ def __top_n_models_checker(self, top_n_models: Optional[int]) -> int: return top_n_models + def _get_raw_categorical_data(self, data: pd.DataFrame) -> Optional[pd.DataFrame]: + """ + Extract raw categorical columns from data and convert to category dtype. + Used to preserve original categoricals before encoding for native-categorical models. + + Parameters + ---------- + data : pd.DataFrame + The data containing categorical columns + + Returns + ------- + Optional[pd.DataFrame] + DataFrame with categorical columns converted to 'category' dtype, or None if no categorical columns + """ + if not hasattr(self, 'categorical_columns') or not self.categorical_columns: + return None + cat_df = data[self.categorical_columns].copy() + for col in self.categorical_columns: + cat_df[col] = cat_df[col].astype('category') + return cat_df + + def _prepare_data_for_model( + self, + model_name: str, + X_encoded: pd.DataFrame, + X_cat_raw: Optional[pd.DataFrame] = None + ) -> pd.DataFrame: + """ + Prepares data for a specific model by swapping encoded categoricals + with raw ones for models that support native categorical features. + + Parameters + ---------- + model_name : str + The name of the model + X_encoded : pd.DataFrame + The encoded feature data + X_cat_raw : Optional[pd.DataFrame] + The raw categorical columns (with 'category' dtype) + + Returns + ------- + pd.DataFrame + Prepared feature data for the model + """ + if model_name in NATIVE_CATEGORICAL_MODELS and X_cat_raw is not None and hasattr(self, 'encoded_categorical_columns') and self.encoded_categorical_columns: + # Drop encoded categorical columns + X_final = X_encoded.drop(columns=self.encoded_categorical_columns, errors='ignore') + # Merge with raw categorical columns (aligned by index) + X_final = pd.concat([X_final, X_cat_raw.loc[X_final.index]], axis=1) + return X_final + return X_encoded + + def _fit_model( + self, + model: object, + X: pd.DataFrame, + y: pd.Series, + model_name: Optional[str] = None + ): + """ + Fits a model with proper categorical feature handling. + Passes cat_features to CatBoost models for native categorical support. + + Parameters + ---------- + model : object + The model to fit + X : pd.DataFrame + The feature data + y : pd.Series + The target data + model_name : Optional[str] + The name of the model (if None, uses model's class name) + """ + if model_name is None: + model_name = model.__class__.__name__ + + if 'CatBoost' in model_name and hasattr(self, 'categorical_columns') and self.categorical_columns: + model.fit(X, y, cat_features=self.categorical_columns) + else: + model.fit(X, y) + def __process_experiment_result(self, experiment_stats: dict): """ Processes and aggregates the results of an experiment, calculating average metrics and selecting the best model. @@ -625,9 +742,13 @@ def start_experiment( ], axis=1) self.feature_engineer.setup(data=train_data) + + # Save raw categorical columns BEFORE encoding (for native-categorical models) + X_train_cat_raw = self._get_raw_categorical_data(train_data) + X_test_cat_raw = self._get_raw_categorical_data(test_data) - X_train, y_train = self.feature_engineer.fit_transform() - X_test, y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) + X_train_encoded, y_train = self.feature_engineer.fit_transform() + X_test_encoded, y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) for model_idx in range(len(self.__ML_MODELS)): model_info = self.__ML_MODELS[model_idx] @@ -638,19 +759,24 @@ def start_experiment( continue # Skip already trained or raised error models model = model_info['model'] + + # Prepare data based on model type (native categorical vs encoded) + X_train_final = self._prepare_data_for_model(model_name, X_train_encoded, X_train_cat_raw) + X_test_final = self._prepare_data_for_model(model_name, X_test_encoded, X_test_cat_raw) + try: all_metrics = [] all_times = [] t_start = time() - model.fit(X_train, y_train) + self._fit_model(model, X_train_final, y_train, model_name) t_end = time() time_taken = round(t_end - t_start, 2) if self.__ML_TASK_TYPE == "Classification" and hasattr(model, 'predict_proba'): - y_pred = model.predict_proba(X_test) + y_pred = model.predict_proba(X_test_final) else: - y_pred = model.predict(X_test) + y_pred = model.predict(X_test_final) model_perf = evaluate_model_perf( self.__ML_TASK_TYPE, @@ -860,8 +986,17 @@ def save_model( if not already_trained: self.__logger.info("Training the model using the whole data") self.feature_engineer.setup(data=self.data) - X_train, y_train = self.feature_engineer.fit_transform() - model.fit(X_train, y_train) + + # Get raw categoricals before encoding + X_cat_raw = self._get_raw_categorical_data(self.data) + + X_train_encoded, y_train = self.feature_engineer.fit_transform() + + # Prepare data for this specific model + X_train_final = self._prepare_data_for_model(model_name, X_train_encoded, X_cat_raw) + + # Fit with proper cat_features handling + self._fit_model(model, X_train_final, y_train, model_name) # find the model in leaderboard and update the full_train to True, and update the model object in there for model_info in self.__model_training_info: @@ -975,8 +1110,13 @@ def _predict_helper( if not already_trained: self.__logger.info("Training the model using the whole data") self.feature_engineer.setup(data=self.data) - X_train, y_train = self.feature_engineer.fit_transform() - model.fit(X_train, y_train) + + # Get raw categoricals before encoding + X_cat_raw = self._get_raw_categorical_data(self.data) + + X_train_encoded, y_train = self.feature_engineer.fit_transform() + X_train_final = self._prepare_data_for_model(model_name, X_train_encoded, X_cat_raw) + self._fit_model(model, X_train_final, y_train, model_name) # find the model in leaderboard and update the full_train to True, and update the model object in there for model_info in self.__model_training_info: @@ -987,10 +1127,11 @@ def _predict_helper( break # Update leaderboard self.get_best_models() - X_test = self.feature_engineer.transform(test_data) - else: - X_test = self.feature_engineer.transform(test_data) + # Transform test data and prepare for model + X_test_encoded = self.feature_engineer.transform(test_data) + X_test_cat_raw = self._get_raw_categorical_data(test_data) + X_test = self._prepare_data_for_model(model_name, X_test_encoded, X_test_cat_raw) return model, X_test @@ -1067,7 +1208,13 @@ def __add_holdout_model_to_stats(self, model: object, model_name: Optional[str] model_name = model.__class__.__name__ model_copy = deepcopy(model) - model_copy.fit(self.X_train, self.y_train) + + # Prepare holdout data for this model (use raw categoricals for native-categorical models) + X_train_final = self._prepare_data_for_model(model_name, self.X_train, self.X_train_cat_raw) + + # Fit with proper categorical handling + self._fit_model(model_copy, X_train_final, self.y_train, model_name) + self._holdout_model_objects[model_name] = model_copy return model_copy @@ -1147,11 +1294,15 @@ def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_ else: model = self.__add_holdout_model_to_stats(model, model_name) - # If kind expects predictions + # Prepare holdout data for this model (use raw categoricals for native-categorical models) + X_train_final = self._prepare_data_for_model(model_name, self.X_train, self.X_train_cat_raw) + X_test_final = self._prepare_data_for_model(model_name, self.X_test, self.X_test_cat_raw) + + # If kind expects predictions if kind in ["confusion_matrix"]: - preds = model.predict(self.X_test) + preds = model.predict(X_test_final) elif kind in ["roc_curve", "calibration_curve"]: - preds = model.predict_proba(self.X_test) + preds = model.predict_proba(X_test_final) graph = None @@ -1164,13 +1315,13 @@ def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_ elif kind == "roc_curve": graph = plot_roc_curve(self.y_test, preds, self.y_class_mapping, **kwargs) elif kind == "residuals": - graph = plot_residuals(model, self.X_train, self.y_train, self.X_test, self.y_test, **kwargs) + graph = plot_residuals(model, X_train_final, self.y_train, X_test_final, self.y_test, **kwargs) elif kind == "prediction_error": - graph = plot_prediction_error(model, self.X_train, self.y_train, self.X_test, self.y_test, **kwargs) + graph = plot_prediction_error(model, X_train_final, self.y_train, X_test_final, self.y_test, **kwargs) elif kind == "calibration_curve": graph = plot_calibration_curve(self.y_test, preds, self.y_class_mapping, **kwargs) elif 'shap' in kind: - graph = plot_shap(model, self.X_test, kind, **kwargs) + graph = plot_shap(model, X_test_final, kind, **kwargs) else: error_msg = f"Invalid plot type: {kind}. Available plot types: {available_plot_types}" self.__logger.error(error_msg) From 97a82d2ab0ceef41857bd4a9d10ae835489c242e Mon Sep 17 00:00:00 2001 From: Ozgur Aslan <56040583+ozguraslank@users.noreply.github.com> Date: Sat, 10 Jan 2026 18:29:53 +0300 Subject: [PATCH 02/14] Native categorical support part 2 --- flexml/_feature_engineer.py | 27 +++++++++++++++++++ flexml/_model_tuner.py | 6 ++++- flexml/structures/supervised_base.py | 40 ++++++++++++++++++++++++++-- 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/flexml/_feature_engineer.py b/flexml/_feature_engineer.py index f2d8897..e306e33 100644 --- a/flexml/_feature_engineer.py +++ b/flexml/_feature_engineer.py @@ -27,6 +27,33 @@ def transform(self, X): A DataFrame with the specified columns dropped """ return X.drop(columns=self.drop_columns, axis=1, errors='ignore') + + +class CategoricalTypeConverter(BaseEstimator, TransformerMixin): + """ + A transformer to convert categorical columns to 'category' dtype. + Used for tree-based models that support native categorical features. + """ + def __init__(self, categorical_columns: Optional[List[str]] = None): + self.categorical_columns = categorical_columns or [] + + def fit(self, X, y=None): + return self + + def transform(self, X): + """ + Converts specified categorical columns to 'category' dtype + + Returns + ------- + pd.DataFrame + A DataFrame with categorical columns converted to 'category' dtype + """ + X = X.copy() + for col in self.categorical_columns: + if col in X.columns: + X[col] = X[col].astype('category') + return X class ColumnImputer(BaseEstimator, TransformerMixin): diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py index 6153719..c769c67 100644 --- a/flexml/_model_tuner.py +++ b/flexml/_model_tuner.py @@ -204,7 +204,11 @@ def _setup_tuning( model = model.named_steps['model'] if "CatBoost" in model.__class__.__name__: - model_params = model.get_all_params() + # Use get_all_params() only if the model is fitted, otherwise use get_params() + if model.is_fitted(): + model_params = model.get_all_params() + else: + model_params = model.get_params() else: model_params = model.get_params() diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 9c797f4..ef4b41f 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -1020,6 +1020,14 @@ def save_model( return model + # Warn users about native categorical models in Pipeline mode + if model_name in NATIVE_CATEGORICAL_MODELS and hasattr(self, 'categorical_columns') and self.categorical_columns: + self.__logger.warning( + f"'{model_name}' supports native categorical features, but Pipeline mode encodes categorical data. " + f"For optimal performance, consider using 'model_only=True' and handle feature engineering separately, " + f"or use the 'predict()' method directly which handles native categoricals automatically." + ) + # Add the model to the pipeline pipeline_steps.append(('model', model)) @@ -1749,8 +1757,36 @@ def _show_tuning_report(tuning_report: Optional[dict] = None): y_encoded = self.y # No need to encode the target for regression or if the target is already encoded self.model_tuner = ModelTuner(self.__ML_TASK_TYPE, self.X, y_encoded, self.logging_to_file) - pipeline = self.feature_engineer.pipeline - pipeline = Pipeline(steps=pipeline.steps + [('model', model)]) + # Get model name for native categorical check + model_name = model.__class__.__name__ + + # Check if model supports native categorical features + if model_name in NATIVE_CATEGORICAL_MODELS and hasattr(self, 'categorical_columns') and self.categorical_columns: + # Clone the model to avoid modifying the fitted model + from sklearn.base import clone + model = clone(model) + + # For CatBoost, set cat_features parameter on the cloned model + if 'CatBoost' in model_name: + model.set_params(cat_features=list(self.categorical_columns)) + + # Create pipeline WITHOUT the encoder step (keep other steps like imputer, normalizer) + # and add a step to convert categoricals to 'category' dtype + from flexml._feature_engineer import CategoricalTypeConverter + pipeline_steps_without_encoder = [ + (name, step) for name, step in self.feature_engineer.pipeline.steps + if name != 'encoder' + ] + # Add categorical type converter for native categorical models + pipeline_steps_without_encoder.append( + ('cat_type_converter', CategoricalTypeConverter(list(self.categorical_columns))) + ) + pipeline = Pipeline(steps=pipeline_steps_without_encoder + [('model', model)]) + + self.__logger.info(f"Using native categorical features for {model_name} during tuning (encoding step removed)") + else: + # Standard pipeline with encoding for non-native categorical models + pipeline = Pipeline(steps=self.feature_engineer.pipeline.steps + [('model', model)]) self.__logger.info(f"[PROCESS] Model Tuning process started with '{tuning_method}' method") tuning_method = tuning_method.lower() From a27e1eac27beda39de3e31eb93ac1dc7fa255af1 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 10 Jan 2026 22:48:11 +0300 Subject: [PATCH 03/14] Native categorical support part 3 --- flexml/_model_tuner.py | 8 +------- flexml/structures/supervised_base.py | 24 +++++++----------------- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py index c769c67..a8bc0cd 100644 --- a/flexml/_model_tuner.py +++ b/flexml/_model_tuner.py @@ -198,17 +198,11 @@ def _setup_tuning( * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model """ - model_params = None - if isinstance(model, Pipeline): model = model.named_steps['model'] if "CatBoost" in model.__class__.__name__: - # Use get_all_params() only if the model is fitted, otherwise use get_params() - if model.is_fitted(): - model_params = model.get_all_params() - else: - model_params = model.get_params() + model_params = model.get_all_params() else: model_params = model.get_params() diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index ef4b41f..649eaa0 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -34,7 +34,7 @@ ) from flexml.structures.custom_score import CustomScore from flexml._model_tuner import ModelTuner -from flexml._feature_engineer import FeatureEngineering +from flexml._feature_engineer import FeatureEngineering, CategoricalTypeConverter import warnings warnings.filterwarnings("ignore") @@ -493,7 +493,10 @@ def _fit_model( model_name = model.__class__.__name__ if 'CatBoost' in model_name and hasattr(self, 'categorical_columns') and self.categorical_columns: - model.fit(X, y, cat_features=self.categorical_columns) + # check if model is fitted: + if not model.is_fitted(): + model.set_params(cat_features=list(self.categorical_columns)) + model.fit(X, y) else: model.fit(X, y) @@ -1759,20 +1762,9 @@ def _show_tuning_report(tuning_report: Optional[dict] = None): # Get model name for native categorical check model_name = model.__class__.__name__ - + # Check if model supports native categorical features - if model_name in NATIVE_CATEGORICAL_MODELS and hasattr(self, 'categorical_columns') and self.categorical_columns: - # Clone the model to avoid modifying the fitted model - from sklearn.base import clone - model = clone(model) - - # For CatBoost, set cat_features parameter on the cloned model - if 'CatBoost' in model_name: - model.set_params(cat_features=list(self.categorical_columns)) - - # Create pipeline WITHOUT the encoder step (keep other steps like imputer, normalizer) - # and add a step to convert categoricals to 'category' dtype - from flexml._feature_engineer import CategoricalTypeConverter + if model_name in NATIVE_CATEGORICAL_MODELS and hasattr(self, 'categorical_columns') and len(self.categorical_columns) > 0: pipeline_steps_without_encoder = [ (name, step) for name, step in self.feature_engineer.pipeline.steps if name != 'encoder' @@ -1782,8 +1774,6 @@ def _show_tuning_report(tuning_report: Optional[dict] = None): ('cat_type_converter', CategoricalTypeConverter(list(self.categorical_columns))) ) pipeline = Pipeline(steps=pipeline_steps_without_encoder + [('model', model)]) - - self.__logger.info(f"Using native categorical features for {model_name} during tuning (encoding step removed)") else: # Standard pipeline with encoding for non-native categorical models pipeline = Pipeline(steps=self.feature_engineer.pipeline.steps + [('model', model)]) From e79cb86e1913516fdebe64dde5f0fec3d069ee47 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sun, 11 Jan 2026 17:23:17 +0300 Subject: [PATCH 04/14] Native categorical support part 4 --- flexml/config/__init__.py | 1 + flexml/config/supervised_config.py | 8 + flexml/structures/supervised_base.py | 395 +++++++++++++++------------ 3 files changed, 226 insertions(+), 178 deletions(-) diff --git a/flexml/config/__init__.py b/flexml/config/__init__.py index f56a684..2346f85 100644 --- a/flexml/config/__init__.py +++ b/flexml/config/__init__.py @@ -3,6 +3,7 @@ ) from flexml.config.supervised_config import ( + NATIVE_CATEGORICAL_MODELS, EVALUATION_METRICS, TUNING_METRIC_TRANSFORMATIONS, CROSS_VALIDATION_METHODS, diff --git a/flexml/config/supervised_config.py b/flexml/config/supervised_config.py index 9c61dca..25b066e 100644 --- a/flexml/config/supervised_config.py +++ b/flexml/config/supervised_config.py @@ -1,3 +1,11 @@ +# Models that support native categorical features +NATIVE_CATEGORICAL_MODELS = { + 'CatBoostRegressor', 'CatBoostClassifier', + 'LGBMRegressor', 'LGBMClassifier', + 'XGBRegressor', 'XGBClassifier', + 'HistGradientBoostingRegressor', 'HistGradientBoostingClassifier' +} + # Regression & Classification Evaluation Metrics EVALUATION_METRICS = { "Regression": {"DEFAULT": "R2", diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 649eaa0..bd5709f 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -12,6 +12,7 @@ from flexml.logger import get_logger from flexml.config import ( get_ml_models, + NATIVE_CATEGORICAL_MODELS, EVALUATION_METRICS, CROSS_VALIDATION_METHODS, PLOT_TYPES @@ -41,33 +42,6 @@ pd.set_option('display.max_columns', None) -# Models that support native categorical features (no encoding needed) -NATIVE_CATEGORICAL_MODELS = { - 'CatBoostRegressor', 'CatBoostClassifier', - 'LGBMRegressor', 'LGBMClassifier', - 'XGBRegressor', 'XGBClassifier', - 'HistGradientBoostingRegressor', 'HistGradientBoostingClassifier' # sklearn also supports! -} - -def _get_encoded_categorical_columns(encoded_columns, original_cat_cols): - """ - Identifies which columns in the encoded dataframe came from categorical encoding. - Handles both label encoding (same name) and one-hot encoding (prefix_value format). - """ - encoded_cat_cols = [] - for col in encoded_columns: - # Check if it's the original column name (label/ordinal encoding) - if col in original_cat_cols: - encoded_cat_cols.append(col) - else: - # Check if it's a one-hot encoded column (format: original_value) - for orig_col in original_cat_cols: - if col.startswith(f"{orig_col}_"): - encoded_cat_cols.append(col) - break - return encoded_cat_cols - - class SupervisedBase: """ Base class for Supervised tasks (regression & classification) @@ -354,14 +328,15 @@ def __prepare_holdout_data(self, test_size: Optional[float] = None): self.feature_engineer.setup(data=train_data) self.categorical_columns = self.feature_engineer.categorical_columns - # Store raw categorical columns for holdout data (for native-categorical models) - self.X_train_cat_raw = self._get_raw_categorical_data(train_data) - self.X_test_cat_raw = self._get_raw_categorical_data(test_data) - - self.X_train, self.y_train = self.feature_engineer.fit_transform() - self.X_test, self.y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) - self.encoded_categorical_columns = _get_encoded_categorical_columns(self.X_train.columns, self.categorical_columns) - self.feature_names = list(self.X_train.columns) + # Store raw holdout data (preprocessing will be done per-model when needed) + self.X_train_raw = train_data.drop(columns=[self.target_col]) + self.X_test_raw = test_data.drop(columns=[self.target_col]) + self.y_train, self.y_test = self._encode_target( + train_data[self.target_col], + test_data[self.target_col] + ) + + self.feature_names = list(self.X_train_raw.columns) self.y_class_mapping = self.feature_engineer.y_class_mapping def __prepare_models(self, experiment_size: str, num_class: int, random_state: Optional[int] = None, n_jobs: Optional[int] = -1): @@ -413,60 +388,6 @@ def __top_n_models_checker(self, top_n_models: Optional[int]) -> int: return top_n_models - def _get_raw_categorical_data(self, data: pd.DataFrame) -> Optional[pd.DataFrame]: - """ - Extract raw categorical columns from data and convert to category dtype. - Used to preserve original categoricals before encoding for native-categorical models. - - Parameters - ---------- - data : pd.DataFrame - The data containing categorical columns - - Returns - ------- - Optional[pd.DataFrame] - DataFrame with categorical columns converted to 'category' dtype, or None if no categorical columns - """ - if not hasattr(self, 'categorical_columns') or not self.categorical_columns: - return None - cat_df = data[self.categorical_columns].copy() - for col in self.categorical_columns: - cat_df[col] = cat_df[col].astype('category') - return cat_df - - def _prepare_data_for_model( - self, - model_name: str, - X_encoded: pd.DataFrame, - X_cat_raw: Optional[pd.DataFrame] = None - ) -> pd.DataFrame: - """ - Prepares data for a specific model by swapping encoded categoricals - with raw ones for models that support native categorical features. - - Parameters - ---------- - model_name : str - The name of the model - X_encoded : pd.DataFrame - The encoded feature data - X_cat_raw : Optional[pd.DataFrame] - The raw categorical columns (with 'category' dtype) - - Returns - ------- - pd.DataFrame - Prepared feature data for the model - """ - if model_name in NATIVE_CATEGORICAL_MODELS and X_cat_raw is not None and hasattr(self, 'encoded_categorical_columns') and self.encoded_categorical_columns: - # Drop encoded categorical columns - X_final = X_encoded.drop(columns=self.encoded_categorical_columns, errors='ignore') - # Merge with raw categorical columns (aligned by index) - X_final = pd.concat([X_final, X_cat_raw.loc[X_final.index]], axis=1) - return X_final - return X_encoded - def _fit_model( self, model: object, @@ -500,6 +421,58 @@ def _fit_model( else: model.fit(X, y) + def _encode_target( + self, + y: pd.Series, + y_test: Optional[pd.Series] = None, + fit: bool = True + ) -> Union[pd.Series, tuple]: + """ + Encodes the target variable for classification tasks. + + Parameters + ---------- + y : pd.Series + The target variable to encode + y_test : pd.Series, optional + Test target to transform (uses already fitted encoder) + fit : bool + If True, fits the encoder on y. If False, only transforms. + + Returns + ------- + pd.Series or tuple + Encoded y, or (encoded_y, encoded_y_test) if y_test provided + """ + # Skip encoding for regression or already numeric targets + if self.__ML_TASK_TYPE != 'Classification' or y.dtype not in ['object', 'category']: + return (y, y_test) if y_test is not None else y + + # Encode y + if fit: + encoded_y = pd.Series( + self.feature_engineer.target_encoder.fit_transform(y), + name=y.name, + index=y.index + ) + else: + encoded_y = pd.Series( + self.feature_engineer.target_encoder.transform(y), + name=y.name, + index=y.index + ) + + # Encode y_test if provided + if y_test is not None: + encoded_y_test = pd.Series( + self.feature_engineer.target_encoder.transform(y_test), + name=y_test.name, + index=y_test.index + ) + return encoded_y, encoded_y_test + + return encoded_y + def __process_experiment_result(self, experiment_stats: dict): """ Processes and aggregates the results of an experiment, calculating average metrics and selecting the best model. @@ -745,13 +718,14 @@ def start_experiment( ], axis=1) self.feature_engineer.setup(data=train_data) - - # Save raw categorical columns BEFORE encoding (for native-categorical models) - X_train_cat_raw = self._get_raw_categorical_data(train_data) - X_test_cat_raw = self._get_raw_categorical_data(test_data) - X_train_encoded, y_train = self.feature_engineer.fit_transform() - X_test_encoded, y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) + # Get raw X and y from train/test data + X_train_raw = train_data.drop(columns=[self.target_col]) + X_test_raw = test_data.drop(columns=[self.target_col]) + y_train, y_test = self._encode_target( + train_data[self.target_col], + test_data[self.target_col] + ) for model_idx in range(len(self.__ML_MODELS)): model_info = self.__ML_MODELS[model_idx] @@ -763,9 +737,12 @@ def start_experiment( model = model_info['model'] - # Prepare data based on model type (native categorical vs encoded) - X_train_final = self._prepare_data_for_model(model_name, X_train_encoded, X_train_cat_raw) - X_test_final = self._prepare_data_for_model(model_name, X_test_encoded, X_test_cat_raw) + # Get preprocessing pipeline for this specific model + preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) + + # Transform data using model-specific preprocessing + X_train_final = preprocessing_pipeline.fit_transform(X_train_raw) + X_test_final = preprocessing_pipeline.transform(X_test_raw) try: all_metrics = [] @@ -974,32 +951,51 @@ def save_model( raise ValueError(error_msg) else: # If model is an object, we can't know its name, so we use its class name model_name = model.__class__.__name__ - - # Initialize pipeline steps - pipeline_steps = [] - - # Initialize and setup feature engineering if needed - if not model_only: - # Add the feature engineering pipeline directly - pipeline_steps.extend(self.feature_engineer.pipeline.steps) # Handle full training scenario if required if full_train: already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) - if not already_trained: - self.__logger.info("Training the model using the whole data") + + # Check if this is a native categorical model that will use a different pipeline structure + is_native_cat_model = ( + model_name in NATIVE_CATEGORICAL_MODELS and + hasattr(self, 'categorical_columns') and + len(self.categorical_columns) > 0 and + not model_only # Only use special flow if we're saving a pipeline + ) + + # For native categorical models being saved as pipeline: + # ALWAYS retrain using the pipeline structure, even if previously "full trained" + # because the previous training used encode→swap, not the pipeline structure + needs_training = not already_trained or is_native_cat_model + + if needs_training: + if is_native_cat_model and already_trained: + self.__logger.info( + f"Retraining '{model_name}' to match pipeline structure for native categorical support." + ) + else: + self.__logger.info("Training the model using the whole data") + self.feature_engineer.setup(data=self.data) - # Get raw categoricals before encoding - X_cat_raw = self._get_raw_categorical_data(self.data) + # Get preprocessing pipeline for this model + preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) - X_train_encoded, y_train = self.feature_engineer.fit_transform() + # Prepare training data + X_raw = self.data.drop(columns=[self.target_col]) + y_train = self._encode_target(self.data[self.target_col]) - # Prepare data for this specific model - X_train_final = self._prepare_data_for_model(model_name, X_train_encoded, X_cat_raw) + # Fit and transform data through the preprocessing pipeline + X_train_final = preprocessing_pipeline.fit_transform(X_raw) - # Fit with proper cat_features handling + # Fit model with proper cat_features handling self._fit_model(model, X_train_final, y_train, model_name) + + if is_native_cat_model: + self.__logger.info(f"Model '{model_name}' trained using native categorical pipeline.") + else: + self.__logger.info(f"Model '{model_name}' trained with full data.") # find the model in leaderboard and update the full_train to True, and update the model object in there for model_info in self.__model_training_info: @@ -1023,19 +1019,8 @@ def save_model( return model - # Warn users about native categorical models in Pipeline mode - if model_name in NATIVE_CATEGORICAL_MODELS and hasattr(self, 'categorical_columns') and self.categorical_columns: - self.__logger.warning( - f"'{model_name}' supports native categorical features, but Pipeline mode encodes categorical data. " - f"For optimal performance, consider using 'model_only=True' and handle feature engineering separately, " - f"or use the 'predict()' method directly which handles native categoricals automatically." - ) - - # Add the model to the pipeline - pipeline_steps.append(('model', model)) - - # Create the pipeline - pipeline = Pipeline(pipeline_steps) + # Build pipeline with proper handling for native categorical models + pipeline = self._get_model_pipeline(model, include_model=True) # Save the pipeline try: @@ -1074,6 +1059,68 @@ def _check_if_model_is_full_trained(self, model_name: str, model_taken_from_lead return True return False + def _is_native_categorical_model(self, model_name: str) -> bool: + """Check if model supports native categorical features.""" + return ( + model_name in NATIVE_CATEGORICAL_MODELS and + hasattr(self, 'categorical_columns') and + len(self.categorical_columns) > 0 + ) + + def _get_preprocessing_steps(self, model_name: str) -> list: + """ + Returns the appropriate preprocessing steps for a given model. + + For native categorical models: no encoder, uses CategoricalTypeConverter + For other models: includes encoder + + Parameters + ---------- + model_name : str + The name of the model + + Returns + ------- + list + List of preprocessing steps as (name, transformer) tuples + """ + if self._is_native_categorical_model(model_name): + # Pipeline without encoder, with CategoricalTypeConverter + steps = [ + (name, step) for name, step in self.feature_engineer.pipeline.steps + if name != 'encoder' + ] + steps.append(('cat_type_converter', CategoricalTypeConverter(list(self.categorical_columns)))) + else: + # Standard pipeline with encoder + steps = list(self.feature_engineer.pipeline.steps) + + return steps + + def _get_model_pipeline(self, model, include_model: bool = True) -> Pipeline: + """ + Returns a complete Pipeline for a given model. + + Parameters + ---------- + model : object + The model object + include_model : bool, optional + Whether to include the model as the last step (default: True) + + Returns + ------- + Pipeline + sklearn Pipeline with preprocessing steps (and optionally the model) + """ + model_name = model.__class__.__name__ + steps = self._get_preprocessing_steps(model_name) + + if include_model: + steps.append(('model', model)) + + return Pipeline(steps) + def _predict_helper( self, test_data: pd.DataFrame, @@ -1097,8 +1144,8 @@ def _predict_helper( if extra: error_msg += f" Extra: {extra}." raise ValueError(error_msg) - model_taken_from_leaderboard = False # If the model object is from leaderboard, track this - + # Get model from leaderboard or use provided model + model_taken_from_leaderboard = False if model is None: model = self.get_best_models() model_name = self.__last_searched_model_name @@ -1111,38 +1158,38 @@ def _predict_helper( model_name = model model = self.get_model_by_name(model) model_taken_from_leaderboard = True - else: # If model is an object, we can't know its name, so we use its class name + else: model_name = model.__class__.__name__ - # Prepare training data if needed + # Get the preprocessing pipeline for this model (consistent with save_model) + self.feature_engineer.setup(data=self.data) + preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) + + # Train model on full data if needed if full_train: - # Check If model_taken_from_leaderboard is True and Full Train in self.__model_training_info is True, then we don't need to train the model again already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) if not already_trained: self.__logger.info("Training the model using the whole data") - self.feature_engineer.setup(data=self.data) - # Get raw categoricals before encoding - X_cat_raw = self._get_raw_categorical_data(self.data) + # Prepare training data + X_raw = self.data.drop(columns=[self.target_col]) + y_train = self._encode_target(self.data[self.target_col]) - X_train_encoded, y_train = self.feature_engineer.fit_transform() - X_train_final = self._prepare_data_for_model(model_name, X_train_encoded, X_cat_raw) + # Fit and transform through preprocessing pipeline + X_train_final = preprocessing_pipeline.fit_transform(X_raw) self._fit_model(model, X_train_final, y_train, model_name) - # find the model in leaderboard and update the full_train to True, and update the model object in there + # Update leaderboard for model_info in self.__model_training_info: for name, info in model_info.items(): if name == model_name: info["model_stats"]["Full Train"] = True info["model"] = model break - # Update leaderboard self.get_best_models() - # Transform test data and prepare for model - X_test_encoded = self.feature_engineer.transform(test_data) - X_test_cat_raw = self._get_raw_categorical_data(test_data) - X_test = self._prepare_data_for_model(model_name, X_test_encoded, X_test_cat_raw) + # Transform test data through the same preprocessing pipeline + X_test = preprocessing_pipeline.transform(test_data) return model, X_test @@ -1209,7 +1256,8 @@ def predict_proba( model, X_test = self._predict_helper(test_data, model, full_train) return model.predict_proba(X_test) - def __get_holdout_model_from_stats(self, model_name: str) -> object: + def __get_holdout_model_from_stats(self, model_name: str) -> Optional[dict]: + """Returns dict with 'model' and 'preprocessing_pipeline' keys, or None.""" if self._holdout_model_objects is None or self._holdout_model_objects == {}: return None return self._holdout_model_objects.get(model_name) @@ -1220,13 +1268,18 @@ def __add_holdout_model_to_stats(self, model: object, model_name: Optional[str] model_copy = deepcopy(model) - # Prepare holdout data for this model (use raw categoricals for native-categorical models) - X_train_final = self._prepare_data_for_model(model_name, self.X_train, self.X_train_cat_raw) + # Get preprocessing pipeline for this model + preprocessing_pipeline = self._get_model_pipeline(model_copy, include_model=False) - # Fit with proper categorical handling + # Transform holdout training data and fit model + X_train_final = preprocessing_pipeline.fit_transform(self.X_train_raw) self._fit_model(model_copy, X_train_final, self.y_train, model_name) - self._holdout_model_objects[model_name] = model_copy + # Store the fitted preprocessing pipeline with the model for later use + self._holdout_model_objects[model_name] = { + 'model': model_copy, + 'preprocessing_pipeline': preprocessing_pipeline + } return model_copy def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_importance", **kwargs): @@ -1297,17 +1350,22 @@ def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_ elif isinstance(model, str): model_name = model model = self.get_model_by_name(model) - else: # If model is an object, we can't know its name, so we use its class name + else: model_name = model.__class__.__name__ - if self.__get_holdout_model_from_stats(model_name) is not None: - model = self.__get_holdout_model_from_stats(model_name) + # Get or create holdout model with its preprocessing pipeline + holdout_data = self.__get_holdout_model_from_stats(model_name) + if holdout_data is not None: + model = holdout_data['model'] + preprocessing_pipeline = holdout_data['preprocessing_pipeline'] else: model = self.__add_holdout_model_to_stats(model, model_name) + holdout_data = self.__get_holdout_model_from_stats(model_name) + preprocessing_pipeline = holdout_data['preprocessing_pipeline'] - # Prepare holdout data for this model (use raw categoricals for native-categorical models) - X_train_final = self._prepare_data_for_model(model_name, self.X_train, self.X_train_cat_raw) - X_test_final = self._prepare_data_for_model(model_name, self.X_test, self.X_test_cat_raw) + # Transform holdout data using the model's preprocessing pipeline + X_train_final = preprocessing_pipeline.transform(self.X_train_raw) + X_test_final = preprocessing_pipeline.transform(self.X_test_raw) # If kind expects predictions if kind in ["confusion_matrix"]: @@ -1319,7 +1377,7 @@ def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_ if kind == "feature_importance": if not hasattr(self, 'feature_names'): - self.feature_names = list(self.X_train.columns) + self.feature_names = list(self.X_train_raw.columns) graph = plot_feature_importance(model, self.feature_names, **kwargs) elif kind == "confusion_matrix": graph = plot_confusion_matrix(self.y_test, preds, self.y_class_mapping, **kwargs) @@ -1751,32 +1809,13 @@ def _show_tuning_report(tuning_report: Optional[dict] = None): logging_to_file=self.logging_to_file )) - # Create the ModelTuner object If It's not created before, avoid creating it everytime tune_model() function is called + # Create the ModelTuner object If It's not created before if not hasattr(self, 'model_tuner'): - if self.__ML_TASK_TYPE == 'Classification' and self.y.dtype in ['object', 'category']: - y_encoded = pd.Series(self.feature_engineer.target_encoder.fit_transform(self.y), name=self.target_col) - y_encoded.index = self.y.index - else: - y_encoded = self.y # No need to encode the target for regression or if the target is already encoded + y_encoded = self._encode_target(self.y) self.model_tuner = ModelTuner(self.__ML_TASK_TYPE, self.X, y_encoded, self.logging_to_file) - # Get model name for native categorical check - model_name = model.__class__.__name__ - - # Check if model supports native categorical features - if model_name in NATIVE_CATEGORICAL_MODELS and hasattr(self, 'categorical_columns') and len(self.categorical_columns) > 0: - pipeline_steps_without_encoder = [ - (name, step) for name, step in self.feature_engineer.pipeline.steps - if name != 'encoder' - ] - # Add categorical type converter for native categorical models - pipeline_steps_without_encoder.append( - ('cat_type_converter', CategoricalTypeConverter(list(self.categorical_columns))) - ) - pipeline = Pipeline(steps=pipeline_steps_without_encoder + [('model', model)]) - else: - # Standard pipeline with encoding for non-native categorical models - pipeline = Pipeline(steps=self.feature_engineer.pipeline.steps + [('model', model)]) + # Build pipeline with proper handling for native categorical models + pipeline = self._get_model_pipeline(model, include_model=True) self.__logger.info(f"[PROCESS] Model Tuning process started with '{tuning_method}' method") tuning_method = tuning_method.lower() From 447d9f2299c358194d572d82641b46bf8c12015d Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sun, 11 Jan 2026 18:45:15 +0300 Subject: [PATCH 05/14] Delete unnecessary retrain in save_model --- flexml/structures/supervised_base.py | 34 ++++------------------------ 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index bd5709f..aed3b4f 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -956,27 +956,9 @@ def save_model( if full_train: already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) - # Check if this is a native categorical model that will use a different pipeline structure - is_native_cat_model = ( - model_name in NATIVE_CATEGORICAL_MODELS and - hasattr(self, 'categorical_columns') and - len(self.categorical_columns) > 0 and - not model_only # Only use special flow if we're saving a pipeline - ) - - # For native categorical models being saved as pipeline: - # ALWAYS retrain using the pipeline structure, even if previously "full trained" - # because the previous training used encode→swap, not the pipeline structure - needs_training = not already_trained or is_native_cat_model - - if needs_training: - if is_native_cat_model and already_trained: - self.__logger.info( - f"Retraining '{model_name}' to match pipeline structure for native categorical support." - ) - else: - self.__logger.info("Training the model using the whole data") - + if not already_trained: + self.__logger.info("Training the model using the whole data") + self.feature_engineer.setup(data=self.data) # Get preprocessing pipeline for this model @@ -989,22 +971,16 @@ def save_model( # Fit and transform data through the preprocessing pipeline X_train_final = preprocessing_pipeline.fit_transform(X_raw) - # Fit model with proper cat_features handling + # Fit model self._fit_model(model, X_train_final, y_train, model_name) - - if is_native_cat_model: - self.__logger.info(f"Model '{model_name}' trained using native categorical pipeline.") - else: - self.__logger.info(f"Model '{model_name}' trained with full data.") - # find the model in leaderboard and update the full_train to True, and update the model object in there + # Update leaderboard for model_info in self.__model_training_info: for name, info in model_info.items(): if name == model_name: info["model_stats"]["Full Train"] = True info["model"] = model break - # Update leaderboard self.get_best_models() # If no feature pipeline is included, return the model directly From fe52039544931a650546ce41d8b820cad3df1b24 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sun, 11 Jan 2026 18:59:37 +0300 Subject: [PATCH 06/14] Improve code quality --- flexml/structures/supervised_base.py | 56 +++++++++------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index aed3b4f..3609586 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -316,24 +316,17 @@ def __prepare_holdout_data(self, test_size: Optional[float] = None): )[0] train_labels, test_labels = holdout_cv_splits[0], holdout_cv_splits[1] - train_data = pd.concat([ - self.X.loc[train_labels], - self.y.loc[train_labels] - ], axis=1) - test_data = pd.concat([ - self.X.loc[test_labels], - self.y.loc[test_labels] - ], axis=1) - + # Setup feature engineer with train data + train_data = pd.concat([self.X.loc[train_labels], self.y.loc[train_labels]], axis=1) self.feature_engineer.setup(data=train_data) self.categorical_columns = self.feature_engineer.categorical_columns - # Store raw holdout data (preprocessing will be done per-model when needed) - self.X_train_raw = train_data.drop(columns=[self.target_col]) - self.X_test_raw = test_data.drop(columns=[self.target_col]) + # Store raw holdout data (use X/y directly instead of concat→drop) + self.X_train_raw = self.X.loc[train_labels] + self.X_test_raw = self.X.loc[test_labels] self.y_train, self.y_test = self._encode_target( - train_data[self.target_col], - test_data[self.target_col] + self.y.loc[train_labels], + self.y.loc[test_labels] ) self.feature_names = list(self.X_train_raw.columns) @@ -708,23 +701,16 @@ def start_experiment( train_labels = train_idx test_labels = test_idx - train_data = pd.concat([ - self.X.loc[train_labels], - self.y.loc[train_labels] - ], axis=1) - test_data = pd.concat([ - self.X.loc[test_labels], - self.y.loc[test_labels] - ], axis=1) - + # Setup feature engineer with train data + train_data = pd.concat([self.X.loc[train_labels], self.y.loc[train_labels]], axis=1) self.feature_engineer.setup(data=train_data) - # Get raw X and y from train/test data - X_train_raw = train_data.drop(columns=[self.target_col]) - X_test_raw = test_data.drop(columns=[self.target_col]) + # Use X/y directly instead of concat→drop + X_train_raw = self.X.loc[train_labels] + X_test_raw = self.X.loc[test_labels] y_train, y_test = self._encode_target( - train_data[self.target_col], - test_data[self.target_col] + self.y.loc[train_labels], + self.y.loc[test_labels] ) for model_idx in range(len(self.__ML_MODELS)): @@ -964,12 +950,9 @@ def save_model( # Get preprocessing pipeline for this model preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) - # Prepare training data - X_raw = self.data.drop(columns=[self.target_col]) - y_train = self._encode_target(self.data[self.target_col]) - # Fit and transform data through the preprocessing pipeline - X_train_final = preprocessing_pipeline.fit_transform(X_raw) + X_train_final = preprocessing_pipeline.fit_transform(self.X) + y_train = self._encode_target(self.y) # Fit model self._fit_model(model, X_train_final, y_train, model_name) @@ -1147,12 +1130,9 @@ def _predict_helper( if not already_trained: self.__logger.info("Training the model using the whole data") - # Prepare training data - X_raw = self.data.drop(columns=[self.target_col]) - y_train = self._encode_target(self.data[self.target_col]) - # Fit and transform through preprocessing pipeline - X_train_final = preprocessing_pipeline.fit_transform(X_raw) + X_train_final = preprocessing_pipeline.fit_transform(self.X) + y_train = self._encode_target(self.y) self._fit_model(model, X_train_final, y_train, model_name) # Update leaderboard From b108716f13c1f4dcff707c22775444f01b0b89a7 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sun, 11 Jan 2026 19:02:45 +0300 Subject: [PATCH 07/14] Improve supervised test coverage --- tests/test_supervised.py | 46 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/test_supervised.py b/tests/test_supervised.py index 992760a..1cb787b 100644 --- a/tests/test_supervised.py +++ b/tests/test_supervised.py @@ -266,4 +266,48 @@ def test_25_plot_multiclass_classification_shap_summary(self): def test_26_plot_multiclass_classification_shap_violin(self): exp_obj = self.test_config['MulticlassClassification']['exp_obj'] - exp_obj.plot("RandomForestClassifier", kind="shap_violin") \ No newline at end of file + exp_obj.plot("RandomForestClassifier", kind="shap_violin") + + def test_27_native_categorical_pipeline_consistency(self): + """Test that saved pipeline predictions match exp.predict_proba() for native categorical models""" + exp_obj = self.test_config['BinaryClassification']['exp_obj'] + test_data = self.test_config['BinaryClassification'].get('data').drop(columns=['target']) + + # Get predictions via FlexML (this trains with full data) + flexml_probs = exp_obj.predict_proba(test_data, model='LGBMClassifier', full_train=True) + + # Save pipeline (should use already trained model, no retraining) + save_path = "test_native_cat_pipeline.pkl" + exp_obj.save_model(model='LGBMClassifier', save_path=save_path, model_only=False, full_train=True) + + # Load and predict via pipeline + with open(save_path, 'rb') as f: + loaded_pipeline = pickle.load(f) + pipeline_probs = loaded_pipeline.predict_proba(test_data) + + # Predictions should match + np.testing.assert_array_almost_equal(flexml_probs, pipeline_probs, decimal=5, + err_msg="Loaded pipeline predictions don't match FlexML predictions") + os.remove(save_path) + + def test_28_predict_column_mismatch_error(self): + """Test that predict raises proper error for column mismatch""" + exp_obj = self.test_config['Regression']['exp_obj'] + test_data = self.test_config['Regression'].get('data').drop(columns=['target']) + + # Remove a column to create mismatch + bad_data = test_data.drop(columns=[test_data.columns[0]]) + + with self.assertRaises(ValueError) as context: + exp_obj.predict(bad_data, full_train=False) + + self.assertIn("Missing", str(context.exception)) + + def test_29_get_model_by_invalid_name(self): + """Test get_model_by_name raises error for invalid model name""" + exp_obj = self.test_config['Regression']['exp_obj'] + + with self.assertRaises(ValueError) as context: + exp_obj.get_model_by_name("NonExistentModel") + + self.assertIn("not found", str(context.exception)) \ No newline at end of file From 46b593f352be12fda12fcbca9a8132798fbee797 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sun, 11 Jan 2026 19:30:20 +0300 Subject: [PATCH 08/14] Error fix for ordinal encoding --- flexml/_feature_engineer.py | 36 ++++++++++++++++++++++++---- flexml/structures/supervised_base.py | 7 +++++- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/flexml/_feature_engineer.py b/flexml/_feature_engineer.py index e306e33..b9ef060 100644 --- a/flexml/_feature_engineer.py +++ b/flexml/_feature_engineer.py @@ -33,16 +33,19 @@ class CategoricalTypeConverter(BaseEstimator, TransformerMixin): """ A transformer to convert categorical columns to 'category' dtype. Used for tree-based models that support native categorical features. + Supports ordered categories via ordinal_encode_map. """ - def __init__(self, categorical_columns: Optional[List[str]] = None): + def __init__(self, categorical_columns: Optional[List[str]] = None, ordinal_encode_map: Optional[Dict[str, List]] = None): self.categorical_columns = categorical_columns or [] + self.ordinal_encode_map = ordinal_encode_map or {} def fit(self, X, y=None): return self def transform(self, X): """ - Converts specified categorical columns to 'category' dtype + Converts specified categorical columns to 'category' dtype. + For columns in ordinal_encode_map, creates ordered categorical with specified order. Returns ------- @@ -52,7 +55,13 @@ def transform(self, X): X = X.copy() for col in self.categorical_columns: if col in X.columns: - X[col] = X[col].astype('category') + if col in self.ordinal_encode_map: + # Create ordered categorical with specified order + categories = self.ordinal_encode_map[col] + X[col] = pd.Categorical(X[col].astype(str), categories=categories, ordered=True) + else: + # Regular unordered categorical + X[col] = X[col].astype('category') return X @@ -74,6 +83,8 @@ def fit(self, X, y=None): return self def transform(self, X) -> pd.DataFrame: + X = X.copy() # Avoid modifying original data + # Categorical columns are converted to string categorical_cols = X.select_dtypes(exclude=['number']).columns X[categorical_cols] = X[categorical_cols].astype(str) @@ -131,9 +142,17 @@ def __init__( self.ordinal_encoders = {} def fit(self, X, y=None): - # Categorical columns are converted to string + X = X.copy() # Avoid modifying original data + + # First, convert all non-numeric columns to string (original behavior) categorical_cols = X.select_dtypes(exclude=['number']).columns X[categorical_cols] = X[categorical_cols].astype(str) + + # Also ensure columns in encoding_method_mapper are string + # (handles case where column is numeric but needs encoding) + for col in self.encoding_method_mapper.keys(): + if col in X.columns and col not in categorical_cols: + X[col] = X[col].astype(str) for col, method in self.encoding_method_mapper.items(): if method == "label_encoder": @@ -160,9 +179,16 @@ def fit(self, X, y=None): return self def transform(self, X) -> pd.DataFrame: - # Categorical columns are converted to string + X = X.copy() # Avoid modifying original data + + # First, convert all non-numeric columns to string (original behavior) categorical_cols = X.select_dtypes(exclude=['number']).columns X[categorical_cols] = X[categorical_cols].astype(str) + + # Also ensure columns in encoding_method_mapper are string + for col in self.encoding_method_mapper.keys(): + if col in X.columns and col not in categorical_cols: + X[col] = X[col].astype(str) for col, method in self.encoding_method_mapper.items(): if method == "label_encoder": diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 3609586..d32cb9b 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -1049,7 +1049,12 @@ def _get_preprocessing_steps(self, model_name: str) -> list: (name, step) for name, step in self.feature_engineer.pipeline.steps if name != 'encoder' ] - steps.append(('cat_type_converter', CategoricalTypeConverter(list(self.categorical_columns)))) + # Pass ordinal_encode_map to preserve category ordering for ordinal columns + ordinal_map = getattr(self.feature_engineer, 'ordinal_encode_map', None) or {} + steps.append(('cat_type_converter', CategoricalTypeConverter( + list(self.categorical_columns), + ordinal_encode_map=ordinal_map + ))) else: # Standard pipeline with encoder steps = list(self.feature_engineer.pipeline.steps) From 1a543fd9ffde947c31ede7e62a8820f01faa893a Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sun, 11 Jan 2026 23:51:12 +0300 Subject: [PATCH 09/14] Ensure pipeline consistency in save_model() --- flexml/structures/supervised_base.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index d32cb9b..915385f 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -938,7 +938,8 @@ def save_model( else: # If model is an object, we can't know its name, so we use its class name model_name = model.__class__.__name__ - # Handle full training scenario if required + + fitted_preprocessing_pipeline = None if full_train: already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) @@ -948,10 +949,10 @@ def save_model( self.feature_engineer.setup(data=self.data) # Get preprocessing pipeline for this model - preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) + fitted_preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) # Fit and transform data through the preprocessing pipeline - X_train_final = preprocessing_pipeline.fit_transform(self.X) + X_train_final = fitted_preprocessing_pipeline.fit_transform(self.X) y_train = self._encode_target(self.y) # Fit model @@ -978,8 +979,13 @@ def save_model( return model - # Build pipeline with proper handling for native categorical models - pipeline = self._get_model_pipeline(model, include_model=True) + if fitted_preprocessing_pipeline is not None: + # Combine the exact fitted preprocessing steps with the fitted model + steps = list(fitted_preprocessing_pipeline.steps) + [('model', model)] + pipeline = Pipeline(steps) + else: + # Model was already trained, get pipeline from feature_engineer + pipeline = self._get_model_pipeline(model, include_model=True) # Save the pipeline try: From 1a7d6d536133950577414220d5f44e470a0f5003 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Mon, 12 Jan 2026 00:02:21 +0300 Subject: [PATCH 10/14] Fill unseen categories to NaN --- flexml/_feature_engineer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/flexml/_feature_engineer.py b/flexml/_feature_engineer.py index b9ef060..975e918 100644 --- a/flexml/_feature_engineer.py +++ b/flexml/_feature_engineer.py @@ -56,9 +56,12 @@ def transform(self, X): for col in self.categorical_columns: if col in X.columns: if col in self.ordinal_encode_map: - # Create ordered categorical with specified order - categories = self.ordinal_encode_map[col] - X[col] = pd.Categorical(X[col].astype(str), categories=categories, ordered=True) + # Handle unseen categories by mapping them to NaN + categories = [str(c) for c in self.ordinal_encode_map[col]] + col_values = X[col].astype(str) + known_mask = col_values.isin(categories) + col_values = col_values.where(known_mask, other=np.nan) + X[col] = pd.Categorical(col_values, categories=categories, ordered=True) else: # Regular unordered categorical X[col] = X[col].astype('category') From ba6424b79849de964e2fbba882e6fce7ce99741f Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Mon, 12 Jan 2026 00:13:47 +0300 Subject: [PATCH 11/14] Improve categorical feature coverage --- flexml/_feature_engineer.py | 14 +++++++++----- tests/test_supervised.py | 20 +++++++++++++++++--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/flexml/_feature_engineer.py b/flexml/_feature_engineer.py index 975e918..92df339 100644 --- a/flexml/_feature_engineer.py +++ b/flexml/_feature_engineer.py @@ -36,8 +36,9 @@ class CategoricalTypeConverter(BaseEstimator, TransformerMixin): Supports ordered categories via ordinal_encode_map. """ def __init__(self, categorical_columns: Optional[List[str]] = None, ordinal_encode_map: Optional[Dict[str, List]] = None): - self.categorical_columns = categorical_columns or [] - self.ordinal_encode_map = ordinal_encode_map or {} + # Keep original values for sklearn clone compatibility + self.categorical_columns = categorical_columns + self.ordinal_encode_map = ordinal_encode_map def fit(self, X, y=None): return self @@ -53,11 +54,14 @@ def transform(self, X): A DataFrame with categorical columns converted to 'category' dtype """ X = X.copy() - for col in self.categorical_columns: + categorical_cols = self.categorical_columns or [] + ordinal_map = self.ordinal_encode_map or {} + + for col in categorical_cols: if col in X.columns: - if col in self.ordinal_encode_map: + if col in ordinal_map: # Handle unseen categories by mapping them to NaN - categories = [str(c) for c in self.ordinal_encode_map[col]] + categories = [str(c) for c in ordinal_map[col]] col_values = X[col].astype(str) known_mask = col_values.isin(categories) col_values = col_values.where(known_mask, other=np.nan) diff --git a/tests/test_supervised.py b/tests/test_supervised.py index 1cb787b..3fc4601 100644 --- a/tests/test_supervised.py +++ b/tests/test_supervised.py @@ -14,19 +14,33 @@ class TestRegression(unittest.TestCase): logger = get_logger(__name__, "TEST") logger.setLevel("DEBUG") + @staticmethod + def _add_synthetic_categorical_columns(df): + """Add synthetic categorical columns to test categorical encoding""" + n_rows = len(df) + np.random.seed(42) + + df['category_A'] = np.random.choice(['low', 'medium', 'high'], n_rows) + df['category_B'] = np.random.choice(['red', 'green', 'blue', 'yellow'], n_rows) + return df + test_config = { 'Regression': { - 'data': load_diabetes(as_frame=True)['frame'], + 'data': _add_synthetic_categorical_columns.__func__(load_diabetes(as_frame=True)['frame'].copy()), 'target_col': 'target', 'exp_obj': None }, 'BinaryClassification': { - 'data': load_breast_cancer(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'No', 1: 'Yes'})), + 'data': _add_synthetic_categorical_columns.__func__( + load_breast_cancer(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'No', 1: 'Yes'})).copy() + ), 'target_col': 'target', 'exp_obj': None }, 'MulticlassClassification': { - 'data': load_iris(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'Iris-Setosa', 1: 'Iris-Versicolor', 2: 'Iris-Virginica'})), + 'data': _add_synthetic_categorical_columns.__func__( + load_iris(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'Iris-Setosa', 1: 'Iris-Versicolor', 2: 'Iris-Virginica'})).copy() + ), 'target_col': 'target', 'exp_obj': None } From 9f9fdb8cf62d5c02a8628e24106bbcbeba9ca751 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 24 Jan 2026 23:08:21 +0300 Subject: [PATCH 12/14] Error fix / shap violin graph plot error due to category dtype --- flexml/helpers/plot_model_graphs.py | 4 ++ flexml/structures/supervised_base.py | 76 +++++++++++++++------------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/flexml/helpers/plot_model_graphs.py b/flexml/helpers/plot_model_graphs.py index 8c54235..333f044 100644 --- a/flexml/helpers/plot_model_graphs.py +++ b/flexml/helpers/plot_model_graphs.py @@ -410,6 +410,10 @@ def plot_shap( if shap_type == 'shap_summary': shap.summary_plot(shap_values, X_test) elif shap_type == 'shap_violin': + # While shap summary is okay with categorical columns, violin plot is not + cat_cols = X_test.select_dtypes(include=['category']).columns + for col in cat_cols: + X_test[col] = X_test[col].cat.codes shap.plots.violin(shap_values, X_test) else: return f"Invalid shap_type: {shap_type}" diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 915385f..35d4865 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -727,22 +727,22 @@ def start_experiment( preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) # Transform data using model-specific preprocessing - X_train_final = preprocessing_pipeline.fit_transform(X_train_raw) - X_test_final = preprocessing_pipeline.transform(X_test_raw) + X_train_processed = preprocessing_pipeline.fit_transform(X_train_raw) + X_test_processed = preprocessing_pipeline.transform(X_test_raw) try: all_metrics = [] all_times = [] t_start = time() - self._fit_model(model, X_train_final, y_train, model_name) + self._fit_model(model, X_train_processed, y_train, model_name) t_end = time() time_taken = round(t_end - t_start, 2) if self.__ML_TASK_TYPE == "Classification" and hasattr(model, 'predict_proba'): - y_pred = model.predict_proba(X_test_final) + y_pred = model.predict_proba(X_test_processed) else: - y_pred = model.predict(X_test_final) + y_pred = model.predict(X_test_processed) model_perf = evaluate_model_perf( self.__ML_TASK_TYPE, @@ -952,11 +952,11 @@ def save_model( fitted_preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) # Fit and transform data through the preprocessing pipeline - X_train_final = fitted_preprocessing_pipeline.fit_transform(self.X) + X_train_processed = fitted_preprocessing_pipeline.fit_transform(self.X) y_train = self._encode_target(self.y) # Fit model - self._fit_model(model, X_train_final, y_train, model_name) + self._fit_model(model, X_train_processed, y_train, model_name) # Update leaderboard for model_info in self.__model_training_info: @@ -1136,24 +1136,28 @@ def _predict_helper( preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) # Train model on full data if needed - if full_train: - already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) - if not already_trained: - self.__logger.info("Training the model using the whole data") - - # Fit and transform through preprocessing pipeline - X_train_final = preprocessing_pipeline.fit_transform(self.X) - y_train = self._encode_target(self.y) - self._fit_model(model, X_train_final, y_train, model_name) + already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) + + if full_train and not already_trained: + # Fit the pipeline on full training data for consistent transformations + X_train_processed = preprocessing_pipeline.fit_transform(self.X) + + self.__logger.info("Training the model using the whole data") + + y_train = self._encode_target(self.y) + self._fit_model(model, X_train_processed, y_train, model_name) - # Update leaderboard - for model_info in self.__model_training_info: - for name, info in model_info.items(): - if name == model_name: - info["model_stats"]["Full Train"] = True - info["model"] = model - break - self.get_best_models() + # Update leaderboard + for model_info in self.__model_training_info: + for name, info in model_info.items(): + if name == model_name: + info["model_stats"]["Full Train"] = True + info["model"] = model + break + self.get_best_models() + else: + # Just fit the preprocessing pipeline without retraining the model + preprocessing_pipeline.fit(self.X) # Transform test data through the same preprocessing pipeline X_test = preprocessing_pipeline.transform(test_data) @@ -1239,8 +1243,8 @@ def __add_holdout_model_to_stats(self, model: object, model_name: Optional[str] preprocessing_pipeline = self._get_model_pipeline(model_copy, include_model=False) # Transform holdout training data and fit model - X_train_final = preprocessing_pipeline.fit_transform(self.X_train_raw) - self._fit_model(model_copy, X_train_final, self.y_train, model_name) + X_train_processed = preprocessing_pipeline.fit_transform(self.X_train_raw) + self._fit_model(model_copy, X_train_processed, self.y_train, model_name) # Store the fitted preprocessing pipeline with the model for later use self._holdout_model_objects[model_name] = { @@ -1331,33 +1335,33 @@ def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_ preprocessing_pipeline = holdout_data['preprocessing_pipeline'] # Transform holdout data using the model's preprocessing pipeline - X_train_final = preprocessing_pipeline.transform(self.X_train_raw) - X_test_final = preprocessing_pipeline.transform(self.X_test_raw) + X_train_processed = preprocessing_pipeline.transform(self.X_train_raw) + X_test_processed = preprocessing_pipeline.transform(self.X_test_raw) # If kind expects predictions if kind in ["confusion_matrix"]: - preds = model.predict(X_test_final) + preds = model.predict(X_test_processed) elif kind in ["roc_curve", "calibration_curve"]: - preds = model.predict_proba(X_test_final) + preds = model.predict_proba(X_test_processed) graph = None if kind == "feature_importance": - if not hasattr(self, 'feature_names'): - self.feature_names = list(self.X_train_raw.columns) - graph = plot_feature_importance(model, self.feature_names, **kwargs) + # Use feature names from transformed data (accounts for encoding) + feature_names = list(X_train_processed.columns) if hasattr(X_train_processed, 'columns') else None + graph = plot_feature_importance(model, feature_names, **kwargs) elif kind == "confusion_matrix": graph = plot_confusion_matrix(self.y_test, preds, self.y_class_mapping, **kwargs) elif kind == "roc_curve": graph = plot_roc_curve(self.y_test, preds, self.y_class_mapping, **kwargs) elif kind == "residuals": - graph = plot_residuals(model, X_train_final, self.y_train, X_test_final, self.y_test, **kwargs) + graph = plot_residuals(model, X_train_processed, self.y_train, X_test_processed, self.y_test, **kwargs) elif kind == "prediction_error": - graph = plot_prediction_error(model, X_train_final, self.y_train, X_test_final, self.y_test, **kwargs) + graph = plot_prediction_error(model, X_train_processed, self.y_train, X_test_processed, self.y_test, **kwargs) elif kind == "calibration_curve": graph = plot_calibration_curve(self.y_test, preds, self.y_class_mapping, **kwargs) elif 'shap' in kind: - graph = plot_shap(model, X_test_final, kind, **kwargs) + graph = plot_shap(model, X_test_processed, kind, **kwargs) else: error_msg = f"Invalid plot type: {kind}. Available plot types: {available_plot_types}" self.__logger.error(error_msg) From 6b9d90370e7e448a19a907686e12ed99b76f0907 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 24 Jan 2026 23:36:57 +0300 Subject: [PATCH 13/14] Add categorical feature support to HistGradientBoosting models --- flexml/config/ml_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flexml/config/ml_models.py b/flexml/config/ml_models.py index 752572e..8f37df2 100644 --- a/flexml/config/ml_models.py +++ b/flexml/config/ml_models.py @@ -66,7 +66,7 @@ def get_ml_models( KNN_REGRESSION = KNeighborsRegressor(n_jobs=n_jobs) BAYESIAN_RIDGE_REGRESSION = BayesianRidge() ADA_BOOST_REGRESSION = AdaBoostRegressor(random_state=random_state) - HIST_GRADIENT_BOOSTING_REGRESSION = HistGradientBoostingRegressor(random_state=random_state) + HIST_GRADIENT_BOOSTING_REGRESSION = HistGradientBoostingRegressor(random_state=random_state, categorical_features="from_dtype") GRADIENT_BOOSTING_REGRESSION = GradientBoostingRegressor(random_state=random_state) RANDOM_FOREST_REGRESSION = RandomForestRegressor(random_state=random_state, n_jobs=n_jobs) EXTRA_TREES_REGRESSION = ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs) @@ -318,7 +318,7 @@ def get_ml_models( # Wide Classification Models ADA_BOOST_CLASSIFIER = AdaBoostClassifier(random_state=random_state) - HIST_GRADIENT_BOOSTING_CLASSIFIER = HistGradientBoostingClassifier(random_state=random_state) + HIST_GRADIENT_BOOSTING_CLASSIFIER = HistGradientBoostingClassifier(random_state=random_state, categorical_features="from_dtype") GRADIENT_BOOSTING_CLASSIFIER = GradientBoostingClassifier(random_state=random_state) EXTRA_TREES_CLASSIFIER = ExtraTreesClassifier(random_state=random_state, n_jobs=n_jobs) QDA_CLASSIFIER = QuadraticDiscriminantAnalysis() From 81e727a675e3e157393afa11f428b09ce33f3c17 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 24 Jan 2026 23:58:27 +0300 Subject: [PATCH 14/14] Add better approach for tree based model detection to plot_shap() --- flexml/helpers/plot_model_graphs.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/flexml/helpers/plot_model_graphs.py b/flexml/helpers/plot_model_graphs.py index 333f044..d233225 100644 --- a/flexml/helpers/plot_model_graphs.py +++ b/flexml/helpers/plot_model_graphs.py @@ -383,15 +383,8 @@ def plot_shap( or an error message if an error occurs during the process. """ try: - # Check if model is a tree-based model - model_type = str(type(model)) - - tree_based_models = [ - "RandomForest", "GradientBoosting", "AdaBoost", - "HistGradientBoosting", "DecisionTree", "ExtraTrees", - "XGB", "CatBoost", "LGBM" - ] - is_tree_based = any(model_name in model_type for model_name in tree_based_models) + # Check if the model is tree-based + is_tree_based = hasattr(model, 'feature_importances_') if is_tree_based: explainer = shap.TreeExplainer(model)