diff --git a/flexml/_feature_engineer.py b/flexml/_feature_engineer.py index f2d8897..92df339 100644 --- a/flexml/_feature_engineer.py +++ b/flexml/_feature_engineer.py @@ -27,6 +27,49 @@ def transform(self, X): A DataFrame with the specified columns dropped """ return X.drop(columns=self.drop_columns, axis=1, errors='ignore') + + +class CategoricalTypeConverter(BaseEstimator, TransformerMixin): + """ + A transformer to convert categorical columns to 'category' dtype. + Used for tree-based models that support native categorical features. + Supports ordered categories via ordinal_encode_map. + """ + def __init__(self, categorical_columns: Optional[List[str]] = None, ordinal_encode_map: Optional[Dict[str, List]] = None): + # Keep original values for sklearn clone compatibility + self.categorical_columns = categorical_columns + self.ordinal_encode_map = ordinal_encode_map + + def fit(self, X, y=None): + return self + + def transform(self, X): + """ + Converts specified categorical columns to 'category' dtype. + For columns in ordinal_encode_map, creates ordered categorical with specified order. + + Returns + ------- + pd.DataFrame + A DataFrame with categorical columns converted to 'category' dtype + """ + X = X.copy() + categorical_cols = self.categorical_columns or [] + ordinal_map = self.ordinal_encode_map or {} + + for col in categorical_cols: + if col in X.columns: + if col in ordinal_map: + # Handle unseen categories by mapping them to NaN + categories = [str(c) for c in ordinal_map[col]] + col_values = X[col].astype(str) + known_mask = col_values.isin(categories) + col_values = col_values.where(known_mask, other=np.nan) + X[col] = pd.Categorical(col_values, categories=categories, ordered=True) + else: + # Regular unordered categorical + X[col] = X[col].astype('category') + return X class ColumnImputer(BaseEstimator, TransformerMixin): @@ -47,6 +90,8 @@ def fit(self, X, y=None): return self def transform(self, X) -> pd.DataFrame: + X = X.copy() # Avoid modifying original data + # Categorical columns are converted to string categorical_cols = X.select_dtypes(exclude=['number']).columns X[categorical_cols] = X[categorical_cols].astype(str) @@ -104,9 +149,17 @@ def __init__( self.ordinal_encoders = {} def fit(self, X, y=None): - # Categorical columns are converted to string + X = X.copy() # Avoid modifying original data + + # First, convert all non-numeric columns to string (original behavior) categorical_cols = X.select_dtypes(exclude=['number']).columns X[categorical_cols] = X[categorical_cols].astype(str) + + # Also ensure columns in encoding_method_mapper are string + # (handles case where column is numeric but needs encoding) + for col in self.encoding_method_mapper.keys(): + if col in X.columns and col not in categorical_cols: + X[col] = X[col].astype(str) for col, method in self.encoding_method_mapper.items(): if method == "label_encoder": @@ -133,9 +186,16 @@ def fit(self, X, y=None): return self def transform(self, X) -> pd.DataFrame: - # Categorical columns are converted to string + X = X.copy() # Avoid modifying original data + + # First, convert all non-numeric columns to string (original behavior) categorical_cols = X.select_dtypes(exclude=['number']).columns X[categorical_cols] = X[categorical_cols].astype(str) + + # Also ensure columns in encoding_method_mapper are string + for col in self.encoding_method_mapper.keys(): + if col in X.columns and col not in categorical_cols: + X[col] = X[col].astype(str) for col, method in self.encoding_method_mapper.items(): if method == "label_encoder": diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py index 6153719..a8bc0cd 100644 --- a/flexml/_model_tuner.py +++ b/flexml/_model_tuner.py @@ -198,8 +198,6 @@ def _setup_tuning( * 'tuned_model_evaluation_metric': The evaluation metric that is used to evaluate the tuned model """ - model_params = None - if isinstance(model, Pipeline): model = model.named_steps['model'] diff --git a/flexml/config/__init__.py b/flexml/config/__init__.py index f56a684..2346f85 100644 --- a/flexml/config/__init__.py +++ b/flexml/config/__init__.py @@ -3,6 +3,7 @@ ) from flexml.config.supervised_config import ( + NATIVE_CATEGORICAL_MODELS, EVALUATION_METRICS, TUNING_METRIC_TRANSFORMATIONS, CROSS_VALIDATION_METHODS, diff --git a/flexml/config/ml_models.py b/flexml/config/ml_models.py index 9e4147e..8f37df2 100644 --- a/flexml/config/ml_models.py +++ b/flexml/config/ml_models.py @@ -66,7 +66,7 @@ def get_ml_models( KNN_REGRESSION = KNeighborsRegressor(n_jobs=n_jobs) BAYESIAN_RIDGE_REGRESSION = BayesianRidge() ADA_BOOST_REGRESSION = AdaBoostRegressor(random_state=random_state) - HIST_GRADIENT_BOOSTING_REGRESSION = HistGradientBoostingRegressor(random_state=random_state) + HIST_GRADIENT_BOOSTING_REGRESSION = HistGradientBoostingRegressor(random_state=random_state, categorical_features="from_dtype") GRADIENT_BOOSTING_REGRESSION = GradientBoostingRegressor(random_state=random_state) RANDOM_FOREST_REGRESSION = RandomForestRegressor(random_state=random_state, n_jobs=n_jobs) EXTRA_TREES_REGRESSION = ExtraTreesRegressor(random_state=random_state, n_jobs=n_jobs) @@ -308,8 +308,8 @@ def get_ml_models( # Quick Classification Models LOGISTIC_REGRESSION = LogisticRegression(max_iter=1000, random_state=random_state, n_jobs=n_jobs) - XGBOOST_CLASSIFIER = XGBClassifier(objective=xgb_objective, random_state=random_state, n_jobs=n_jobs) - LIGHTGBM_CLASSIFIER = LGBMClassifier(verbose=-1, random_state=random_state, n_jobs=n_jobs) + XGBOOST_CLASSIFIER = XGBClassifier(enable_categorical=True, objective=xgb_objective, random_state=random_state, n_jobs=n_jobs) + LIGHTGBM_CLASSIFIER = LGBMClassifier(enable_categorical=True, verbose=-1, random_state=random_state, n_jobs=n_jobs) CATBOOST_CLASSIFIER = CatBoostClassifier(allow_writing_files=False, silent=True, random_seed=random_state, thread_count=n_jobs) DECISION_TREE_CLASSIFIER = DecisionTreeClassifier(random_state=random_state) RANDOM_FOREST_CLASSIFIER = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs) @@ -318,7 +318,7 @@ def get_ml_models( # Wide Classification Models ADA_BOOST_CLASSIFIER = AdaBoostClassifier(random_state=random_state) - HIST_GRADIENT_BOOSTING_CLASSIFIER = HistGradientBoostingClassifier(random_state=random_state) + HIST_GRADIENT_BOOSTING_CLASSIFIER = HistGradientBoostingClassifier(random_state=random_state, categorical_features="from_dtype") GRADIENT_BOOSTING_CLASSIFIER = GradientBoostingClassifier(random_state=random_state) EXTRA_TREES_CLASSIFIER = ExtraTreesClassifier(random_state=random_state, n_jobs=n_jobs) QDA_CLASSIFIER = QuadraticDiscriminantAnalysis() diff --git a/flexml/config/supervised_config.py b/flexml/config/supervised_config.py index 9c61dca..25b066e 100644 --- a/flexml/config/supervised_config.py +++ b/flexml/config/supervised_config.py @@ -1,3 +1,11 @@ +# Models that support native categorical features +NATIVE_CATEGORICAL_MODELS = { + 'CatBoostRegressor', 'CatBoostClassifier', + 'LGBMRegressor', 'LGBMClassifier', + 'XGBRegressor', 'XGBClassifier', + 'HistGradientBoostingRegressor', 'HistGradientBoostingClassifier' +} + # Regression & Classification Evaluation Metrics EVALUATION_METRICS = { "Regression": {"DEFAULT": "R2", diff --git a/flexml/helpers/plot_model_graphs.py b/flexml/helpers/plot_model_graphs.py index 8c54235..d233225 100644 --- a/flexml/helpers/plot_model_graphs.py +++ b/flexml/helpers/plot_model_graphs.py @@ -383,15 +383,8 @@ def plot_shap( or an error message if an error occurs during the process. """ try: - # Check if model is a tree-based model - model_type = str(type(model)) - - tree_based_models = [ - "RandomForest", "GradientBoosting", "AdaBoost", - "HistGradientBoosting", "DecisionTree", "ExtraTrees", - "XGB", "CatBoost", "LGBM" - ] - is_tree_based = any(model_name in model_type for model_name in tree_based_models) + # Check if the model is tree-based + is_tree_based = hasattr(model, 'feature_importances_') if is_tree_based: explainer = shap.TreeExplainer(model) @@ -410,6 +403,10 @@ def plot_shap( if shap_type == 'shap_summary': shap.summary_plot(shap_values, X_test) elif shap_type == 'shap_violin': + # While shap summary is okay with categorical columns, violin plot is not + cat_cols = X_test.select_dtypes(include=['category']).columns + for col in cat_cols: + X_test[col] = X_test[col].cat.codes shap.plots.violin(shap_values, X_test) else: return f"Invalid shap_type: {shap_type}" diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 78e7c82..35d4865 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -12,6 +12,7 @@ from flexml.logger import get_logger from flexml.config import ( get_ml_models, + NATIVE_CATEGORICAL_MODELS, EVALUATION_METRICS, CROSS_VALIDATION_METHODS, PLOT_TYPES @@ -34,7 +35,7 @@ ) from flexml.structures.custom_score import CustomScore from flexml._model_tuner import ModelTuner -from flexml._feature_engineer import FeatureEngineering +from flexml._feature_engineer import FeatureEngineering, CategoricalTypeConverter import warnings warnings.filterwarnings("ignore") @@ -315,20 +316,20 @@ def __prepare_holdout_data(self, test_size: Optional[float] = None): )[0] train_labels, test_labels = holdout_cv_splits[0], holdout_cv_splits[1] - train_data = pd.concat([ - self.X.loc[train_labels], - self.y.loc[train_labels] - ], axis=1) - test_data = pd.concat([ - self.X.loc[test_labels], - self.y.loc[test_labels] - ], axis=1) - + # Setup feature engineer with train data + train_data = pd.concat([self.X.loc[train_labels], self.y.loc[train_labels]], axis=1) self.feature_engineer.setup(data=train_data) - - self.X_train, self.y_train = self.feature_engineer.fit_transform() - self.X_test, self.y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) - self.feature_names = list(self.X_train.columns) + self.categorical_columns = self.feature_engineer.categorical_columns + + # Store raw holdout data (use X/y directly instead of concat→drop) + self.X_train_raw = self.X.loc[train_labels] + self.X_test_raw = self.X.loc[test_labels] + self.y_train, self.y_test = self._encode_target( + self.y.loc[train_labels], + self.y.loc[test_labels] + ) + + self.feature_names = list(self.X_train_raw.columns) self.y_class_mapping = self.feature_engineer.y_class_mapping def __prepare_models(self, experiment_size: str, num_class: int, random_state: Optional[int] = None, n_jobs: Optional[int] = -1): @@ -380,6 +381,91 @@ def __top_n_models_checker(self, top_n_models: Optional[int]) -> int: return top_n_models + def _fit_model( + self, + model: object, + X: pd.DataFrame, + y: pd.Series, + model_name: Optional[str] = None + ): + """ + Fits a model with proper categorical feature handling. + Passes cat_features to CatBoost models for native categorical support. + + Parameters + ---------- + model : object + The model to fit + X : pd.DataFrame + The feature data + y : pd.Series + The target data + model_name : Optional[str] + The name of the model (if None, uses model's class name) + """ + if model_name is None: + model_name = model.__class__.__name__ + + if 'CatBoost' in model_name and hasattr(self, 'categorical_columns') and self.categorical_columns: + # check if model is fitted: + if not model.is_fitted(): + model.set_params(cat_features=list(self.categorical_columns)) + model.fit(X, y) + else: + model.fit(X, y) + + def _encode_target( + self, + y: pd.Series, + y_test: Optional[pd.Series] = None, + fit: bool = True + ) -> Union[pd.Series, tuple]: + """ + Encodes the target variable for classification tasks. + + Parameters + ---------- + y : pd.Series + The target variable to encode + y_test : pd.Series, optional + Test target to transform (uses already fitted encoder) + fit : bool + If True, fits the encoder on y. If False, only transforms. + + Returns + ------- + pd.Series or tuple + Encoded y, or (encoded_y, encoded_y_test) if y_test provided + """ + # Skip encoding for regression or already numeric targets + if self.__ML_TASK_TYPE != 'Classification' or y.dtype not in ['object', 'category']: + return (y, y_test) if y_test is not None else y + + # Encode y + if fit: + encoded_y = pd.Series( + self.feature_engineer.target_encoder.fit_transform(y), + name=y.name, + index=y.index + ) + else: + encoded_y = pd.Series( + self.feature_engineer.target_encoder.transform(y), + name=y.name, + index=y.index + ) + + # Encode y_test if provided + if y_test is not None: + encoded_y_test = pd.Series( + self.feature_engineer.target_encoder.transform(y_test), + name=y_test.name, + index=y_test.index + ) + return encoded_y, encoded_y_test + + return encoded_y + def __process_experiment_result(self, experiment_stats: dict): """ Processes and aggregates the results of an experiment, calculating average metrics and selecting the best model. @@ -615,19 +701,17 @@ def start_experiment( train_labels = train_idx test_labels = test_idx - train_data = pd.concat([ - self.X.loc[train_labels], - self.y.loc[train_labels] - ], axis=1) - test_data = pd.concat([ - self.X.loc[test_labels], - self.y.loc[test_labels] - ], axis=1) - + # Setup feature engineer with train data + train_data = pd.concat([self.X.loc[train_labels], self.y.loc[train_labels]], axis=1) self.feature_engineer.setup(data=train_data) - X_train, y_train = self.feature_engineer.fit_transform() - X_test, y_test = self.feature_engineer.transform(test_data=test_data, y_included=True) + # Use X/y directly instead of concat→drop + X_train_raw = self.X.loc[train_labels] + X_test_raw = self.X.loc[test_labels] + y_train, y_test = self._encode_target( + self.y.loc[train_labels], + self.y.loc[test_labels] + ) for model_idx in range(len(self.__ML_MODELS)): model_info = self.__ML_MODELS[model_idx] @@ -638,19 +722,27 @@ def start_experiment( continue # Skip already trained or raised error models model = model_info['model'] + + # Get preprocessing pipeline for this specific model + preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) + + # Transform data using model-specific preprocessing + X_train_processed = preprocessing_pipeline.fit_transform(X_train_raw) + X_test_processed = preprocessing_pipeline.transform(X_test_raw) + try: all_metrics = [] all_times = [] t_start = time() - model.fit(X_train, y_train) + self._fit_model(model, X_train_processed, y_train, model_name) t_end = time() time_taken = round(t_end - t_start, 2) if self.__ML_TASK_TYPE == "Classification" and hasattr(model, 'predict_proba'): - y_pred = model.predict_proba(X_test) + y_pred = model.predict_proba(X_test_processed) else: - y_pred = model.predict(X_test) + y_pred = model.predict(X_test_processed) model_perf = evaluate_model_perf( self.__ML_TASK_TYPE, @@ -845,32 +937,34 @@ def save_model( raise ValueError(error_msg) else: # If model is an object, we can't know its name, so we use its class name model_name = model.__class__.__name__ - - # Initialize pipeline steps - pipeline_steps = [] - # Initialize and setup feature engineering if needed - if not model_only: - # Add the feature engineering pipeline directly - pipeline_steps.extend(self.feature_engineer.pipeline.steps) - # Handle full training scenario if required + fitted_preprocessing_pipeline = None if full_train: already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) + if not already_trained: self.__logger.info("Training the model using the whole data") + self.feature_engineer.setup(data=self.data) - X_train, y_train = self.feature_engineer.fit_transform() - model.fit(X_train, y_train) + + # Get preprocessing pipeline for this model + fitted_preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) + + # Fit and transform data through the preprocessing pipeline + X_train_processed = fitted_preprocessing_pipeline.fit_transform(self.X) + y_train = self._encode_target(self.y) + + # Fit model + self._fit_model(model, X_train_processed, y_train, model_name) - # find the model in leaderboard and update the full_train to True, and update the model object in there + # Update leaderboard for model_info in self.__model_training_info: for name, info in model_info.items(): if name == model_name: info["model_stats"]["Full Train"] = True info["model"] = model break - # Update leaderboard self.get_best_models() # If no feature pipeline is included, return the model directly @@ -885,11 +979,13 @@ def save_model( return model - # Add the model to the pipeline - pipeline_steps.append(('model', model)) - - # Create the pipeline - pipeline = Pipeline(pipeline_steps) + if fitted_preprocessing_pipeline is not None: + # Combine the exact fitted preprocessing steps with the fitted model + steps = list(fitted_preprocessing_pipeline.steps) + [('model', model)] + pipeline = Pipeline(steps) + else: + # Model was already trained, get pipeline from feature_engineer + pipeline = self._get_model_pipeline(model, include_model=True) # Save the pipeline try: @@ -928,6 +1024,73 @@ def _check_if_model_is_full_trained(self, model_name: str, model_taken_from_lead return True return False + def _is_native_categorical_model(self, model_name: str) -> bool: + """Check if model supports native categorical features.""" + return ( + model_name in NATIVE_CATEGORICAL_MODELS and + hasattr(self, 'categorical_columns') and + len(self.categorical_columns) > 0 + ) + + def _get_preprocessing_steps(self, model_name: str) -> list: + """ + Returns the appropriate preprocessing steps for a given model. + + For native categorical models: no encoder, uses CategoricalTypeConverter + For other models: includes encoder + + Parameters + ---------- + model_name : str + The name of the model + + Returns + ------- + list + List of preprocessing steps as (name, transformer) tuples + """ + if self._is_native_categorical_model(model_name): + # Pipeline without encoder, with CategoricalTypeConverter + steps = [ + (name, step) for name, step in self.feature_engineer.pipeline.steps + if name != 'encoder' + ] + # Pass ordinal_encode_map to preserve category ordering for ordinal columns + ordinal_map = getattr(self.feature_engineer, 'ordinal_encode_map', None) or {} + steps.append(('cat_type_converter', CategoricalTypeConverter( + list(self.categorical_columns), + ordinal_encode_map=ordinal_map + ))) + else: + # Standard pipeline with encoder + steps = list(self.feature_engineer.pipeline.steps) + + return steps + + def _get_model_pipeline(self, model, include_model: bool = True) -> Pipeline: + """ + Returns a complete Pipeline for a given model. + + Parameters + ---------- + model : object + The model object + include_model : bool, optional + Whether to include the model as the last step (default: True) + + Returns + ------- + Pipeline + sklearn Pipeline with preprocessing steps (and optionally the model) + """ + model_name = model.__class__.__name__ + steps = self._get_preprocessing_steps(model_name) + + if include_model: + steps.append(('model', model)) + + return Pipeline(steps) + def _predict_helper( self, test_data: pd.DataFrame, @@ -951,8 +1114,8 @@ def _predict_helper( if extra: error_msg += f" Extra: {extra}." raise ValueError(error_msg) - model_taken_from_leaderboard = False # If the model object is from leaderboard, track this - + # Get model from leaderboard or use provided model + model_taken_from_leaderboard = False if model is None: model = self.get_best_models() model_name = self.__last_searched_model_name @@ -965,32 +1128,39 @@ def _predict_helper( model_name = model model = self.get_model_by_name(model) model_taken_from_leaderboard = True - else: # If model is an object, we can't know its name, so we use its class name + else: model_name = model.__class__.__name__ - # Prepare training data if needed - if full_train: - # Check If model_taken_from_leaderboard is True and Full Train in self.__model_training_info is True, then we don't need to train the model again - already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) - if not already_trained: - self.__logger.info("Training the model using the whole data") - self.feature_engineer.setup(data=self.data) - X_train, y_train = self.feature_engineer.fit_transform() - model.fit(X_train, y_train) - - # find the model in leaderboard and update the full_train to True, and update the model object in there - for model_info in self.__model_training_info: - for name, info in model_info.items(): - if name == model_name: - info["model_stats"]["Full Train"] = True - info["model"] = model - break - # Update leaderboard - self.get_best_models() - X_test = self.feature_engineer.transform(test_data) + # Get the preprocessing pipeline for this model (consistent with save_model) + self.feature_engineer.setup(data=self.data) + preprocessing_pipeline = self._get_model_pipeline(model, include_model=False) + + # Train model on full data if needed + already_trained = self._check_if_model_is_full_trained(model_name, model_taken_from_leaderboard) + + if full_train and not already_trained: + # Fit the pipeline on full training data for consistent transformations + X_train_processed = preprocessing_pipeline.fit_transform(self.X) + + self.__logger.info("Training the model using the whole data") + + y_train = self._encode_target(self.y) + self._fit_model(model, X_train_processed, y_train, model_name) + # Update leaderboard + for model_info in self.__model_training_info: + for name, info in model_info.items(): + if name == model_name: + info["model_stats"]["Full Train"] = True + info["model"] = model + break + self.get_best_models() else: - X_test = self.feature_engineer.transform(test_data) + # Just fit the preprocessing pipeline without retraining the model + preprocessing_pipeline.fit(self.X) + + # Transform test data through the same preprocessing pipeline + X_test = preprocessing_pipeline.transform(test_data) return model, X_test @@ -1057,7 +1227,8 @@ def predict_proba( model, X_test = self._predict_helper(test_data, model, full_train) return model.predict_proba(X_test) - def __get_holdout_model_from_stats(self, model_name: str) -> object: + def __get_holdout_model_from_stats(self, model_name: str) -> Optional[dict]: + """Returns dict with 'model' and 'preprocessing_pipeline' keys, or None.""" if self._holdout_model_objects is None or self._holdout_model_objects == {}: return None return self._holdout_model_objects.get(model_name) @@ -1067,8 +1238,19 @@ def __add_holdout_model_to_stats(self, model: object, model_name: Optional[str] model_name = model.__class__.__name__ model_copy = deepcopy(model) - model_copy.fit(self.X_train, self.y_train) - self._holdout_model_objects[model_name] = model_copy + + # Get preprocessing pipeline for this model + preprocessing_pipeline = self._get_model_pipeline(model_copy, include_model=False) + + # Transform holdout training data and fit model + X_train_processed = preprocessing_pipeline.fit_transform(self.X_train_raw) + self._fit_model(model_copy, X_train_processed, self.y_train, model_name) + + # Store the fitted preprocessing pipeline with the model for later use + self._holdout_model_objects[model_name] = { + 'model': model_copy, + 'preprocessing_pipeline': preprocessing_pipeline + } return model_copy def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_importance", **kwargs): @@ -1139,38 +1321,47 @@ def plot(self, model: Optional[Union[str, object]] = None, kind: str = "feature_ elif isinstance(model, str): model_name = model model = self.get_model_by_name(model) - else: # If model is an object, we can't know its name, so we use its class name + else: model_name = model.__class__.__name__ - if self.__get_holdout_model_from_stats(model_name) is not None: - model = self.__get_holdout_model_from_stats(model_name) + # Get or create holdout model with its preprocessing pipeline + holdout_data = self.__get_holdout_model_from_stats(model_name) + if holdout_data is not None: + model = holdout_data['model'] + preprocessing_pipeline = holdout_data['preprocessing_pipeline'] else: model = self.__add_holdout_model_to_stats(model, model_name) + holdout_data = self.__get_holdout_model_from_stats(model_name) + preprocessing_pipeline = holdout_data['preprocessing_pipeline'] + + # Transform holdout data using the model's preprocessing pipeline + X_train_processed = preprocessing_pipeline.transform(self.X_train_raw) + X_test_processed = preprocessing_pipeline.transform(self.X_test_raw) - # If kind expects predictions + # If kind expects predictions if kind in ["confusion_matrix"]: - preds = model.predict(self.X_test) + preds = model.predict(X_test_processed) elif kind in ["roc_curve", "calibration_curve"]: - preds = model.predict_proba(self.X_test) + preds = model.predict_proba(X_test_processed) graph = None if kind == "feature_importance": - if not hasattr(self, 'feature_names'): - self.feature_names = list(self.X_train.columns) - graph = plot_feature_importance(model, self.feature_names, **kwargs) + # Use feature names from transformed data (accounts for encoding) + feature_names = list(X_train_processed.columns) if hasattr(X_train_processed, 'columns') else None + graph = plot_feature_importance(model, feature_names, **kwargs) elif kind == "confusion_matrix": graph = plot_confusion_matrix(self.y_test, preds, self.y_class_mapping, **kwargs) elif kind == "roc_curve": graph = plot_roc_curve(self.y_test, preds, self.y_class_mapping, **kwargs) elif kind == "residuals": - graph = plot_residuals(model, self.X_train, self.y_train, self.X_test, self.y_test, **kwargs) + graph = plot_residuals(model, X_train_processed, self.y_train, X_test_processed, self.y_test, **kwargs) elif kind == "prediction_error": - graph = plot_prediction_error(model, self.X_train, self.y_train, self.X_test, self.y_test, **kwargs) + graph = plot_prediction_error(model, X_train_processed, self.y_train, X_test_processed, self.y_test, **kwargs) elif kind == "calibration_curve": graph = plot_calibration_curve(self.y_test, preds, self.y_class_mapping, **kwargs) elif 'shap' in kind: - graph = plot_shap(model, self.X_test, kind, **kwargs) + graph = plot_shap(model, X_test_processed, kind, **kwargs) else: error_msg = f"Invalid plot type: {kind}. Available plot types: {available_plot_types}" self.__logger.error(error_msg) @@ -1589,17 +1780,13 @@ def _show_tuning_report(tuning_report: Optional[dict] = None): logging_to_file=self.logging_to_file )) - # Create the ModelTuner object If It's not created before, avoid creating it everytime tune_model() function is called + # Create the ModelTuner object If It's not created before if not hasattr(self, 'model_tuner'): - if self.__ML_TASK_TYPE == 'Classification' and self.y.dtype in ['object', 'category']: - y_encoded = pd.Series(self.feature_engineer.target_encoder.fit_transform(self.y), name=self.target_col) - y_encoded.index = self.y.index - else: - y_encoded = self.y # No need to encode the target for regression or if the target is already encoded + y_encoded = self._encode_target(self.y) self.model_tuner = ModelTuner(self.__ML_TASK_TYPE, self.X, y_encoded, self.logging_to_file) - pipeline = self.feature_engineer.pipeline - pipeline = Pipeline(steps=pipeline.steps + [('model', model)]) + # Build pipeline with proper handling for native categorical models + pipeline = self._get_model_pipeline(model, include_model=True) self.__logger.info(f"[PROCESS] Model Tuning process started with '{tuning_method}' method") tuning_method = tuning_method.lower() diff --git a/tests/test_supervised.py b/tests/test_supervised.py index 992760a..3fc4601 100644 --- a/tests/test_supervised.py +++ b/tests/test_supervised.py @@ -14,19 +14,33 @@ class TestRegression(unittest.TestCase): logger = get_logger(__name__, "TEST") logger.setLevel("DEBUG") + @staticmethod + def _add_synthetic_categorical_columns(df): + """Add synthetic categorical columns to test categorical encoding""" + n_rows = len(df) + np.random.seed(42) + + df['category_A'] = np.random.choice(['low', 'medium', 'high'], n_rows) + df['category_B'] = np.random.choice(['red', 'green', 'blue', 'yellow'], n_rows) + return df + test_config = { 'Regression': { - 'data': load_diabetes(as_frame=True)['frame'], + 'data': _add_synthetic_categorical_columns.__func__(load_diabetes(as_frame=True)['frame'].copy()), 'target_col': 'target', 'exp_obj': None }, 'BinaryClassification': { - 'data': load_breast_cancer(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'No', 1: 'Yes'})), + 'data': _add_synthetic_categorical_columns.__func__( + load_breast_cancer(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'No', 1: 'Yes'})).copy() + ), 'target_col': 'target', 'exp_obj': None }, 'MulticlassClassification': { - 'data': load_iris(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'Iris-Setosa', 1: 'Iris-Versicolor', 2: 'Iris-Virginica'})), + 'data': _add_synthetic_categorical_columns.__func__( + load_iris(as_frame=True)['frame'].assign(target=lambda df: df['target'].map({0: 'Iris-Setosa', 1: 'Iris-Versicolor', 2: 'Iris-Virginica'})).copy() + ), 'target_col': 'target', 'exp_obj': None } @@ -266,4 +280,48 @@ def test_25_plot_multiclass_classification_shap_summary(self): def test_26_plot_multiclass_classification_shap_violin(self): exp_obj = self.test_config['MulticlassClassification']['exp_obj'] - exp_obj.plot("RandomForestClassifier", kind="shap_violin") \ No newline at end of file + exp_obj.plot("RandomForestClassifier", kind="shap_violin") + + def test_27_native_categorical_pipeline_consistency(self): + """Test that saved pipeline predictions match exp.predict_proba() for native categorical models""" + exp_obj = self.test_config['BinaryClassification']['exp_obj'] + test_data = self.test_config['BinaryClassification'].get('data').drop(columns=['target']) + + # Get predictions via FlexML (this trains with full data) + flexml_probs = exp_obj.predict_proba(test_data, model='LGBMClassifier', full_train=True) + + # Save pipeline (should use already trained model, no retraining) + save_path = "test_native_cat_pipeline.pkl" + exp_obj.save_model(model='LGBMClassifier', save_path=save_path, model_only=False, full_train=True) + + # Load and predict via pipeline + with open(save_path, 'rb') as f: + loaded_pipeline = pickle.load(f) + pipeline_probs = loaded_pipeline.predict_proba(test_data) + + # Predictions should match + np.testing.assert_array_almost_equal(flexml_probs, pipeline_probs, decimal=5, + err_msg="Loaded pipeline predictions don't match FlexML predictions") + os.remove(save_path) + + def test_28_predict_column_mismatch_error(self): + """Test that predict raises proper error for column mismatch""" + exp_obj = self.test_config['Regression']['exp_obj'] + test_data = self.test_config['Regression'].get('data').drop(columns=['target']) + + # Remove a column to create mismatch + bad_data = test_data.drop(columns=[test_data.columns[0]]) + + with self.assertRaises(ValueError) as context: + exp_obj.predict(bad_data, full_train=False) + + self.assertIn("Missing", str(context.exception)) + + def test_29_get_model_by_invalid_name(self): + """Test get_model_by_name raises error for invalid model name""" + exp_obj = self.test_config['Regression']['exp_obj'] + + with self.assertRaises(ValueError) as context: + exp_obj.get_model_by_name("NonExistentModel") + + self.assertIn("not found", str(context.exception)) \ No newline at end of file