From 738a4450ca6c690a7ab0ddc6c13f51c829b1c018 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 17:44:16 +0300 Subject: [PATCH 1/8] Added custom eval metric feature --- flexml/_model_tuner.py | 137 +++++--- flexml/helpers/supervised_helpers.py | 67 +++- flexml/helpers/validators.py | 18 +- flexml/structures/custom_score.py | 53 +++ flexml/structures/supervised_base.py | 124 +++++-- tests/test_custom_metrics.py | 506 +++++++++++++++++++++++++++ 6 files changed, 826 insertions(+), 79 deletions(-) create mode 100644 flexml/structures/custom_score.py create mode 100644 tests/test_custom_metrics.py diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py index a874fc1..6153719 100644 --- a/flexml/_model_tuner.py +++ b/flexml/_model_tuner.py @@ -1,19 +1,19 @@ -import numpy as np -import pandas as pd -import optuna +from time import time import joblib from joblib.parallel import BatchCompletionCallBack from contextlib import contextmanager +from tqdm import tqdm from typing import Optional, Union -from time import time +import numpy as np +import pandas as pd from sklearn.model_selection import ParameterGrid, GridSearchCV, RandomizedSearchCV from sklearn.pipeline import Pipeline from sklearn.base import clone +import optuna from flexml.config import TUNING_METRIC_TRANSFORMATIONS +from flexml.structures.custom_score import CustomScore from flexml.logger import get_logger from flexml.helpers import evaluate_model_perf -from copy import deepcopy -from tqdm import tqdm class TqdmBatchCompletionCallback(BatchCompletionCallBack): @@ -233,8 +233,8 @@ def grid_search( self, pipeline: Pipeline, param_grid: dict, - eval_metric: str, - cv: list, + eval_metric: Union[str, CustomScore], + cv: list, n_jobs: int = -1, verbose: int = 0 ) -> Optional[dict]: @@ -249,7 +249,7 @@ def grid_search( param_grid : dict The dictionary that contains the hyperparameters and their possible values - eval_metric : str + eval_metric : str or CustomScore The evaluation metric that will be used to evaluate the model. It can be one of the following: * 'R2' for R^2 score @@ -270,6 +270,8 @@ def grid_search( * 'F1 Score' for F1 score + * Or a custom CustomScore object + cv : list of tuples A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices for the training and test sets for that fold. For example: @@ -309,6 +311,18 @@ def grid_search( model_stats = self._setup_tuning("GridSearchCV", pipeline, param_grid, n_iter=None, n_jobs=n_jobs) param_grid = model_stats['tuning_param_grid'] + # Handle custom metrics + is_custom_metric = isinstance(eval_metric, CustomScore) + if is_custom_metric: + eval_metric_name = eval_metric.name + custom_scorer = eval_metric.get_scorer() + scoring = {eval_metric_name: custom_scorer} + refit_metric = eval_metric_name + else: + eval_metric_name = eval_metric + scoring = self.eval_metrics_in_tuning_format + refit_metric = eval_metric + try: t_start = time() @@ -321,8 +335,8 @@ def grid_search( search = GridSearchCV( pipeline, param_grid, - scoring=self.eval_metrics_in_tuning_format, - refit=eval_metric, + scoring=scoring, + refit=refit_metric, cv=cv, n_jobs=n_jobs, verbose=verbose @@ -339,21 +353,25 @@ def grid_search( t_end = time() time_taken = round(t_end - t_start, 2) - scores = { - metric: ( - -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] - if metric in self.reverse_signed_eval_metrics else - search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] - ) - for metric in list(self.eval_metrics_in_tuning_format.keys()) - } + if is_custom_metric: + mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_] + scores = {eval_metric_name: round(mean_score, 6)} + else: + scores = { + metric: ( + -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] + if metric in self.reverse_signed_eval_metrics else + search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] + ) + for metric in list(self.eval_metrics_in_tuning_format.keys()) + } + mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_] model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model'] - mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_] model_stats['tuned_model_score'] = round(mean_score, 6) model_stats['model_perf'] = scores model_stats['time_taken_sec'] = time_taken - model_stats['tuned_model_evaluation_metric'] = eval_metric + model_stats['tuned_model_evaluation_metric'] = eval_metric_name return model_stats except Exception as e: self.logger.error(f"Error while tuning the model with GridSearchCV, Error: {e}") @@ -363,7 +381,7 @@ def randomized_search( self, pipeline: Pipeline, param_grid: dict, - eval_metric: str, + eval_metric: Union[str, CustomScore], cv: list, n_iter: int = 10, n_jobs: int = -1, @@ -380,7 +398,7 @@ def randomized_search( param_grid : dict The dictionary that contains the hyperparameters and their possible values - eval_metric : str + eval_metric : str or CustomScore The evaluation metric that will be used to evaluate the model. It can be one of the following: * 'R2' for R^2 score @@ -401,6 +419,8 @@ def randomized_search( * 'F1 Score' for F1 score + * Or a custom CustomScore object + cv : list of tuples A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices for the training and test sets for that fold. For example: @@ -432,6 +452,18 @@ def randomized_search( model_stats = self._setup_tuning("randomized_search", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs) param_grid = model_stats['tuning_param_grid'] + # Handle custom metrics + is_custom_metric = isinstance(eval_metric, CustomScore) + if is_custom_metric: + eval_metric_name = eval_metric.name + custom_scorer = eval_metric.get_scorer() + scoring = {eval_metric_name: custom_scorer} + refit_metric = eval_metric_name + else: + eval_metric_name = eval_metric + scoring = self.eval_metrics_in_tuning_format + refit_metric = eval_metric + t_start = time() # Calculate total fits @@ -443,8 +475,8 @@ def randomized_search( estimator=pipeline, param_distributions=param_grid, n_iter=n_iter, - scoring=self.eval_metrics_in_tuning_format, - refit=eval_metric, + scoring=scoring, + refit=refit_metric, cv=cv, n_jobs=n_jobs, verbose=verbose @@ -461,21 +493,25 @@ def randomized_search( t_end = time() time_taken = round(t_end - t_start, 2) - scores = { - metric: ( - -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] - if metric in self.reverse_signed_eval_metrics else - search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] - ) - for metric in list(self.eval_metrics_in_tuning_format.keys()) - } + if is_custom_metric: + mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_] + scores = {eval_metric_name: round(mean_score, 6)} + else: + scores = { + metric: ( + -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] + if metric in self.reverse_signed_eval_metrics else + search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_] + ) + for metric in list(self.eval_metrics_in_tuning_format.keys()) + } + mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_] model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model'] - mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_] model_stats['tuned_model_score'] = round(mean_score, 6) model_stats['model_perf'] = scores model_stats['time_taken_sec'] = time_taken - model_stats['tuned_model_evaluation_metric'] = eval_metric + model_stats['tuned_model_evaluation_metric'] = eval_metric_name return model_stats @@ -483,7 +519,7 @@ def optuna_search( self, pipeline: Pipeline, param_grid: dict, - eval_metric: str, + eval_metric: Union[str, CustomScore], cv: list, n_iter: int = 10, timeout: Optional[int] = None, @@ -501,7 +537,7 @@ def optuna_search( param_grid : dict The dictionary that contains the hyperparameters and their possible values - eval_metric : str + eval_metric : str or CustomScore The evaluation metric that will be used to evaluate the model. It can be one of the following: * 'R2' for R^2 score @@ -522,6 +558,8 @@ def optuna_search( * 'F1 Score' for F1 score + * Or a custom CustomScore object + cv : list of tuples A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices for the training and test sets for that fold. For example: @@ -571,6 +609,15 @@ def optuna_search( model_stats = self._setup_tuning("optuna", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs, prefix_param_grid_flag=False) param_grid = model_stats['tuning_param_grid'] + # Handle custom metrics + is_custom_metric = isinstance(eval_metric, CustomScore) + if is_custom_metric: + eval_metric_name = eval_metric.name + study_direction = eval_metric.direction + else: + eval_metric_name = eval_metric + study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'] else "minimize" + # Set verbosity levels if verbose == 0: optuna.logging.set_verbosity(optuna.logging.CRITICAL) @@ -583,8 +630,6 @@ def optuna_search( elif verbose == 4: optuna.logging.set_verbosity(optuna.logging.DEBUG) - study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'] else "minimize" - def objective(trial): # Generate parameters for the trial params = pipeline.named_steps['model'].get_params() @@ -617,17 +662,25 @@ def objective(trial): new_pipeline.fit(X_train, y_train) + # Get predictions based on whether we need probabilities or labels if self.ml_problem_type == "Classification" and hasattr(new_pipeline, 'predict_proba'): y_pred = new_pipeline.predict_proba(X_test) else: y_pred = new_pipeline.predict(X_test) - # Evaluate performance - scores.append(evaluate_model_perf(self.ml_problem_type, y_test, y_pred)) + if is_custom_metric: + scores.append(evaluate_model_perf( + self.ml_problem_type, + y_test, + y_pred, + custom_score=eval_metric + )) + else: + scores.append(evaluate_model_perf(self.ml_problem_type, y_test, y_pred)) # Calculate the mean score across all folds avg_metrics = {k: np.mean([m[k] if m[k] is not None else -1 for m in scores]) for k in scores[0]} - mean_score = avg_metrics.get(eval_metric, float('inf')) + mean_score = avg_metrics.get(eval_metric_name, float('inf')) # Update the best score and model if model_stats['tuned_model_score'] is None or (study_direction == "maximize" and mean_score > model_stats['tuned_model_score']) or (study_direction == "minimize" and mean_score < model_stats['tuned_model_score']): diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py index 1393571..06308b5 100644 --- a/flexml/helpers/supervised_helpers.py +++ b/flexml/helpers/supervised_helpers.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd -from typing import Union +from typing import Union, Optional, Callable +from flexml.structures.custom_score import CustomScore from sklearn.metrics import ( r2_score, @@ -10,7 +11,8 @@ precision_score, recall_score, f1_score, - roc_auc_score) + roc_auc_score +) def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float: @@ -36,8 +38,8 @@ def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np def _evaluate_preds( y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray], - eval_metric: str, - average: str = 'macro' + eval_metric: Union[str, CustomScore], + average: Optional[str] = 'macro' ) -> float: """ Evaluates the model with the given evaluation metric by using the test set @@ -50,7 +52,7 @@ def _evaluate_preds( y_pred : pd.Series or np.ndarray The predicted values/probabilities of the target column - eval_metric : str + eval_metric : str or CustomScore The evaluation metric that will be used to evaluate the model - Avaiable evalulation metrics for Regression: @@ -59,6 +61,8 @@ def _evaluate_preds( - Avaiable evalulation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC + - Or a custom CustomScore object + average : str, default='macro' The averaging method to use for multiclass classification metrics. Options are ['binary', 'micro', 'macro', 'weighted']. @@ -70,6 +74,14 @@ def _evaluate_preds( float The evaluation metric score for the desired eval metric """ + # Handle custom callable metrics + if isinstance(eval_metric, CustomScore): + try: + return round(float(eval_metric(y_true, y_pred)), 6) + except Exception as e: + raise ValueError(f"Error while evaluating with custom score: {str(e)}") + + # Handle standard string-based metrics if eval_metric == 'R2': return round(r2_score(y_true, y_pred), 6) elif eval_metric == 'MAE': @@ -102,7 +114,8 @@ def _evaluate_preds( def evaluate_model_perf( ml_task_type, y_test, - y_pred + y_pred, + custom_score: Optional[CustomScore] = None ) -> dict: """ Evaluates how good are the predictions by comparing them with the actual values, returns regression evaluation scores @@ -120,23 +133,30 @@ def evaluate_model_perf( For classification tasks: The predicted probabilities for each class. Note: Some models like Perceptron, PassiveAggressiveClassifier, etc. don't have predict_proba method, so they return class labels directly. + custom_score : CustomScore, optional (default=None) + A custom score object with signature: func(y_true, y_pred) -> float + If provided, this score will be calculated in addition to standard metrics + Returns ------- dict - A dictionary containing the evaluation metric of the current task + A dictionary containing the evaluation metrics of the current task * R2, MAE, MSE, RMSE, MAPE for Regression tasks * Accuracy, Precision, Recall, F1 Score, ROC-AUC for Classification tasks + + * Plus the custom eval metric if custom_score is provided """ - + # Standard metric evaluation + standard_metrics = {} if ml_task_type == "Regression": r2 = _evaluate_preds(y_test, y_pred, 'R2') mae = _evaluate_preds(y_test, y_pred, 'MAE') mse = _evaluate_preds(y_test, y_pred, 'MSE') rmse = _evaluate_preds(y_test, y_pred, 'RMSE') mape = _evaluate_preds(y_test, y_pred, 'MAPE') - return { + standard_metrics = { "R2": r2, "MAE": mae, "MSE": mse, @@ -164,10 +184,35 @@ def evaluate_model_perf( # Use probabilities for ROC-AUC roc_auc = _evaluate_preds(y_test, y_pred, 'ROC-AUC', average=avg_method) - return { + standard_metrics = { "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1, "ROC-AUC": roc_auc - } \ No newline at end of file + } + + # If custom metric is provided, calculate it and add to standard metrics + if custom_score is not None: + eval_metric_name = custom_score.name + + # For Classification: handle proba vs labels + if ml_task_type == "Classification" and not custom_score.needs_proba: + y_pred_for_metric = y_pred_labels + elif ml_task_type == "Classification" and custom_score.needs_proba: + # Determine if binary or multiclass + if y_pred.shape[1] == 2: + y_pred_for_metric = y_pred[:, 1] + else: + y_pred_for_metric = y_pred + else: + y_pred_for_metric = y_pred + + # Call custom function and add to standard metrics + try: + score = _evaluate_preds(y_test, y_pred_for_metric, custom_score) + standard_metrics[eval_metric_name] = round(score, 6) + except Exception as e: + raise ValueError(f"Error while evaluating with custom eval metric '{eval_metric_name}': {str(e)}") + + return standard_metrics \ No newline at end of file diff --git a/flexml/helpers/validators.py b/flexml/helpers/validators.py index 0176cb8..2f95fe9 100644 --- a/flexml/helpers/validators.py +++ b/flexml/helpers/validators.py @@ -1,15 +1,17 @@ import pandas as pd -from typing import Optional, List +from typing import Optional, List, Union from flexml.config import EVALUATION_METRICS, FEATURE_ENGINEERING_METHODS, CROSS_VALIDATION_METHODS from flexml.logger import get_logger import re +from flexml.structures.custom_score import CustomScore + def eval_metric_checker( ml_task_type: str, - eval_metric: Optional[str] = None, + eval_metric: Optional[Union[str, CustomScore]] = None, all_evaluation_metrics: Optional[List[str]] = None, default_evaluation_metric: Optional[str] = None -) -> str: +) -> Union[str, CustomScore]: """ Since eval_metric setting and validation is a common process for both Regression and Classification tasks... this method is used to set and validate the evaluation metric. @@ -19,7 +21,7 @@ def eval_metric_checker( ml_task_type : str The type of ML task ('Regression' or 'Classification') - eval_metric : str, optional (default='R2' for Regression, 'Accuracy' for Classification) + eval_metric : str or CustomScore, optional (default='R2' for Regression, 'Accuracy' for Classification) The evaluation metric to use for model evaluation - Avaiable evalulation metrics for Regression: @@ -27,6 +29,8 @@ def eval_metric_checker( - Avaiable evalulation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC + + - Or a custom CustomScore object all_evaluation_metrics : List[str], (default=None) All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE', 'MAPE'] for Regression @@ -40,11 +44,15 @@ def eval_metric_checker( Returns ------- - str + str or CustomScore The evaluation metric to use for model evaluation for the current task (Regression or Classification) """ logger = get_logger(__name__, "PROD", False) + + if isinstance(eval_metric, CustomScore): + return eval_metric + # Standard string-based metric validation if default_evaluation_metric is None or all_evaluation_metrics is None: default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"] all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"] diff --git a/flexml/structures/custom_score.py b/flexml/structures/custom_score.py new file mode 100644 index 0000000..5e4688c --- /dev/null +++ b/flexml/structures/custom_score.py @@ -0,0 +1,53 @@ +from typing import Union, Callable +import inspect +import numpy as np +import pandas as pd +from sklearn.metrics import make_scorer + + +class CustomScore: + def __init__( + self, + name: str, + score_func: Callable, + needs_proba: bool, + direction: str + ): + self.name = name + self.score_func = score_func + self.needs_proba = needs_proba + self.direction = direction + + if direction not in ['maximize', 'minimize']: + raise ValueError(f"direction must be either 'maximize' or 'minimize', got '{direction}'") + + if needs_proba is None or not isinstance(needs_proba, bool): + raise ValueError(f"needs_proba must be a boolean, got '{needs_proba}'") + + try: + sig = inspect.signature(score_func) + params = list(sig.parameters.keys()) + + # Check if function has exactly 2 parameters + if len(params) != 2: + raise ValueError( + f"Custom evaluation function must have exactly 2 parameters (y_true, y_pred), " + f"but got {len(params)} parameters: {params}" + ) + except Exception as e: + raise ValueError(f"Error validating custom evaluation function: {str(e)}") + + self.scorer = make_scorer( + self.score_func, + needs_proba=self.needs_proba, + greater_is_better=self.direction == 'maximize' + ) + + def __call__(self, y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float: + return self.score_func(y_true, y_pred) + + def __repr__(self): + return f"CustomScore(name={self.name}, score_func={self.score_func.__name__}, needs_proba={self.needs_proba}, direction={self.direction})" + + def get_scorer(self): + return self.scorer \ No newline at end of file diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 70a48bd..4969280 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -4,7 +4,7 @@ from collections import defaultdict from copy import deepcopy from time import time -from typing import Union, Optional, List, Dict +from typing import Union, Optional, List, Dict, Callable from tqdm import tqdm from rich.console import Console from rich.table import Table @@ -32,6 +32,7 @@ plot_prediction_error, plot_calibration_curve ) +from flexml.structures.custom_score import CustomScore from flexml._model_tuner import ModelTuner from flexml._feature_engineer import FeatureEngineering @@ -405,7 +406,16 @@ def __process_experiment_result(self, experiment_stats: dict): for key, value in aggregated_metrics.items() } - best_model_entry = max(model_entries, key=lambda x: x["model_stats"][self.eval_metric]) + # Get the metric name to use for selecting best model + metric_key = self.eval_metric.name if self._is_custom_metric else self.eval_metric + + # For minimize metrics, select the min instead of max + if self._is_custom_metric and self.eval_metric.direction == 'minimize': + best_model_entry = min(model_entries, key=lambda x: x["model_stats"][metric_key]) + elif not self._is_custom_metric and self.__ML_TASK_TYPE == "Regression" and metric_key in ['MAE', 'MSE', 'RMSE', 'MAPE']: + best_model_entry = min(model_entries, key=lambda x: x["model_stats"][metric_key]) + else: + best_model_entry = max(model_entries, key=lambda x: x["model_stats"][metric_key]) self.__model_training_info.append({ model_name: { @@ -420,7 +430,10 @@ def start_experiment( cv_method: Optional[str] = None, n_folds: Optional[int] = None, test_size: Optional[float] = None, - eval_metric: Optional[str] = None, + eval_metric: Optional[Union[str, Callable]] = None, + custom_metric_name: Optional[str] = None, + custom_metric_needs_proba: Optional[bool] = None, + custom_metric_direction: Optional[str] = None, random_state: Optional[int] = 42, groups_col: Optional[str] = None, n_jobs: Optional[int] = -1 @@ -460,7 +473,7 @@ def start_experiment( test_size : float, (default=0.25 for hold-out cv, None for other methods) The size of the test data if using hold-out or shuffle-based splits - eval_metric : str, optional (default='R2' for Regression, 'Accuracy' for Classification) + eval_metric : str or callable, optional (default='R2' for Regression, 'Accuracy' for Classification) The evaluation metric to use for model evaluation - Avaiable evalulation metrics for Regression: @@ -468,6 +481,21 @@ def start_experiment( - Avaiable evalulation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC + + - Or a custom callable function with signature: func(y_true, y_pred) -> float + + custom_metric_name : str, optional (default=None) + The name to use for the custom metric. If None and eval_metric is callable, + uses the function's __name__ attribute + + custom_metric_needs_proba : bool, optional (default=False) + For classification tasks only: If True, passes probabilities to the custom metric function. + If False, converts probabilities to class labels before passing to the function. + Ignored for regression tasks (no error raised) + + custom_metric_direction : str, optional (default='maximize') + Direction for optimizing the custom metric. Either 'maximize' or 'minimize'. + Used for sorting models in the leaderboard when using custom metrics random_state : int, optional (default=None) The random state value for the model training process @@ -485,7 +513,20 @@ def start_experiment( - Defaults to a standard 5-fold if neither `n_folds` nor `test_size` is provided """ experiment_size = experiment_size.lower() # Convert to lowercase in case of any case mismatch - self.eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) + if isinstance(eval_metric, Callable): + self.eval_metric = CustomScore( + name=custom_metric_name, + score_func=eval_metric, + needs_proba=custom_metric_needs_proba, + direction=custom_metric_direction + ) + self._is_custom_metric = True + self._custom_metric_name = custom_metric_name + else: + self.eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) + self._is_custom_metric = False + self._custom_metric_name = None + random_state = random_state_checker(random_state) # Check cross-validation method params @@ -611,7 +652,12 @@ def start_experiment( else: y_pred = model.predict(X_test) - model_perf = evaluate_model_perf(self.__ML_TASK_TYPE, y_test, y_pred) + model_perf = evaluate_model_perf( + self.__ML_TASK_TYPE, + y_test, + y_pred, + custom_score=self.eval_metric if self._is_custom_metric else None + ) all_metrics.append(model_perf) all_times.append(time_taken) @@ -643,10 +689,11 @@ def start_experiment( pbar.update(1) self.__process_experiment_result(all_model_stats) - self.__logger.info("[PROCESS] Model training is finished!") - self.get_best_models(eval_metric) - self.show_model_stats(eval_metric) + + display_metric = self._custom_metric_name if self._is_custom_metric else eval_metric + self.get_best_models(display_metric) + self.show_model_stats(display_metric) def get_model_by_name(self, model_name: str) -> object: """ @@ -704,9 +751,15 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int = top_n_models = self.__top_n_models_checker(top_n_models) - if eval_metric is None and hasattr(self, 'eval_metric'): + if eval_metric is None: eval_metric = self.eval_metric - eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) + + if isinstance(eval_metric, CustomScore): + eval_metric = eval_metric.name + elif isinstance(eval_metric, str) and self._is_custom_metric and eval_metric == self.eval_metric.name: + eval_metric = self.eval_metric.name + else: + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) model_stats = [] best_models = [] @@ -1158,11 +1211,15 @@ def __sort_models(self, eval_metric: Optional[str] = None): self.__logger.error(error_msg) raise ValueError(error_msg) - # Since lower is better for mae, mse and rmse in Regression tasks, they should be sorted in ascending order - if self.__ML_TASK_TYPE == "Regression" and eval_metric in ['MAE', 'MSE', 'RMSE', 'MAPE']: - return self._model_stats_df.sort_values(by=eval_metric, ascending=True).reset_index(drop = True) + # Determine sort direction + if self._is_custom_metric and eval_metric == self._custom_metric_name: + ascending = (self.eval_metric.direction == 'minimize') + elif self.__ML_TASK_TYPE == "Regression" and eval_metric in ['MAE', 'MSE', 'RMSE', 'MAPE']: + ascending = True else: - return self._model_stats_df.sort_values(by=eval_metric, ascending=False).reset_index(drop = True) + ascending = False # F1, ROC-AUC, R2, Accuracy, etc. + + return self._model_stats_df.sort_values(by=eval_metric, ascending=ascending).reset_index(drop = True) def show_model_stats(self, eval_metric: Optional[str] = None): """ @@ -1193,7 +1250,12 @@ def highlight_best(s: pd.Series) -> list[str]: list[str] A list of strings containing the green background color for the best value so we can highlight it while showing the model stats """ - if s.name in ['MAE', 'MSE', 'RMSE', 'MAPE']: + # Check if this is a custom metric that should be minimized + is_custom_minimize = (self._is_custom_metric and + self.eval_metric.direction == 'minimize') + + # Determine if we should minimize or maximize + if s.name in ['MAE', 'MSE', 'RMSE', 'MAPE'] or is_custom_minimize: s_nonneg = s.where(s >= 0, np.nan) best_val = s_nonneg.min() if best_val == float('inf'): @@ -1205,9 +1267,17 @@ def highlight_best(s: pd.Series) -> list[str]: return ['background-color: green' if v else '' for v in is_best] - if eval_metric is None and hasattr(self, 'eval_metric'): + if eval_metric is None: eval_metric = self.eval_metric - eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) + + if isinstance(eval_metric, CustomScore): + eval_metric = eval_metric.name + elif isinstance(eval_metric, str) and self._is_custom_metric and eval_metric == self.eval_metric.name: + eval_metric = self.eval_metric.name + else: + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) + + sorted_model_stats_df = self.__sort_models(eval_metric) sorted_model_stats_df['Time (sec)'] = sorted_model_stats_df['Time (sec)'].apply(lambda x: f"{x:.2f}") @@ -1225,7 +1295,14 @@ def highlight_best(s: pd.Series) -> list[str]: if len(sorted_model_stats_df) < 2: display(sorted_model_stats_df) else: - styler = sorted_model_stats_df.style.apply(highlight_best, subset=self.__ALL_EVALUATION_METRICS) + # Determine which columns to highlight - always include all standard metrics + highlight_columns = self.__ALL_EVALUATION_METRICS.copy() + + # Add custom metric column if present + if self._is_custom_metric: + highlight_columns.append(self._custom_metric_name) + + styler = sorted_model_stats_df.style.apply(highlight_best, subset=highlight_columns) display(styler) # display is only supported in interactive kernels such as Jupyter Notebook/Google Colab def tune_model( @@ -1416,9 +1493,14 @@ def _show_tuning_report(tuning_report: Optional[dict] = None): self.__logger.error(error_msg) raise ValueError(error_msg) - if eval_metric is None and hasattr(self, 'eval_metric'): + # If no eval_metric provided, use the one from start_experiment + if eval_metric is None: eval_metric = self.eval_metric - eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) + + if isinstance(eval_metric, str) and self._is_custom_metric and eval_metric == self.eval_metric.name: + eval_metric = self.eval_metric + else: + eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) # If the user doesn't pass any cross-validation method params, use the last used ones if ( diff --git a/tests/test_custom_metrics.py b/tests/test_custom_metrics.py new file mode 100644 index 0000000..b16772e --- /dev/null +++ b/tests/test_custom_metrics.py @@ -0,0 +1,506 @@ +import unittest +import numpy as np +from sklearn.datasets import load_diabetes, load_iris, load_breast_cancer +from flexml import Regression, Classification +from flexml.logger import get_logger +import warnings +warnings.filterwarnings("ignore") + + +class TestCustomMetrics(unittest.TestCase): + """Test suite for custom scoring function feature""" + + logger = get_logger(__name__, "TEST") + logger.setLevel("DEBUG") + + # ========================================================================= + # Custom Metric Functions + # ========================================================================= + + @staticmethod + def custom_mse_doubled(y_true, y_pred): + """Custom MSE metric multiplied by 2 (for regression)""" + mse = np.mean((y_true - y_pred) ** 2) + return mse * 2 + + @staticmethod + def custom_accuracy_halved(y_true, y_pred_labels): + """Custom accuracy metric divided by 2 (for classification with labels)""" + accuracy = np.mean(y_true == y_pred_labels) + return accuracy / 2 + + @staticmethod + def custom_roc_auc_doubled(y_true, y_proba): + """Custom ROC-AUC metric multiplied by 2 (for classification with probabilities) + + Note: For binary classification, y_proba will be 1D (only positive class probabilities) + For multiclass classification, y_proba will be 2D (all class probabilities) + """ + from sklearn.metrics import roc_auc_score + + # Handle multiclass case (2D array with >2 classes) + if len(y_proba.shape) > 1 and y_proba.shape[1] > 2: + # Multiclass: use OVR strategy + roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro') + else: + # Binary classification: y_proba is already 1D with positive class probabilities + # OR it's a 2D array with shape (n_samples, 1) + if len(y_proba.shape) > 1: + y_proba = y_proba.ravel() # Flatten if needed + roc_auc = roc_auc_score(y_true, y_proba) + + return roc_auc * 2 + + # ========================================================================= + # Test 1: Regression with Custom Metric + # ========================================================================= + + def test_01_regression_custom_metric(self): + """Test regression task with custom metric (MSE * 2)""" + self.logger.info("=" * 80) + self.logger.info("TEST 1: Regression with Custom Metric (MSE * 2)") + self.logger.info("=" * 80) + + # Load diabetes dataset + df = load_diabetes(as_frame=True)['frame'] + + # Create regression model + model = Regression( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_mse_doubled, + custom_metric_name='Custom MSE x2', + custom_metric_direction='minimize', + custom_metric_needs_proba=False, + cv_method='kfold', + n_folds=3, + n_jobs=1 + ) + + # Verify custom metric was used + self.assertIsNotNone(model._model_stats_df) + self.assertIn('Custom MSE x2', model._model_stats_df.columns) + + # Verify metric values are reasonable (should be roughly 2x normal MSE) + # Normal MSE for this dataset is around 3000-6000, so doubled should be 6000-12000 + custom_metric_values = model._model_stats_df['Custom MSE x2'].values + self.assertTrue(all(val > 0 for val in custom_metric_values), + "Custom metric values should be positive") + self.assertTrue(all(val > 1000 for val in custom_metric_values), + "Custom MSE x2 values should be reasonably large") + + # Verify model selection works + best_model = model.get_best_models() + self.assertIsNotNone(best_model) + + self.logger.info("✓ Regression with custom metric test PASSED") + + # ========================================================================= + # Test 2: Binary Classification with Custom Metric (Labels) + # ========================================================================= + + def test_02_binary_classification_custom_metric_labels(self): + """Test binary classification with custom metric for labels (Accuracy / 2)""" + self.logger.info("=" * 80) + self.logger.info("TEST 2: Binary Classification with Custom Metric (Accuracy / 2)") + self.logger.info("=" * 80) + + # Load breast cancer dataset + df = load_breast_cancer(as_frame=True)['frame'] + + # Create classification model + model = Classification( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric that needs labels + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_accuracy_halved, + custom_metric_name='Custom Accuracy / 2', + custom_metric_needs_proba=False, # Pass labels, not probabilities + custom_metric_direction='maximize', + cv_method='kfold', + n_folds=3, + n_jobs=1 + ) + + # Verify custom metric was used + self.assertIsNotNone(model._model_stats_df) + self.assertIn('Custom Accuracy / 2', model._model_stats_df.columns) + + # Verify metric values are in expected range (0 to 0.5 since accuracy/2) + custom_metric_values = model._model_stats_df['Custom Accuracy / 2'].values + self.assertTrue(all(0 <= val <= 0.5 for val in custom_metric_values), + "Custom Accuracy / 2 should be between 0 and 0.5") + self.assertTrue(all(val > 0.2 for val in custom_metric_values), + "Custom Accuracy / 2 should be reasonably high (> 0.2)") + + # Verify model selection works + best_model = model.get_best_models() + self.assertIsNotNone(best_model) + + # Verify that the custom metric parameters were stored + self.assertTrue(model._is_custom_metric) + self.assertEqual(model.eval_metric.name, 'Custom Accuracy / 2') + self.assertEqual(model.eval_metric.needs_proba, False) + self.assertEqual(model.eval_metric.direction, 'maximize') + + self.logger.info("✓ Binary classification with custom metric (labels) test PASSED") + + # ========================================================================= + # Test 3: Binary Classification with Custom Metric (Probabilities) + # ========================================================================= + + def test_03_binary_classification_custom_metric_proba(self): + """Test binary classification with custom metric for probabilities (ROC-AUC * 2)""" + self.logger.info("=" * 80) + self.logger.info("TEST 3: Binary Classification with Custom Metric (ROC-AUC * 2)") + self.logger.info("=" * 80) + + # Load breast cancer dataset + df = load_breast_cancer(as_frame=True)['frame'] + + # Create classification model + model = Classification( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric that needs probabilities + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_roc_auc_doubled, + custom_metric_name='Custom ROC-AUC x2', + custom_metric_needs_proba=True, # Pass probabilities + custom_metric_direction='maximize', + cv_method='kfold', + n_folds=3, + n_jobs=1 + ) + + # Verify custom metric was used + self.assertIsNotNone(model._model_stats_df) + self.assertIn('Custom ROC-AUC x2', model._model_stats_df.columns) + + # Verify metric values are in expected range (0 to 2 since ROC-AUC*2) + custom_metric_values = model._model_stats_df['Custom ROC-AUC x2'].values + self.assertTrue(all(0 <= val <= 2 for val in custom_metric_values), + "Custom ROC-AUC x2 should be between 0 and 2") + self.assertTrue(all(val > 1.0 for val in custom_metric_values), + "Custom ROC-AUC x2 should be reasonably high (> 1.0)") + + # Verify model selection works + best_model = model.get_best_models() + self.assertIsNotNone(best_model) + + # Verify that the custom metric parameters were stored + self.assertTrue(model._is_custom_metric) + self.assertEqual(model.eval_metric.name, 'Custom ROC-AUC x2') + self.assertEqual(model.eval_metric.needs_proba, True) + self.assertEqual(model.eval_metric.direction, 'maximize') + + self.logger.info("✓ Binary classification with custom metric (probabilities) test PASSED") + + # ========================================================================= + # Test 4: Multiclass Classification with Custom Metric (Probabilities) + # ========================================================================= + + def test_04_multiclass_classification_custom_metric_proba(self): + """Test multiclass classification with custom metric for probabilities (ROC-AUC * 2)""" + self.logger.info("=" * 80) + self.logger.info("TEST 4: Multiclass Classification with Custom Metric (ROC-AUC * 2)") + self.logger.info("=" * 80) + + # Load iris dataset + df = load_iris(as_frame=True)['frame'] + + # Create classification model + model = Classification( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric that needs probabilities + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_roc_auc_doubled, + custom_metric_name='Custom ROC-AUC x2', + custom_metric_needs_proba=True, # Pass probabilities + custom_metric_direction='maximize', + cv_method='kfold', + n_folds=3, + n_jobs=1 + ) + + # Verify custom metric was used + self.assertIsNotNone(model._model_stats_df) + self.assertIn('Custom ROC-AUC x2', model._model_stats_df.columns) + + # Verify metric values are in expected range + custom_metric_values = model._model_stats_df['Custom ROC-AUC x2'].values + self.assertTrue(all(0 <= val <= 2 for val in custom_metric_values), + "Custom ROC-AUC x2 should be between 0 and 2") + self.assertTrue(all(val > 1.0 for val in custom_metric_values), + "Custom ROC-AUC x2 should be reasonably high (> 1.0)") + + # Verify model selection works + best_model = model.get_best_models() + self.assertIsNotNone(best_model) + + self.logger.info("✓ Multiclass classification with custom metric (probabilities) test PASSED") + + # ========================================================================= + # Test 5: Tuning with Custom Metric (Regression) + # ========================================================================= + + def test_05_tuning_with_custom_metric_regression(self): + """Test model tuning with custom metric in regression""" + self.logger.info("=" * 80) + self.logger.info("TEST 5: Tuning with Custom Metric (Regression)") + self.logger.info("=" * 80) + + # Load diabetes dataset + df = load_diabetes(as_frame=True)['frame'] + + # Create regression model + model = Regression( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_mse_doubled, + custom_metric_name='Custom MSE x2', + custom_metric_direction='minimize', + custom_metric_needs_proba=False, + cv_method='kfold', + n_folds=3, + n_jobs=1 + ) + + # Tune the best model with custom metric (should inherit parameters) + model.tune_model( + tuning_method='randomized_search', + n_iter=2, # Small number for quick test + n_jobs=-1, + verbose=0 + ) + + # Verify tuning completed and custom metric was used + # Check that the leaderboard has a tuned model + self.assertIsNotNone(model._model_stats_df) + tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('randomized_search', case=False)] + self.assertTrue(len(tuned_models) > 0, "Tuned model should be added to leaderboard") + + self.logger.info("✓ Tuning with custom metric (regression) test PASSED") + + # ========================================================================= + # Test 6: Tuning with Custom Metric (Classification) + # ========================================================================= + + def test_06_tuning_with_custom_metric_classification(self): + """Test model tuning with custom metric in classification""" + self.logger.info("=" * 80) + self.logger.info("TEST 6: Tuning with Custom Metric (Classification)") + self.logger.info("=" * 80) + + # Load breast cancer dataset + df = load_breast_cancer(as_frame=True)['frame'] + + # Create classification model + model = Classification( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_roc_auc_doubled, + custom_metric_name='Custom ROC-AUC x2', + custom_metric_needs_proba=True, + custom_metric_direction='maximize', + cv_method='kfold', + n_folds=3, + n_jobs=1 + ) + + # Tune the best model with custom metric + model.tune_model( + tuning_method='randomized_search', + n_iter=2, # Small number for quick test + n_jobs=-1, + verbose=0 + ) + + # Verify tuning completed + # Check that the leaderboard has a tuned model + self.assertIsNotNone(model._model_stats_df) + tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('randomized_search', case=False)] + self.assertTrue(len(tuned_models) > 0, "Tuned model should be added to leaderboard") + + self.logger.info("✓ Tuning with custom metric (classification) test PASSED") + + # ========================================================================= + # Test 7: Optuna Tuning with Custom Metric (Regression) + # ========================================================================= + + def test_07_optuna_tuning_regression(self): + """Test Optuna tuning with custom metric in regression""" + self.logger.info("=" * 80) + self.logger.info("TEST 7: Optuna Tuning with Custom Metric (Regression)") + self.logger.info("=" * 80) + + # Load diabetes dataset + df = load_diabetes(as_frame=True)['frame'] + + # Create regression model + model = Regression( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_mse_doubled, + custom_metric_name='Custom MSE x2', + custom_metric_direction='minimize', + custom_metric_needs_proba=False, + cv_method='kfold', + n_folds=3, + n_jobs=-1 + ) + + # Tune with Optuna (should inherit parameters) + model.tune_model( + tuning_method='optuna', + n_iter=2, # Small number for quick test + n_jobs=-1, + verbose=0 + ) + + # Verify tuning completed + self.assertIsNotNone(model._model_stats_df) + tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('optuna', case=False)] + self.assertTrue(len(tuned_models) > 0, "Optuna tuned model should be added to leaderboard") + + # Verify the custom metric was used in tuning + tuned_model_row = tuned_models.iloc[0] + self.assertIn('Custom MSE x2', tuned_model_row.index) + self.assertTrue(tuned_model_row['Custom MSE x2'] > 0, "Custom metric should have valid value") + + self.logger.info("✓ Optuna tuning with custom metric (regression) test PASSED") + + # ========================================================================= + # Test 8: Optuna Tuning with Custom Metric (Classification) + # ========================================================================= + + def test_08_optuna_tuning_classification(self): + """Test Optuna tuning with custom metric in classification""" + self.logger.info("=" * 80) + self.logger.info("TEST 8: Optuna Tuning with Custom Metric (Classification)") + self.logger.info("=" * 80) + + # Load breast cancer dataset + df = load_breast_cancer(as_frame=True)['frame'] + + # Create classification model + model = Classification( + data=df, + target_col='target', + random_state=42 + ) + + # Run experiment with custom metric that needs probabilities + model.start_experiment( + experiment_size='quick', + eval_metric=self.custom_roc_auc_doubled, + custom_metric_name='Custom ROC-AUC x2', + custom_metric_direction='maximize', + custom_metric_needs_proba=True, + cv_method='kfold', + n_folds=3, + n_jobs=-1 + ) + + # Tune with Optuna + model.tune_model( + tuning_method='optuna', + n_iter=2, # Small number for quick test + n_jobs=-1, + verbose=0 + ) + + # Verify tuning completed + self.assertIsNotNone(model._model_stats_df) + tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('optuna', case=False)] + self.assertTrue(len(tuned_models) > 0, "Optuna tuned model should be added to leaderboard") + + # Verify the custom metric was used in tuning + tuned_model_row = tuned_models.iloc[0] + self.assertIn('Custom ROC-AUC x2', tuned_model_row.index) + self.assertTrue(0 <= tuned_model_row['Custom ROC-AUC x2'] <= 2, + "Custom ROC-AUC x2 should be between 0 and 2") + + self.logger.info("✓ Optuna tuning with custom metric (classification) test PASSED") + + # ========================================================================= + # Test 9: Invalid Custom Metric Function + # ========================================================================= + + def test_09_invalid_custom_metric_function(self): + """Test that invalid custom metric functions are rejected""" + self.logger.info("=" * 80) + self.logger.info("TEST 9: Invalid Custom Metric Function") + self.logger.info("=" * 80) + + # Load dataset + df = load_diabetes(as_frame=True)['frame'] + + # Create regression model + model = Regression( + data=df, + target_col='target', + random_state=42 + ) + + # Define invalid function (wrong number of parameters) + def invalid_metric(y_true, y_pred, extra_param): + return np.mean((y_true - y_pred) ** 2) + + # Should raise ValueError + with self.assertRaises(ValueError) as context: + model.start_experiment( + experiment_size='quick', + eval_metric=invalid_metric, + custom_metric_direction='maximize', + custom_metric_needs_proba=False, + cv_method='kfold', + n_folds=3 + ) + + self.assertIn("exactly 2 parameters", str(context.exception)) + + self.logger.info("✓ Invalid custom metric function test PASSED") + + +if __name__ == '__main__': + unittest.main() + From 6ca26e37ec2f0b359ee459298639024eb20bd885 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 17:56:36 +0300 Subject: [PATCH 2/8] Added name validation to custom score --- flexml/structures/custom_score.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flexml/structures/custom_score.py b/flexml/structures/custom_score.py index 5e4688c..cf99349 100644 --- a/flexml/structures/custom_score.py +++ b/flexml/structures/custom_score.py @@ -18,6 +18,9 @@ def __init__( self.needs_proba = needs_proba self.direction = direction + if not isinstance(self.name, str) or not self.name.strip(): + raise ValueError(f"name must be a non-empty string, got '{self.name}'") + if direction not in ['maximize', 'minimize']: raise ValueError(f"direction must be either 'maximize' or 'minimize', got '{direction}'") From c8a0639244f26f7a547a88e83c12d54401b9b38d Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 17:58:39 +0300 Subject: [PATCH 3/8] Misspelling fixes --- flexml/helpers/supervised_helpers.py | 4 ++-- flexml/helpers/validators.py | 4 ++-- flexml/structures/supervised_base.py | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py index 06308b5..8bd7a70 100644 --- a/flexml/helpers/supervised_helpers.py +++ b/flexml/helpers/supervised_helpers.py @@ -55,10 +55,10 @@ def _evaluate_preds( eval_metric : str or CustomScore The evaluation metric that will be used to evaluate the model - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC - Or a custom CustomScore object diff --git a/flexml/helpers/validators.py b/flexml/helpers/validators.py index 2f95fe9..300f7a8 100644 --- a/flexml/helpers/validators.py +++ b/flexml/helpers/validators.py @@ -24,10 +24,10 @@ def eval_metric_checker( eval_metric : str or CustomScore, optional (default='R2' for Regression, 'Accuracy' for Classification) The evaluation metric to use for model evaluation - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC - Or a custom CustomScore object diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 4969280..53cc310 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -476,10 +476,10 @@ def start_experiment( eval_metric : str or callable, optional (default='R2' for Regression, 'Accuracy' for Classification) The evaluation metric to use for model evaluation - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC - Or a custom callable function with signature: func(y_true, y_pred) -> float @@ -735,10 +735,10 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int = eval_metric : str, optional Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC Returns @@ -1195,10 +1195,10 @@ def __sort_models(self, eval_metric: Optional[str] = None): eval_metric : str, optional Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC Returns @@ -1230,10 +1230,10 @@ def show_model_stats(self, eval_metric: Optional[str] = None): eval_metric : str, optional Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC """ def highlight_best(s: pd.Series) -> list[str]: @@ -1371,10 +1371,10 @@ def tune_model( eval_metric : str, optional Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used - - Avaiable evalulation metrics for Regression: + - Available evaluation metrics for Regression: - R2, MAE, MSE, RMSE, MAPE - - Avaiable evalulation metrics for Classification: + - Available evaluation metrics for Classification: - Accuracy, Precision, Recall, F1 Score, ROC-AUC param_grid : dict (default = defined custom param dict in flexml/config/tune_model_config.py) From 44c37f9bf98a334a94d94aa1e5ce9bc4b476ae49 Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 18:04:51 +0300 Subject: [PATCH 4/8] Error fix when show_model_stats() or get_best_models() are called before start_experiment() --- flexml/structures/supervised_base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py index 53cc310..78e7c82 100644 --- a/flexml/structures/supervised_base.py +++ b/flexml/structures/supervised_base.py @@ -747,6 +747,7 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int = Single or a list of top n models based on the evaluation metric or None If no models have been trained yet. """ if len(self.__model_training_info) == 0: + self.__logger.warning("No models have been trained yet, start an experiment first via start_experiment()") return None top_n_models = self.__top_n_models_checker(top_n_models) @@ -1266,7 +1267,10 @@ def highlight_best(s: pd.Series) -> list[str]: is_best = (s == s.max()) & (s != float('inf')) & (s != -1) return ['background-color: green' if v else '' for v in is_best] - + if len(self.__model_training_info) == 0: + self.__logger.warning("No models have been trained yet, start an experiment first via start_experiment()") + return None + if eval_metric is None: eval_metric = self.eval_metric @@ -1277,8 +1281,6 @@ def highlight_best(s: pd.Series) -> list[str]: else: eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric) - - sorted_model_stats_df = self.__sort_models(eval_metric) sorted_model_stats_df['Time (sec)'] = sorted_model_stats_df['Time (sec)'].apply(lambda x: f"{x:.2f}") sorted_model_stats_df.index += 1 From 89f5e4417063a9a36a981a3842c0a160725b1e6c Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 18:08:07 +0300 Subject: [PATCH 5/8] Improved if logic at evaluate_model_perf(), custom_score calculation --- flexml/helpers/supervised_helpers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py index 8bd7a70..0c66bd5 100644 --- a/flexml/helpers/supervised_helpers.py +++ b/flexml/helpers/supervised_helpers.py @@ -197,15 +197,15 @@ def evaluate_model_perf( eval_metric_name = custom_score.name # For Classification: handle proba vs labels - if ml_task_type == "Classification" and not custom_score.needs_proba: - y_pred_for_metric = y_pred_labels - elif ml_task_type == "Classification" and custom_score.needs_proba: - # Determine if binary or multiclass - if y_pred.shape[1] == 2: - y_pred_for_metric = y_pred[:, 1] + if ml_task_type == "Classification": + if custom_score.needs_proba: + if y_pred.shape[1] == 2: + y_pred_for_metric = y_pred[:, 1] + else: + y_pred_for_metric = y_pred else: - y_pred_for_metric = y_pred - else: + y_pred_for_metric = y_pred_labels + else: # Regression y_pred_for_metric = y_pred # Call custom function and add to standard metrics From 6de04e13aa9ecdfbd213af732d4f48caca4bc34b Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 18:08:48 +0300 Subject: [PATCH 6/8] Removed unused import --- flexml/helpers/supervised_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py index 0c66bd5..ea33ab3 100644 --- a/flexml/helpers/supervised_helpers.py +++ b/flexml/helpers/supervised_helpers.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from typing import Union, Optional, Callable +from typing import Union, Optional from flexml.structures.custom_score import CustomScore from sklearn.metrics import ( From 55bd7bce0a66ac17b34359b12bac0efedbda1a5b Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 18:13:53 +0300 Subject: [PATCH 7/8] Improved custom score class --- flexml/structures/custom_score.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/flexml/structures/custom_score.py b/flexml/structures/custom_score.py index cf99349..98ac50c 100644 --- a/flexml/structures/custom_score.py +++ b/flexml/structures/custom_score.py @@ -21,24 +21,24 @@ def __init__( if not isinstance(self.name, str) or not self.name.strip(): raise ValueError(f"name must be a non-empty string, got '{self.name}'") + if not isinstance(score_func, Callable): + raise ValueError(f"score_func must be a callable, got '{type(score_func)}'") + + if needs_proba is None or not isinstance(needs_proba, bool): + raise ValueError(f"needs_proba must be a boolean, got '{type(needs_proba)}'") + if direction not in ['maximize', 'minimize']: raise ValueError(f"direction must be either 'maximize' or 'minimize', got '{direction}'") - if needs_proba is None or not isinstance(needs_proba, bool): - raise ValueError(f"needs_proba must be a boolean, got '{needs_proba}'") - - try: - sig = inspect.signature(score_func) - params = list(sig.parameters.keys()) - - # Check if function has exactly 2 parameters - if len(params) != 2: - raise ValueError( - f"Custom evaluation function must have exactly 2 parameters (y_true, y_pred), " - f"but got {len(params)} parameters: {params}" - ) - except Exception as e: - raise ValueError(f"Error validating custom evaluation function: {str(e)}") + sig = inspect.signature(score_func) + params = list(sig.parameters.keys()) + + # Check if function has exactly 2 parameters + if len(params) != 2: + raise ValueError( + f"Custom evaluation function must have exactly 2 parameters (y_true, y_pred), " + f"but got {len(params)} parameters: {params}" + ) self.scorer = make_scorer( self.score_func, From 2abe8a40151d88ffd2f5c710f0658648287cd47d Mon Sep 17 00:00:00 2001 From: Ozgur Aslan Date: Sat, 3 Jan 2026 18:23:06 +0300 Subject: [PATCH 8/8] Passed custom score name as a argument in tests --- tests/test_custom_metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_custom_metrics.py b/tests/test_custom_metrics.py index b16772e..aaf1d98 100644 --- a/tests/test_custom_metrics.py +++ b/tests/test_custom_metrics.py @@ -490,6 +490,7 @@ def invalid_metric(y_true, y_pred, extra_param): model.start_experiment( experiment_size='quick', eval_metric=invalid_metric, + custom_metric_name='Invalid Metric', custom_metric_direction='maximize', custom_metric_needs_proba=False, cv_method='kfold',