From 738a4450ca6c690a7ab0ddc6c13f51c829b1c018 Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 17:44:16 +0300
Subject: [PATCH 1/8] Added custom eval metric feature

---
 flexml/_model_tuner.py               | 137 +++++---
 flexml/helpers/supervised_helpers.py |  67 +++-
 flexml/helpers/validators.py         |  18 +-
 flexml/structures/custom_score.py    |  53 +++
 flexml/structures/supervised_base.py | 124 +++++--
 tests/test_custom_metrics.py         | 506 +++++++++++++++++++++++++++
 6 files changed, 826 insertions(+), 79 deletions(-)
 create mode 100644 flexml/structures/custom_score.py
 create mode 100644 tests/test_custom_metrics.py

diff --git a/flexml/_model_tuner.py b/flexml/_model_tuner.py
index a874fc1..6153719 100644
--- a/flexml/_model_tuner.py
+++ b/flexml/_model_tuner.py
@@ -1,19 +1,19 @@
-import numpy as np
-import pandas as pd
-import optuna
+from time import time
 import joblib
 from joblib.parallel import BatchCompletionCallBack
 from contextlib import contextmanager
+from tqdm import tqdm
 from typing import Optional, Union
-from time import time
+import numpy as np
+import pandas as pd
 from sklearn.model_selection import ParameterGrid, GridSearchCV, RandomizedSearchCV
 from sklearn.pipeline import Pipeline
 from sklearn.base import clone
+import optuna
 from flexml.config import TUNING_METRIC_TRANSFORMATIONS
+from flexml.structures.custom_score import CustomScore
 from flexml.logger import get_logger
 from flexml.helpers import evaluate_model_perf
-from copy import deepcopy
-from tqdm import tqdm
 
 
 class TqdmBatchCompletionCallback(BatchCompletionCallBack):
@@ -233,8 +233,8 @@ def grid_search(
         self,
         pipeline: Pipeline,
         param_grid: dict,
-        eval_metric: str,
-        cv: list,
+        eval_metric: Union[str, CustomScore],
+        cv: list,   
         n_jobs: int = -1,
         verbose: int = 0
     ) -> Optional[dict]:
@@ -249,7 +249,7 @@ def grid_search(
         param_grid : dict
             The dictionary that contains the hyperparameters and their possible values
 
-        eval_metric : str
+        eval_metric : str or CustomScore
             The evaluation metric that will be used to evaluate the model. It can be one of the following:
             
             * 'R2' for R^2 score
@@ -270,6 +270,8 @@ def grid_search(
             
             * 'F1 Score' for F1 score
 
+            * Or a custom CustomScore object
+            
         cv : list of tuples
             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
             for the training and test sets for that fold. For example:
@@ -309,6 +311,18 @@ def grid_search(
         model_stats = self._setup_tuning("GridSearchCV", pipeline, param_grid, n_iter=None, n_jobs=n_jobs)
         param_grid = model_stats['tuning_param_grid']
         
+        # Handle custom metrics
+        is_custom_metric = isinstance(eval_metric, CustomScore)
+        if is_custom_metric:
+            eval_metric_name = eval_metric.name
+            custom_scorer = eval_metric.get_scorer()
+            scoring = {eval_metric_name: custom_scorer}
+            refit_metric = eval_metric_name
+        else:
+            eval_metric_name = eval_metric
+            scoring = self.eval_metrics_in_tuning_format
+            refit_metric = eval_metric
+        
         try:
             t_start = time()
             
@@ -321,8 +335,8 @@ def grid_search(
             search = GridSearchCV(
                 pipeline,
                 param_grid,
-                scoring=self.eval_metrics_in_tuning_format,
-                refit=eval_metric,
+                scoring=scoring,
+                refit=refit_metric,
                 cv=cv,
                 n_jobs=n_jobs,
                 verbose=verbose
@@ -339,21 +353,25 @@ def grid_search(
             t_end = time()
             time_taken = round(t_end - t_start, 2)
 
-            scores = {
-                metric: (
-                    -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
-                    if metric in self.reverse_signed_eval_metrics else
-                    search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
-                )
-                for metric in list(self.eval_metrics_in_tuning_format.keys())
-            }
+            if is_custom_metric:
+                mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_]
+                scores = {eval_metric_name: round(mean_score, 6)}
+            else:
+                scores = {
+                    metric: (
+                        -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
+                        if metric in self.reverse_signed_eval_metrics else
+                        search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
+                    )
+                    for metric in list(self.eval_metrics_in_tuning_format.keys())
+                }
+                mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_]
 
             model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model'] 
-            mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_]
             model_stats['tuned_model_score'] = round(mean_score, 6)
             model_stats['model_perf'] = scores
             model_stats['time_taken_sec'] = time_taken
-            model_stats['tuned_model_evaluation_metric'] = eval_metric
+            model_stats['tuned_model_evaluation_metric'] = eval_metric_name
             return model_stats
         except Exception as e:
             self.logger.error(f"Error while tuning the model with GridSearchCV, Error: {e}")
@@ -363,7 +381,7 @@ def randomized_search(
         self,
         pipeline: Pipeline,
         param_grid: dict,
-        eval_metric: str,
+        eval_metric: Union[str, CustomScore],
         cv: list,
         n_iter: int = 10,
         n_jobs: int = -1,
@@ -380,7 +398,7 @@ def randomized_search(
         param_grid : dict
             The dictionary that contains the hyperparameters and their possible values
 
-        eval_metric : str
+        eval_metric : str or CustomScore
             The evaluation metric that will be used to evaluate the model. It can be one of the following:
             
             * 'R2' for R^2 score
@@ -401,6 +419,8 @@ def randomized_search(
             
             * 'F1 Score' for F1 score
 
+            * Or a custom CustomScore object
+
         cv : list of tuples
             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
             for the training and test sets for that fold. For example:
@@ -432,6 +452,18 @@ def randomized_search(
         model_stats = self._setup_tuning("randomized_search", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs)
         param_grid = model_stats['tuning_param_grid']
 
+        # Handle custom metrics
+        is_custom_metric = isinstance(eval_metric, CustomScore)
+        if is_custom_metric:
+            eval_metric_name = eval_metric.name
+            custom_scorer = eval_metric.get_scorer()
+            scoring = {eval_metric_name: custom_scorer}
+            refit_metric = eval_metric_name
+        else:
+            eval_metric_name = eval_metric
+            scoring = self.eval_metrics_in_tuning_format
+            refit_metric = eval_metric
+
         t_start = time()
         
         # Calculate total fits
@@ -443,8 +475,8 @@ def randomized_search(
             estimator=pipeline,
             param_distributions=param_grid, 
             n_iter=n_iter,
-            scoring=self.eval_metrics_in_tuning_format, 
-            refit=eval_metric,
+            scoring=scoring, 
+            refit=refit_metric,
             cv=cv,
             n_jobs=n_jobs,
             verbose=verbose
@@ -461,21 +493,25 @@ def randomized_search(
         t_end = time()
         time_taken = round(t_end - t_start, 2)
 
-        scores = {
-            metric: (
-                -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
-                if metric in self.reverse_signed_eval_metrics else
-                search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
-            )
-            for metric in list(self.eval_metrics_in_tuning_format.keys())
-        }
+        if is_custom_metric:
+            mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_]
+            scores = {eval_metric_name: round(mean_score, 6)}
+        else:
+            scores = {
+                metric: (
+                    -search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
+                    if metric in self.reverse_signed_eval_metrics else
+                    search_result.cv_results_[f'mean_test_{metric}'][search_result.best_index_]
+                )
+                for metric in list(self.eval_metrics_in_tuning_format.keys())
+            }
+            mean_score = search_result.cv_results_[f'mean_test_{eval_metric_name}'][search_result.best_index_]
 
         model_stats['tuned_model'] = search_result.best_estimator_.named_steps['model']
-        mean_score = search_result.cv_results_[f'mean_test_{eval_metric}'][search_result.best_index_]
         model_stats['tuned_model_score'] = round(mean_score, 6)
         model_stats['model_perf'] = scores
         model_stats['time_taken_sec'] = time_taken
-        model_stats['tuned_model_evaluation_metric'] = eval_metric
+        model_stats['tuned_model_evaluation_metric'] = eval_metric_name
         return model_stats
 
         
@@ -483,7 +519,7 @@ def optuna_search(
         self,
         pipeline: Pipeline,
         param_grid: dict,
-        eval_metric: str,
+        eval_metric: Union[str, CustomScore],
         cv: list,
         n_iter: int = 10,
         timeout: Optional[int] = None,
@@ -501,7 +537,7 @@ def optuna_search(
         param_grid : dict
             The dictionary that contains the hyperparameters and their possible values
 
-        eval_metric : str
+        eval_metric : str or CustomScore
             The evaluation metric that will be used to evaluate the model. It can be one of the following:
             
             * 'R2' for R^2 score
@@ -522,6 +558,8 @@ def optuna_search(
             
             * 'F1 Score' for F1 score
 
+            * Or a custom CustomScore object
+
         cv : list of tuples
             A list of (train_idx, test_idx) tuples where each tuple contains numpy arrays of indices
             for the training and test sets for that fold. For example:
@@ -571,6 +609,15 @@ def optuna_search(
         model_stats = self._setup_tuning("optuna", pipeline, param_grid, n_iter=n_iter, n_jobs=n_jobs, prefix_param_grid_flag=False)
         param_grid = model_stats['tuning_param_grid']
 
+        # Handle custom metrics
+        is_custom_metric = isinstance(eval_metric, CustomScore)
+        if is_custom_metric:
+            eval_metric_name = eval_metric.name
+            study_direction = eval_metric.direction
+        else:
+            eval_metric_name = eval_metric
+            study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'] else "minimize"
+
         # Set verbosity levels
         if verbose == 0:
             optuna.logging.set_verbosity(optuna.logging.CRITICAL)
@@ -583,8 +630,6 @@ def optuna_search(
         elif verbose == 4:
             optuna.logging.set_verbosity(optuna.logging.DEBUG)
 
-        study_direction = "maximize" if eval_metric in ['R2', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'] else "minimize"
-
         def objective(trial):
             # Generate parameters for the trial
             params = pipeline.named_steps['model'].get_params()
@@ -617,17 +662,25 @@ def objective(trial):
 
                 new_pipeline.fit(X_train, y_train)
 
+                # Get predictions based on whether we need probabilities or labels
                 if self.ml_problem_type == "Classification" and hasattr(new_pipeline, 'predict_proba'):
                     y_pred = new_pipeline.predict_proba(X_test)
                 else:   
                     y_pred = new_pipeline.predict(X_test)
 
-                # Evaluate performance
-                scores.append(evaluate_model_perf(self.ml_problem_type, y_test, y_pred))
+                if is_custom_metric:
+                    scores.append(evaluate_model_perf(
+                        self.ml_problem_type, 
+                        y_test, 
+                        y_pred,
+                        custom_score=eval_metric
+                    ))
+                else:
+                    scores.append(evaluate_model_perf(self.ml_problem_type, y_test, y_pred))
 
             # Calculate the mean score across all folds
             avg_metrics = {k: np.mean([m[k] if m[k] is not None else -1 for m in scores]) for k in scores[0]}
-            mean_score = avg_metrics.get(eval_metric, float('inf'))
+            mean_score = avg_metrics.get(eval_metric_name, float('inf'))
 
             # Update the best score and model
             if model_stats['tuned_model_score'] is None or (study_direction == "maximize" and mean_score > model_stats['tuned_model_score']) or (study_direction == "minimize" and mean_score < model_stats['tuned_model_score']):
diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py
index 1393571..06308b5 100644
--- a/flexml/helpers/supervised_helpers.py
+++ b/flexml/helpers/supervised_helpers.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
-from typing import Union
+from typing import Union, Optional, Callable
+from flexml.structures.custom_score import CustomScore
 
 from sklearn.metrics import (
     r2_score, 
@@ -10,7 +11,8 @@
     precision_score,
     recall_score,
     f1_score,
-    roc_auc_score)
+    roc_auc_score
+)
 
 
 def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float:
@@ -36,8 +38,8 @@ def _safe_mape(y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np
 def _evaluate_preds(
     y_true: Union[pd.Series, np.ndarray],
     y_pred: Union[pd.Series, np.ndarray],
-    eval_metric: str,
-    average: str = 'macro'
+    eval_metric: Union[str, CustomScore],
+    average: Optional[str] = 'macro'
 ) -> float:
     """
     Evaluates the model with the given evaluation metric by using the test set
@@ -50,7 +52,7 @@ def _evaluate_preds(
     y_pred : pd.Series or np.ndarray
         The predicted values/probabilities of the target column
 
-    eval_metric : str
+    eval_metric : str or CustomScore
         The evaluation metric that will be used to evaluate the model   
                  
         - Avaiable evalulation metrics for Regression:    
@@ -59,6 +61,8 @@ def _evaluate_preds(
         - Avaiable evalulation metrics for Classification:    
             - Accuracy, Precision, Recall, F1 Score, ROC-AUC
         
+        - Or a custom CustomScore object
+        
     average : str, default='macro'
         The averaging method to use for multiclass classification metrics.
         Options are ['binary', 'micro', 'macro', 'weighted'].
@@ -70,6 +74,14 @@ def _evaluate_preds(
     float
         The evaluation metric score for the desired eval metric
     """
+    # Handle custom callable metrics
+    if isinstance(eval_metric, CustomScore):
+        try:
+            return round(float(eval_metric(y_true, y_pred)), 6)
+        except Exception as e:
+            raise ValueError(f"Error while evaluating with custom score: {str(e)}")
+    
+    # Handle standard string-based metrics
     if eval_metric == 'R2':
         return round(r2_score(y_true, y_pred), 6)
     elif eval_metric == 'MAE':
@@ -102,7 +114,8 @@ def _evaluate_preds(
 def evaluate_model_perf(
     ml_task_type, 
     y_test,
-    y_pred
+    y_pred,
+    custom_score: Optional[CustomScore] = None
 ) -> dict:
     """
     Evaluates how good are the predictions by comparing them with the actual values, returns regression evaluation scores
@@ -120,23 +133,30 @@ def evaluate_model_perf(
         For classification tasks: The predicted probabilities for each class.
         Note: Some models like Perceptron, PassiveAggressiveClassifier, etc. don't have predict_proba method, so they return class labels directly.
     
+    custom_score : CustomScore, optional (default=None)
+        A custom score object with signature: func(y_true, y_pred) -> float
+        If provided, this score will be calculated in addition to standard metrics
+
     Returns
     -------
     dict
-        A dictionary containing the evaluation metric of the current task
+        A dictionary containing the evaluation metrics of the current task
             
             * R2, MAE, MSE, RMSE, MAPE for Regression tasks
 
             * Accuracy, Precision, Recall, F1 Score, ROC-AUC for Classification tasks
+            
+            * Plus the custom eval metric if custom_score is provided
     """
-
+    # Standard metric evaluation
+    standard_metrics = {}
     if ml_task_type == "Regression":
         r2 = _evaluate_preds(y_test, y_pred, 'R2')
         mae = _evaluate_preds(y_test, y_pred, 'MAE')
         mse = _evaluate_preds(y_test, y_pred, 'MSE')
         rmse = _evaluate_preds(y_test, y_pred, 'RMSE')
         mape = _evaluate_preds(y_test, y_pred, 'MAPE')
-        return {
+        standard_metrics = {
             "R2": r2,
             "MAE": mae,
             "MSE": mse,
@@ -164,10 +184,35 @@ def evaluate_model_perf(
         # Use probabilities for ROC-AUC
         roc_auc = _evaluate_preds(y_test, y_pred, 'ROC-AUC', average=avg_method)
         
-        return {
+        standard_metrics = {
             "Accuracy": accuracy,
             "Precision": precision,
             "Recall": recall,
             "F1 Score": f1,
             "ROC-AUC": roc_auc
-        }
\ No newline at end of file
+        }
+    
+    # If custom metric is provided, calculate it and add to standard metrics
+    if custom_score is not None:
+        eval_metric_name = custom_score.name
+        
+        # For Classification: handle proba vs labels
+        if ml_task_type == "Classification" and not custom_score.needs_proba:
+            y_pred_for_metric = y_pred_labels
+        elif ml_task_type == "Classification" and custom_score.needs_proba:
+            # Determine if binary or multiclass
+            if y_pred.shape[1] == 2:
+                y_pred_for_metric = y_pred[:, 1]
+            else:
+                y_pred_for_metric = y_pred
+        else:
+            y_pred_for_metric = y_pred
+
+        # Call custom function and add to standard metrics
+        try:
+            score = _evaluate_preds(y_test, y_pred_for_metric, custom_score)
+            standard_metrics[eval_metric_name] = round(score, 6)
+        except Exception as e:
+            raise ValueError(f"Error while evaluating with custom eval metric '{eval_metric_name}': {str(e)}")
+
+    return standard_metrics
\ No newline at end of file
diff --git a/flexml/helpers/validators.py b/flexml/helpers/validators.py
index 0176cb8..2f95fe9 100644
--- a/flexml/helpers/validators.py
+++ b/flexml/helpers/validators.py
@@ -1,15 +1,17 @@
 import pandas as pd
-from typing import Optional, List
+from typing import Optional, List, Union
 from flexml.config import EVALUATION_METRICS, FEATURE_ENGINEERING_METHODS, CROSS_VALIDATION_METHODS
 from flexml.logger import get_logger
 import re
+from flexml.structures.custom_score import CustomScore
+
 
 def eval_metric_checker(
     ml_task_type: str,
-    eval_metric: Optional[str] = None,
+    eval_metric: Optional[Union[str, CustomScore]] = None,
     all_evaluation_metrics: Optional[List[str]] = None,
     default_evaluation_metric: Optional[str] = None
-) -> str:
+) -> Union[str, CustomScore]:
     """
     Since eval_metric setting and validation is a common process for both Regression and Classification tasks...
     this method is used to set and validate the evaluation metric.
@@ -19,7 +21,7 @@ def eval_metric_checker(
     ml_task_type : str
         The type of ML task ('Regression' or 'Classification')
 
-    eval_metric : str, optional (default='R2' for Regression, 'Accuracy' for Classification)
+    eval_metric : str or CustomScore, optional (default='R2' for Regression, 'Accuracy' for Classification)
         The evaluation metric to use for model evaluation
 
         - Avaiable evalulation metrics for Regression:    
@@ -27,6 +29,8 @@ def eval_metric_checker(
 
         - Avaiable evalulation metrics for Classification:    
             - Accuracy, Precision, Recall, F1 Score, ROC-AUC
+        
+        - Or a custom CustomScore object
     
     all_evaluation_metrics : List[str], (default=None)
         All possible evaluation metrics for the current task (Regression or Classification), e.g. ['R2', 'MAE', 'MSE', 'RMSE', 'MAPE'] for Regression
@@ -40,11 +44,15 @@ def eval_metric_checker(
 
     Returns
     -------
-    str
+    str or CustomScore
         The evaluation metric to use for model evaluation for the current task (Regression or Classification)
     """
     logger = get_logger(__name__, "PROD", False)
+
+    if isinstance(eval_metric, CustomScore):
+        return eval_metric
     
+    # Standard string-based metric validation
     if default_evaluation_metric is None or all_evaluation_metrics is None:
         default_evaluation_metric = EVALUATION_METRICS[ml_task_type]["DEFAULT"]
         all_evaluation_metrics = EVALUATION_METRICS[ml_task_type]["ALL"]
diff --git a/flexml/structures/custom_score.py b/flexml/structures/custom_score.py
new file mode 100644
index 0000000..5e4688c
--- /dev/null
+++ b/flexml/structures/custom_score.py
@@ -0,0 +1,92 @@
+from typing import Union, Callable
+import inspect
+import numpy as np
+import pandas as pd
+from sklearn.metrics import make_scorer
+
+
+class CustomScore:
+    """
+    Wraps a user-supplied metric function so it can be used as an evaluation metric
+
+    Parameters
+    ----------
+    name : str
+        The name of the metric. If None, falls back to score_func's __name__
+        (or to repr(score_func) when it has no __name__, e.g. functools.partial objects)
+
+    score_func : Callable
+        The metric function, called as score_func(y_true, y_pred). Must take exactly 2 parameters
+
+    needs_proba : bool
+        If True, the scikit-learn scorer feeds predict_proba output to score_func, otherwise predict output
+
+    direction : str
+        Either 'maximize' or 'minimize'. 'maximize' maps to greater_is_better=True in the scorer
+
+    Raises
+    ------
+    ValueError
+        If direction or needs_proba is invalid, or if score_func can't be inspected
+        or doesn't take exactly 2 parameters
+    """
+    def __init__(
+        self,
+        name: str,
+        score_func: Callable,
+        needs_proba: bool,
+        direction: str
+    ):
+        if direction not in ['maximize', 'minimize']:
+            raise ValueError(f"direction must be either 'maximize' or 'minimize', got '{direction}'")
+
+        if needs_proba is None or not isinstance(needs_proba, bool):
+            raise ValueError(f"needs_proba must be a boolean, got '{needs_proba}'")
+
+        try:
+            sig = inspect.signature(score_func)
+            params = list(sig.parameters.keys())
+
+            # Check if function has exactly 2 parameters
+            if len(params) != 2:
+                raise ValueError(
+                    f"Custom evaluation function must have exactly 2 parameters (y_true, y_pred), "
+                    f"but got {len(params)} parameters: {params}"
+                )
+        except Exception as e:
+            raise ValueError(f"Error validating custom evaluation function: {str(e)}") from e
+
+        # Fall back to the function's own name so the metric always has a usable (non-None) key
+        self.name = name if name is not None else getattr(score_func, '__name__', repr(score_func))
+        self.score_func = score_func
+        self.needs_proba = needs_proba
+        self.direction = direction
+
+        greater_is_better = self.direction == 'maximize'
+        # make_scorer's `needs_proba` was deprecated in scikit-learn 1.4 and removed in 1.6
+        # in favor of `response_method`, so use whichever keyword the installed version accepts
+        if 'response_method' in inspect.signature(make_scorer).parameters:
+            self.scorer = make_scorer(
+                self.score_func,
+                response_method='predict_proba' if self.needs_proba else 'predict',
+                greater_is_better=greater_is_better
+            )
+        else:
+            self.scorer = make_scorer(
+                self.score_func,
+                needs_proba=self.needs_proba,
+                greater_is_better=greater_is_better
+            )
+
+    def __call__(self, y_true: Union[pd.Series, np.ndarray], y_pred: Union[pd.Series, np.ndarray]) -> float:
+        """Returns score_func(y_true, y_pred) unchanged"""
+        return self.score_func(y_true, y_pred)
+
+    def __repr__(self):
+        # Not every callable has __name__ (e.g. functools.partial), so fall back to its repr
+        func_name = getattr(self.score_func, '__name__', repr(self.score_func))
+        return f"CustomScore(name={self.name}, score_func={func_name}, needs_proba={self.needs_proba}, direction={self.direction})"
+
+    def get_scorer(self):
+        """Returns the scikit-learn scorer built in __init__"""
+        return self.scorer
\ No newline at end of file
diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py
index 70a48bd..4969280 100644
--- a/flexml/structures/supervised_base.py
+++ b/flexml/structures/supervised_base.py
@@ -4,7 +4,7 @@
 from collections import defaultdict
 from copy import deepcopy
 from time import time
-from typing import Union, Optional, List, Dict
+from typing import Union, Optional, List, Dict, Callable
 from tqdm import tqdm
 from rich.console import Console
 from rich.table import Table
@@ -32,6 +32,7 @@
     plot_prediction_error,
     plot_calibration_curve
 )
+from flexml.structures.custom_score import CustomScore
 from flexml._model_tuner import ModelTuner
 from flexml._feature_engineer import FeatureEngineering
 
@@ -405,7 +406,16 @@ def __process_experiment_result(self, experiment_stats: dict):
                 for key, value in aggregated_metrics.items()
             }
             
-            best_model_entry = max(model_entries, key=lambda x: x["model_stats"][self.eval_metric])
+            # Get the metric name to use for selecting best model
+            metric_key = self.eval_metric.name if self._is_custom_metric else self.eval_metric
+            
+            # For minimize metrics, select the min instead of max
+            if self._is_custom_metric and self.eval_metric.direction == 'minimize':
+                best_model_entry = min(model_entries, key=lambda x: x["model_stats"][metric_key])
+            elif not self._is_custom_metric and self.__ML_TASK_TYPE == "Regression" and metric_key in ['MAE', 'MSE', 'RMSE', 'MAPE']:
+                best_model_entry = min(model_entries, key=lambda x: x["model_stats"][metric_key])
+            else:
+                best_model_entry = max(model_entries, key=lambda x: x["model_stats"][metric_key])
             
             self.__model_training_info.append({
                 model_name: {
@@ -420,7 +430,10 @@ def start_experiment(
         cv_method: Optional[str] = None,
         n_folds: Optional[int] = None,
         test_size: Optional[float] = None,
-        eval_metric: Optional[str] = None,
+        eval_metric: Optional[Union[str, Callable]] = None,
+        custom_metric_name: Optional[str] = None,
+        custom_metric_needs_proba: Optional[bool] = None,
+        custom_metric_direction: Optional[str] = None,
         random_state: Optional[int] = 42,
         groups_col: Optional[str] = None,
         n_jobs: Optional[int] = -1
@@ -460,7 +473,7 @@ def start_experiment(
         test_size : float, (default=0.25 for hold-out cv, None for other methods)
             The size of the test data if using hold-out or shuffle-based splits
 
-        eval_metric : str, optional (default='R2' for Regression, 'Accuracy' for Classification)
+        eval_metric : str or callable, optional (default='R2' for Regression, 'Accuracy' for Classification)
             The evaluation metric to use for model evaluation
             
             - Avaiable evalulation metrics for Regression:    
@@ -468,6 +481,21 @@ def start_experiment(
 
             - Avaiable evalulation metrics for Classification:    
                 - Accuracy, Precision, Recall, F1 Score, ROC-AUC
+            
+            - Or a custom callable function with signature: func(y_true, y_pred) -> float
+
+        custom_metric_name : str, optional (default=None)
+            The name to use for the custom metric. If None and eval_metric is callable,
+            uses the function's __name__ attribute
+        
+        custom_metric_needs_proba : bool, optional (default=None)
+            Must be set to True or False when eval_metric is a callable (CustomScore raises a ValueError for None).
+            For classification tasks: if True, probabilities are passed to the custom metric function; if False, class labels are passed.
+            Ignored for regression tasks (no error raised)
+        
+        custom_metric_direction : str, optional (default=None)
+            Direction for optimizing the custom metric. Must be 'maximize' or 'minimize' when eval_metric is a callable.
+            Used for sorting models in the leaderboard when using custom metrics
 
         random_state : int, optional (default=None)
             The random state value for the model training process
@@ -485,7 +513,20 @@ def start_experiment(
         - Defaults to a standard 5-fold if neither `n_folds` nor `test_size` is provided
         """
         experiment_size = experiment_size.lower() # Convert to lowercase in case of any case mismatch
-        self.eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
+        if isinstance(eval_metric, Callable):
+            self.eval_metric = CustomScore(
+                name=custom_metric_name,
+                score_func=eval_metric,
+                needs_proba=custom_metric_needs_proba,
+                direction=custom_metric_direction
+            )
+            self._is_custom_metric = True
+            self._custom_metric_name = custom_metric_name
+        else:
+            self.eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
+            self._is_custom_metric = False
+            self._custom_metric_name = None
+
         random_state = random_state_checker(random_state)
 
         # Check cross-validation method params
@@ -611,7 +652,12 @@ def start_experiment(
                         else:
                             y_pred = model.predict(X_test)
 
-                        model_perf = evaluate_model_perf(self.__ML_TASK_TYPE, y_test, y_pred)
+                        model_perf = evaluate_model_perf(
+                            self.__ML_TASK_TYPE,
+                            y_test,
+                            y_pred,
+                            custom_score=self.eval_metric if self._is_custom_metric else None
+                        )
 
                         all_metrics.append(model_perf)
                         all_times.append(time_taken)
@@ -643,10 +689,11 @@ def start_experiment(
                         pbar.update(1)
 
         self.__process_experiment_result(all_model_stats)
-
         self.__logger.info("[PROCESS] Model training is finished!")
-        self.get_best_models(eval_metric)
-        self.show_model_stats(eval_metric)
+
+        display_metric = self._custom_metric_name if self._is_custom_metric else eval_metric
+        self.get_best_models(display_metric)
+        self.show_model_stats(display_metric)
 
     def get_model_by_name(self, model_name: str) -> object:
         """
@@ -704,9 +751,15 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int =
         
         top_n_models = self.__top_n_models_checker(top_n_models)
 
-        if eval_metric is None and hasattr(self, 'eval_metric'):
+        if eval_metric is None:
             eval_metric = self.eval_metric
-        eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
+        
+        if isinstance(eval_metric, CustomScore):
+            eval_metric = eval_metric.name
+        elif isinstance(eval_metric, str) and self._is_custom_metric and eval_metric == self.eval_metric.name:
+            eval_metric = self.eval_metric.name
+        else:
+            eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
         
         model_stats = []
         best_models = []
@@ -1158,11 +1211,15 @@ def __sort_models(self, eval_metric: Optional[str] = None):
             self.__logger.error(error_msg)
             raise ValueError(error_msg)
         
-        # Since lower is better for mae, mse and rmse in Regression tasks, they should be sorted in ascending order
-        if self.__ML_TASK_TYPE == "Regression" and eval_metric in ['MAE', 'MSE', 'RMSE', 'MAPE']:
-            return self._model_stats_df.sort_values(by=eval_metric, ascending=True).reset_index(drop = True)
+        # Determine sort direction
+        if self._is_custom_metric and eval_metric == self._custom_metric_name:
+            ascending = (self.eval_metric.direction == 'minimize')
+        elif self.__ML_TASK_TYPE == "Regression" and eval_metric in ['MAE', 'MSE', 'RMSE', 'MAPE']:
+            ascending = True
         else:
-            return self._model_stats_df.sort_values(by=eval_metric, ascending=False).reset_index(drop = True)
+            ascending = False # F1, ROC-AUC, R2, Accuracy, etc.
+            
+        return self._model_stats_df.sort_values(by=eval_metric, ascending=ascending).reset_index(drop = True)
 
     def show_model_stats(self, eval_metric: Optional[str] = None):
         """
@@ -1193,7 +1250,12 @@ def highlight_best(s: pd.Series) -> list[str]:
             list[str]
                 A list of strings containing the green background color for the best value so we can highlight it while showing the model stats
             """
-            if s.name in ['MAE', 'MSE', 'RMSE', 'MAPE']:
+            # Only the custom metric's own column follows the custom direction; all other columns keep their defaults
+            is_custom_minimize = (self._is_custom_metric and s.name == self._custom_metric_name and
+                                 self.eval_metric.direction == 'minimize')
+            
+            # Determine if we should minimize or maximize
+            if s.name in ['MAE', 'MSE', 'RMSE', 'MAPE'] or is_custom_minimize:
                 s_nonneg = s.where(s >= 0, np.nan)
                 best_val = s_nonneg.min()
                 if best_val == float('inf'):
@@ -1205,9 +1267,17 @@ def highlight_best(s: pd.Series) -> list[str]:
             return ['background-color: green' if v else '' for v in is_best]
         
         
-        if eval_metric is None and hasattr(self, 'eval_metric'):
+        if eval_metric is None:
             eval_metric = self.eval_metric
-        eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
+        
+        if isinstance(eval_metric, CustomScore):
+            eval_metric = eval_metric.name
+        elif isinstance(eval_metric, str) and self._is_custom_metric and eval_metric == self.eval_metric.name:
+            eval_metric = self.eval_metric.name
+        else:
+            eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
+
+        
 
         sorted_model_stats_df = self.__sort_models(eval_metric)
         sorted_model_stats_df['Time (sec)'] = sorted_model_stats_df['Time (sec)'].apply(lambda x: f"{x:.2f}")
@@ -1225,7 +1295,14 @@ def highlight_best(s: pd.Series) -> list[str]:
             if len(sorted_model_stats_df) < 2:
                 display(sorted_model_stats_df)
             else:
-                styler = sorted_model_stats_df.style.apply(highlight_best, subset=self.__ALL_EVALUATION_METRICS)
+                # Determine which columns to highlight - always include all standard metrics
+                highlight_columns = self.__ALL_EVALUATION_METRICS.copy()
+                
+                # Add custom metric column if present
+                if self._is_custom_metric:
+                    highlight_columns.append(self._custom_metric_name)
+                
+                styler = sorted_model_stats_df.style.apply(highlight_best, subset=highlight_columns)
                 display(styler) # display is only supported in interactive kernels such as Jupyter Notebook/Google Colab
 
     def tune_model(
@@ -1416,9 +1493,14 @@ def _show_tuning_report(tuning_report: Optional[dict] = None):
             self.__logger.error(error_msg)
             raise ValueError(error_msg)
         
-        if eval_metric is None and hasattr(self, 'eval_metric'):
+        # If no eval_metric provided, use the one from start_experiment
+        if eval_metric is None:
             eval_metric = self.eval_metric
-        eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
+
+        if isinstance(eval_metric, str) and self._is_custom_metric and eval_metric == self.eval_metric.name:
+            eval_metric = self.eval_metric
+        else:
+            eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
         
         # If the user doesn't pass any cross-validation method params, use the last used ones
         if (
diff --git a/tests/test_custom_metrics.py b/tests/test_custom_metrics.py
new file mode 100644
index 0000000..b16772e
--- /dev/null
+++ b/tests/test_custom_metrics.py
@@ -0,0 +1,506 @@
+import unittest
+import numpy as np
+from sklearn.datasets import load_diabetes, load_iris, load_breast_cancer
+from flexml import Regression, Classification
+from flexml.logger import get_logger
+import warnings
+warnings.filterwarnings("ignore")
+
+
+class TestCustomMetrics(unittest.TestCase):
+    """Test suite for custom scoring function feature"""
+    
+    logger = get_logger(__name__, "TEST")
+    logger.setLevel("DEBUG")
+    
+    # =========================================================================
+    # Custom Metric Functions
+    # =========================================================================
+    
+    @staticmethod
+    def custom_mse_doubled(y_true, y_pred):
+        """Custom MSE metric multiplied by 2 (for regression)"""
+        mse = np.mean((y_true - y_pred) ** 2)
+        return mse * 2
+    
+    @staticmethod
+    def custom_accuracy_halved(y_true, y_pred_labels):
+        """Custom accuracy metric divided by 2 (for classification with labels)"""
+        accuracy = np.mean(y_true == y_pred_labels)
+        return accuracy / 2
+    
+    @staticmethod
+    def custom_roc_auc_doubled(y_true, y_proba):
+        """Custom ROC-AUC metric multiplied by 2 (for classification with probabilities)
+        
+        Note: For binary classification, y_proba will be 1D (only positive class probabilities)
+        For multiclass classification, y_proba will be 2D (all class probabilities)
+        """
+        from sklearn.metrics import roc_auc_score
+        
+        # Handle multiclass case (2D array with >2 classes)
+        if len(y_proba.shape) > 1 and y_proba.shape[1] > 2:
+            # Multiclass: use OVR strategy
+            roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
+        else:
+            # Binary classification: y_proba is already 1D with positive class probabilities,
+            # OR it's a 2D array with shape (n_samples, 1) or (n_samples, 2)
+            if len(y_proba.shape) > 1:
+                y_proba = y_proba[:, -1]  # Keep only the positive-class (last) column; ravel() would break on (n, 2)
+            roc_auc = roc_auc_score(y_true, y_proba)
+        
+        return roc_auc * 2
+    
+    # =========================================================================
+    # Test 1: Regression with Custom Metric
+    # =========================================================================
+    
+    def test_01_regression_custom_metric(self):
+        """Test regression task with custom metric (MSE * 2)"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 1: Regression with Custom Metric (MSE * 2)")
+        self.logger.info("=" * 80)
+        
+        # Load diabetes dataset
+        df = load_diabetes(as_frame=True)['frame']
+        
+        # Create regression model
+        model = Regression(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_mse_doubled,
+            custom_metric_name='Custom MSE x2',
+            custom_metric_direction='minimize',
+            custom_metric_needs_proba=False,
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=1
+        )
+        
+        # Verify custom metric was used
+        self.assertIsNotNone(model._model_stats_df)
+        self.assertIn('Custom MSE x2', model._model_stats_df.columns)
+        
+        # Verify metric values are reasonable (should be roughly 2x normal MSE)
+        # Normal MSE for this dataset is around 3000-6000, so doubled should be 6000-12000
+        custom_metric_values = model._model_stats_df['Custom MSE x2'].values
+        self.assertTrue(all(val > 0 for val in custom_metric_values), 
+                       "Custom metric values should be positive")
+        self.assertTrue(all(val > 1000 for val in custom_metric_values),
+                       "Custom MSE x2 values should be reasonably large")
+        
+        # Verify model selection works
+        best_model = model.get_best_models()
+        self.assertIsNotNone(best_model)
+        
+        self.logger.info("✓ Regression with custom metric test PASSED")
+    
+    # =========================================================================
+    # Test 2: Binary Classification with Custom Metric (Labels)
+    # =========================================================================
+    
+    def test_02_binary_classification_custom_metric_labels(self):
+        """Test binary classification with custom metric for labels (Accuracy / 2)"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 2: Binary Classification with Custom Metric (Accuracy / 2)")
+        self.logger.info("=" * 80)
+        
+        # Load breast cancer dataset
+        df = load_breast_cancer(as_frame=True)['frame']
+        
+        # Create classification model
+        model = Classification(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric that needs labels
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_accuracy_halved,
+            custom_metric_name='Custom Accuracy / 2',
+            custom_metric_needs_proba=False,  # Pass labels, not probabilities
+            custom_metric_direction='maximize',
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=1
+        )
+        
+        # Verify custom metric was used
+        self.assertIsNotNone(model._model_stats_df)
+        self.assertIn('Custom Accuracy / 2', model._model_stats_df.columns)
+        
+        # Verify metric values are in expected range (0 to 0.5 since accuracy/2)
+        custom_metric_values = model._model_stats_df['Custom Accuracy / 2'].values
+        self.assertTrue(all(0 <= val <= 0.5 for val in custom_metric_values),
+                       "Custom Accuracy / 2 should be between 0 and 0.5")
+        self.assertTrue(all(val > 0.2 for val in custom_metric_values),
+                       "Custom Accuracy / 2 should be reasonably high (> 0.2)")
+        
+        # Verify model selection works
+        best_model = model.get_best_models()
+        self.assertIsNotNone(best_model)
+        
+        # Verify that the custom metric parameters were stored
+        self.assertTrue(model._is_custom_metric)
+        self.assertEqual(model.eval_metric.name, 'Custom Accuracy / 2')
+        self.assertEqual(model.eval_metric.needs_proba, False)
+        self.assertEqual(model.eval_metric.direction, 'maximize')
+        
+        self.logger.info("✓ Binary classification with custom metric (labels) test PASSED")
+    
+    # =========================================================================
+    # Test 3: Binary Classification with Custom Metric (Probabilities)
+    # =========================================================================
+    
+    def test_03_binary_classification_custom_metric_proba(self):
+        """Test binary classification with custom metric for probabilities (ROC-AUC * 2)"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 3: Binary Classification with Custom Metric (ROC-AUC * 2)")
+        self.logger.info("=" * 80)
+        
+        # Load breast cancer dataset
+        df = load_breast_cancer(as_frame=True)['frame']
+        
+        # Create classification model
+        model = Classification(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric that needs probabilities
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_roc_auc_doubled,
+            custom_metric_name='Custom ROC-AUC x2',
+            custom_metric_needs_proba=True,  # Pass probabilities
+            custom_metric_direction='maximize',
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=1
+        )
+        
+        # Verify custom metric was used
+        self.assertIsNotNone(model._model_stats_df)
+        self.assertIn('Custom ROC-AUC x2', model._model_stats_df.columns)
+        
+        # Verify metric values are in expected range (0 to 2 since ROC-AUC*2)
+        custom_metric_values = model._model_stats_df['Custom ROC-AUC x2'].values
+        self.assertTrue(all(0 <= val <= 2 for val in custom_metric_values),
+                       "Custom ROC-AUC x2 should be between 0 and 2")
+        self.assertTrue(all(val > 1.0 for val in custom_metric_values),
+                       "Custom ROC-AUC x2 should be reasonably high (> 1.0)")
+        
+        # Verify model selection works
+        best_model = model.get_best_models()
+        self.assertIsNotNone(best_model)
+        
+        # Verify that the custom metric parameters were stored
+        self.assertTrue(model._is_custom_metric)
+        self.assertEqual(model.eval_metric.name, 'Custom ROC-AUC x2')
+        self.assertEqual(model.eval_metric.needs_proba, True)
+        self.assertEqual(model.eval_metric.direction, 'maximize')
+        
+        self.logger.info("✓ Binary classification with custom metric (probabilities) test PASSED")
+    
+    # =========================================================================
+    # Test 4: Multiclass Classification with Custom Metric (Probabilities)
+    # =========================================================================
+    
+    def test_04_multiclass_classification_custom_metric_proba(self):
+        """Test multiclass classification with custom metric for probabilities (ROC-AUC * 2)"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 4: Multiclass Classification with Custom Metric (ROC-AUC * 2)")
+        self.logger.info("=" * 80)
+        
+        # Load iris dataset
+        df = load_iris(as_frame=True)['frame']
+        
+        # Create classification model
+        model = Classification(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric that needs probabilities
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_roc_auc_doubled,
+            custom_metric_name='Custom ROC-AUC x2',
+            custom_metric_needs_proba=True,  # Pass probabilities
+            custom_metric_direction='maximize',
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=1
+        )
+        
+        # Verify custom metric was used
+        self.assertIsNotNone(model._model_stats_df)
+        self.assertIn('Custom ROC-AUC x2', model._model_stats_df.columns)
+        
+        # Verify metric values are in expected range
+        custom_metric_values = model._model_stats_df['Custom ROC-AUC x2'].values
+        self.assertTrue(all(0 <= val <= 2 for val in custom_metric_values),
+                       "Custom ROC-AUC x2 should be between 0 and 2")
+        self.assertTrue(all(val > 1.0 for val in custom_metric_values),
+                       "Custom ROC-AUC x2 should be reasonably high (> 1.0)")
+        
+        # Verify model selection works
+        best_model = model.get_best_models()
+        self.assertIsNotNone(best_model)
+        
+        self.logger.info("✓ Multiclass classification with custom metric (probabilities) test PASSED")
+    
+    # =========================================================================
+    # Test 5: Tuning with Custom Metric (Regression)
+    # =========================================================================
+    
+    def test_05_tuning_with_custom_metric_regression(self):
+        """Test model tuning with custom metric in regression"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 5: Tuning with Custom Metric (Regression)")
+        self.logger.info("=" * 80)
+        
+        # Load diabetes dataset
+        df = load_diabetes(as_frame=True)['frame']
+        
+        # Create regression model
+        model = Regression(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_mse_doubled,
+            custom_metric_name='Custom MSE x2',
+            custom_metric_direction='minimize',
+            custom_metric_needs_proba=False,
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=1
+        )
+        
+        # Tune the best model with custom metric (should inherit parameters)
+        model.tune_model(
+            tuning_method='randomized_search',
+            n_iter=2,  # Small number for quick test
+            n_jobs=-1,
+            verbose=0
+        )
+        
+        # Verify tuning completed and custom metric was used
+        # Check that the leaderboard has a tuned model
+        self.assertIsNotNone(model._model_stats_df)
+        tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('randomized_search', case=False)]
+        self.assertTrue(len(tuned_models) > 0, "Tuned model should be added to leaderboard")
+        
+        self.logger.info("✓ Tuning with custom metric (regression) test PASSED")
+    
+    # =========================================================================
+    # Test 6: Tuning with Custom Metric (Classification)
+    # =========================================================================
+    
+    def test_06_tuning_with_custom_metric_classification(self):
+        """Test model tuning with custom metric in classification"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 6: Tuning with Custom Metric (Classification)")
+        self.logger.info("=" * 80)
+        
+        # Load breast cancer dataset
+        df = load_breast_cancer(as_frame=True)['frame']
+        
+        # Create classification model
+        model = Classification(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_roc_auc_doubled,
+            custom_metric_name='Custom ROC-AUC x2',
+            custom_metric_needs_proba=True,
+            custom_metric_direction='maximize',
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=1
+        )
+        
+        # Tune the best model with custom metric
+        model.tune_model(
+            tuning_method='randomized_search',
+            n_iter=2,  # Small number for quick test
+            n_jobs=-1,
+            verbose=0
+        )
+        
+        # Verify tuning completed
+        # Check that the leaderboard has a tuned model
+        self.assertIsNotNone(model._model_stats_df)
+        tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('randomized_search', case=False)]
+        self.assertTrue(len(tuned_models) > 0, "Tuned model should be added to leaderboard")
+        
+        self.logger.info("✓ Tuning with custom metric (classification) test PASSED")
+    
+    # =========================================================================
+    # Test 7: Optuna Tuning with Custom Metric (Regression)
+    # =========================================================================
+    
+    def test_07_optuna_tuning_regression(self):
+        """Test Optuna tuning with custom metric in regression"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 7: Optuna Tuning with Custom Metric (Regression)")
+        self.logger.info("=" * 80)
+        
+        # Load diabetes dataset
+        df = load_diabetes(as_frame=True)['frame']
+        
+        # Create regression model
+        model = Regression(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_mse_doubled,
+            custom_metric_name='Custom MSE x2',
+            custom_metric_direction='minimize',
+            custom_metric_needs_proba=False,
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=-1
+        )
+        
+        # Tune with Optuna (should inherit parameters)
+        model.tune_model(
+            tuning_method='optuna',
+            n_iter=2,  # Small number for quick test
+            n_jobs=-1,
+            verbose=0
+        )
+        
+        # Verify tuning completed
+        self.assertIsNotNone(model._model_stats_df)
+        tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('optuna', case=False)]
+        self.assertTrue(len(tuned_models) > 0, "Optuna tuned model should be added to leaderboard")
+        
+        # Verify the custom metric was used in tuning
+        tuned_model_row = tuned_models.iloc[0]
+        self.assertIn('Custom MSE x2', tuned_model_row.index)
+        self.assertTrue(tuned_model_row['Custom MSE x2'] > 0, "Custom metric should have valid value")
+        
+        self.logger.info("✓ Optuna tuning with custom metric (regression) test PASSED")
+    
+    # =========================================================================
+    # Test 8: Optuna Tuning with Custom Metric (Classification)
+    # =========================================================================
+    
+    def test_08_optuna_tuning_classification(self):
+        """Test Optuna tuning with custom metric in classification"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 8: Optuna Tuning with Custom Metric (Classification)")
+        self.logger.info("=" * 80)
+        
+        # Load breast cancer dataset
+        df = load_breast_cancer(as_frame=True)['frame']
+        
+        # Create classification model
+        model = Classification(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Run experiment with custom metric that needs probabilities
+        model.start_experiment(
+            experiment_size='quick',
+            eval_metric=self.custom_roc_auc_doubled,
+            custom_metric_name='Custom ROC-AUC x2',
+            custom_metric_direction='maximize',
+            custom_metric_needs_proba=True,
+            cv_method='kfold',
+            n_folds=3,
+            n_jobs=-1
+        )
+        
+        # Tune with Optuna
+        model.tune_model(
+            tuning_method='optuna',
+            n_iter=2,  # Small number for quick test
+            n_jobs=-1,
+            verbose=0
+        )
+        
+        # Verify tuning completed
+        self.assertIsNotNone(model._model_stats_df)
+        tuned_models = model._model_stats_df[model._model_stats_df['Model Name'].str.contains('optuna', case=False)]
+        self.assertTrue(len(tuned_models) > 0, "Optuna tuned model should be added to leaderboard")
+        
+        # Verify the custom metric was used in tuning
+        tuned_model_row = tuned_models.iloc[0]
+        self.assertIn('Custom ROC-AUC x2', tuned_model_row.index)
+        self.assertTrue(0 <= tuned_model_row['Custom ROC-AUC x2'] <= 2, 
+                       "Custom ROC-AUC x2 should be between 0 and 2")
+        
+        self.logger.info("✓ Optuna tuning with custom metric (classification) test PASSED")
+    
+    # =========================================================================
+    # Test 9: Invalid Custom Metric Function
+    # =========================================================================
+    
+    def test_09_invalid_custom_metric_function(self):
+        """Test that invalid custom metric functions are rejected"""
+        self.logger.info("=" * 80)
+        self.logger.info("TEST 9: Invalid Custom Metric Function")
+        self.logger.info("=" * 80)
+        
+        # Load dataset
+        df = load_diabetes(as_frame=True)['frame']
+        
+        # Create regression model
+        model = Regression(
+            data=df,
+            target_col='target',
+            random_state=42
+        )
+        
+        # Define invalid function (wrong number of parameters)
+        def invalid_metric(y_true, y_pred, extra_param):
+            return np.mean((y_true - y_pred) ** 2)
+        
+        # Should raise ValueError
+        with self.assertRaises(ValueError) as context:
+            model.start_experiment(
+                experiment_size='quick',
+                eval_metric=invalid_metric,
+                custom_metric_direction='maximize',
+                custom_metric_needs_proba=False,
+                cv_method='kfold',
+                n_folds=3
+            )
+        
+        self.assertIn("exactly 2 parameters", str(context.exception))
+        
+        self.logger.info("✓ Invalid custom metric function test PASSED")
+
+
+if __name__ == '__main__':
+    unittest.main()
+

From 6ca26e37ec2f0b359ee459298639024eb20bd885 Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 17:56:36 +0300
Subject: [PATCH 2/8] Added name validation to custom score

---
 flexml/structures/custom_score.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/flexml/structures/custom_score.py b/flexml/structures/custom_score.py
index 5e4688c..cf99349 100644
--- a/flexml/structures/custom_score.py
+++ b/flexml/structures/custom_score.py
@@ -18,6 +18,9 @@ def __init__(
         self.needs_proba = needs_proba
         self.direction = direction
 
+        if not isinstance(self.name, str) or not self.name.strip():
+            raise ValueError(f"name must be a non-empty string, got '{self.name}'")
+
         if direction not in ['maximize', 'minimize']:
             raise ValueError(f"direction must be either 'maximize' or 'minimize', got '{direction}'")
 

From c8a0639244f26f7a547a88e83c12d54401b9b38d Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 17:58:39 +0300
Subject: [PATCH 3/8] Misspelling fixes

---
 flexml/helpers/supervised_helpers.py |  4 ++--
 flexml/helpers/validators.py         |  4 ++--
 flexml/structures/supervised_base.py | 20 ++++++++++----------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py
index 06308b5..8bd7a70 100644
--- a/flexml/helpers/supervised_helpers.py
+++ b/flexml/helpers/supervised_helpers.py
@@ -55,10 +55,10 @@ def _evaluate_preds(
     eval_metric : str or CustomScore
         The evaluation metric that will be used to evaluate the model   
                  
-        - Avaiable evalulation metrics for Regression:    
+        - Available evaluation metrics for Regression:    
             - R2, MAE, MSE, RMSE, MAPE
 
-        - Avaiable evalulation metrics for Classification:    
+        - Available evaluation metrics for Classification:    
             - Accuracy, Precision, Recall, F1 Score, ROC-AUC
         
         - Or a custom CustomScore object
diff --git a/flexml/helpers/validators.py b/flexml/helpers/validators.py
index 2f95fe9..300f7a8 100644
--- a/flexml/helpers/validators.py
+++ b/flexml/helpers/validators.py
@@ -24,10 +24,10 @@ def eval_metric_checker(
     eval_metric : str or CustomScore, optional (default='R2' for Regression, 'Accuracy' for Classification)
         The evaluation metric to use for model evaluation
 
-        - Avaiable evalulation metrics for Regression:    
+        - Available evaluation metrics for Regression:    
             - R2, MAE, MSE, RMSE, MAPE
 
-        - Avaiable evalulation metrics for Classification:    
+        - Available evaluation metrics for Classification:    
             - Accuracy, Precision, Recall, F1 Score, ROC-AUC
         
         - Or a custom CustomScore object
diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py
index 4969280..53cc310 100644
--- a/flexml/structures/supervised_base.py
+++ b/flexml/structures/supervised_base.py
@@ -476,10 +476,10 @@ def start_experiment(
         eval_metric : str or callable, optional (default='R2' for Regression, 'Accuracy' for Classification)
             The evaluation metric to use for model evaluation
             
-            - Avaiable evalulation metrics for Regression:    
+            - Available evaluation metrics for Regression:    
                 - R2, MAE, MSE, RMSE, MAPE
 
-            - Avaiable evalulation metrics for Classification:    
+            - Available evaluation metrics for Classification:    
                 - Accuracy, Precision, Recall, F1 Score, ROC-AUC
             
             - Or a custom callable function with signature: func(y_true, y_pred) -> float
@@ -735,10 +735,10 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int =
         eval_metric : str, optional
             Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used
         
-            - Avaiable evalulation metrics for Regression:    
+            - Available evaluation metrics for Regression:    
                 - R2, MAE, MSE, RMSE, MAPE
 
-            - Avaiable evalulation metrics for Classification:    
+            - Available evaluation metrics for Classification:    
                 - Accuracy, Precision, Recall, F1 Score, ROC-AUC
         
         Returns
@@ -1195,10 +1195,10 @@ def __sort_models(self, eval_metric: Optional[str] = None):
         eval_metric : str, optional
             Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used
         
-            - Avaiable evalulation metrics for Regression:    
+            - Available evaluation metrics for Regression:    
                 - R2, MAE, MSE, RMSE, MAPE
 
-            - Avaiable evalulation metrics for Classification:    
+            - Available evaluation metrics for Classification:    
                 - Accuracy, Precision, Recall, F1 Score, ROC-AUC
 
         Returns
@@ -1230,10 +1230,10 @@ def show_model_stats(self, eval_metric: Optional[str] = None):
         eval_metric : str, optional
             Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used
         
-            - Avaiable evalulation metrics for Regression:    
+            - Available evaluation metrics for Regression:    
                 - R2, MAE, MSE, RMSE, MAPE
 
-            - Avaiable evalulation metrics for Classification:    
+            - Available evaluation metrics for Classification:    
                 - Accuracy, Precision, Recall, F1 Score, ROC-AUC
         """
         def highlight_best(s: pd.Series) -> list[str]:
@@ -1371,10 +1371,10 @@ def tune_model(
         eval_metric : str, optional
             Default: eval_metric passed to the start_experiment(), If It was also None, 'R2' for Regression and 'Accuracy' for Classification will be used
         
-            - Avaiable evalulation metrics for Regression:    
+            - Available evaluation metrics for Regression:    
                 - R2, MAE, MSE, RMSE, MAPE
 
-            - Avaiable evalulation metrics for Classification:    
+            - Available evaluation metrics for Classification:    
                 - Accuracy, Precision, Recall, F1 Score, ROC-AUC
 
         param_grid : dict (default = defined custom param dict in flexml/config/tune_model_config.py)

From 44c37f9bf98a334a94d94aa1e5ce9bc4b476ae49 Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 18:04:51 +0300
Subject: [PATCH 4/8] Error fix when show_model_stats() or get_best_models()
 are called before start_experiment()

---
 flexml/structures/supervised_base.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/flexml/structures/supervised_base.py b/flexml/structures/supervised_base.py
index 53cc310..78e7c82 100644
--- a/flexml/structures/supervised_base.py
+++ b/flexml/structures/supervised_base.py
@@ -747,6 +747,7 @@ def get_best_models(self, eval_metric: Optional[str] = None, top_n_models: int =
             Single or a list of top n models based on the evaluation metric or None If no models have been trained yet.
         """
         if len(self.__model_training_info) == 0:
+            self.__logger.warning("No models have been trained yet, start an experiment first via start_experiment()")
             return None
         
         top_n_models = self.__top_n_models_checker(top_n_models)
@@ -1266,7 +1267,10 @@ def highlight_best(s: pd.Series) -> list[str]:
                 is_best = (s == s.max()) & (s != float('inf')) & (s != -1)
             return ['background-color: green' if v else '' for v in is_best]
         
-        
+        if len(self.__model_training_info) == 0:
+            self.__logger.warning("No models have been trained yet, start an experiment first via start_experiment()")
+            return None
+
         if eval_metric is None:
             eval_metric = self.eval_metric
         
@@ -1277,8 +1281,6 @@ def highlight_best(s: pd.Series) -> list[str]:
         else:
             eval_metric = eval_metric_checker(self.__ML_TASK_TYPE, eval_metric)
 
-        
-
         sorted_model_stats_df = self.__sort_models(eval_metric)
         sorted_model_stats_df['Time (sec)'] = sorted_model_stats_df['Time (sec)'].apply(lambda x: f"{x:.2f}")
         sorted_model_stats_df.index += 1

From 89f5e4417063a9a36a981a3842c0a160725b1e6c Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 18:08:07 +0300
Subject: [PATCH 5/8] Improved if logic at evaluate_model_perf(), custom_score
 calculation

---
 flexml/helpers/supervised_helpers.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py
index 8bd7a70..0c66bd5 100644
--- a/flexml/helpers/supervised_helpers.py
+++ b/flexml/helpers/supervised_helpers.py
@@ -197,15 +197,15 @@ def evaluate_model_perf(
         eval_metric_name = custom_score.name
         
         # For Classification: handle proba vs labels
-        if ml_task_type == "Classification" and not custom_score.needs_proba:
-            y_pred_for_metric = y_pred_labels
-        elif ml_task_type == "Classification" and custom_score.needs_proba:
-            # Determine if binary or multiclass
-            if y_pred.shape[1] == 2:
-                y_pred_for_metric = y_pred[:, 1]
+        if ml_task_type == "Classification":
+            if custom_score.needs_proba:
+                if y_pred.shape[1] == 2:
+                    y_pred_for_metric = y_pred[:, 1]
+                else:
+                    y_pred_for_metric = y_pred
             else:
-                y_pred_for_metric = y_pred
-        else:
+                y_pred_for_metric = y_pred_labels
+        else: # Regression
             y_pred_for_metric = y_pred
 
         # Call custom function and add to standard metrics

From 6de04e13aa9ecdfbd213af732d4f48caca4bc34b Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 18:08:48 +0300
Subject: [PATCH 6/8] Removed unused import

---
 flexml/helpers/supervised_helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexml/helpers/supervised_helpers.py b/flexml/helpers/supervised_helpers.py
index 0c66bd5..ea33ab3 100644
--- a/flexml/helpers/supervised_helpers.py
+++ b/flexml/helpers/supervised_helpers.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from typing import Union, Optional, Callable
+from typing import Union, Optional
 from flexml.structures.custom_score import CustomScore
 
 from sklearn.metrics import (

From 55bd7bce0a66ac17b34359b12bac0efedbda1a5b Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 18:13:53 +0300
Subject: [PATCH 7/8] Improved custom score class

---
 flexml/structures/custom_score.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/flexml/structures/custom_score.py b/flexml/structures/custom_score.py
index cf99349..98ac50c 100644
--- a/flexml/structures/custom_score.py
+++ b/flexml/structures/custom_score.py
@@ -21,24 +21,24 @@ def __init__(
         if not isinstance(self.name, str) or not self.name.strip():
             raise ValueError(f"name must be a non-empty string, got '{self.name}'")
 
+        if not isinstance(score_func, Callable):
+            raise ValueError(f"score_func must be a callable, got '{type(score_func)}'")
+
+        if needs_proba is None or not isinstance(needs_proba, bool):
+            raise ValueError(f"needs_proba must be a boolean, got '{type(needs_proba)}'")
+
         if direction not in ['maximize', 'minimize']:
             raise ValueError(f"direction must be either 'maximize' or 'minimize', got '{direction}'")
 
-        if needs_proba is None or not isinstance(needs_proba, bool):
-            raise ValueError(f"needs_proba must be a boolean, got '{needs_proba}'")
-
-        try:
-            sig = inspect.signature(score_func)
-            params = list(sig.parameters.keys())
-            
-            # Check if function has exactly 2 parameters
-            if len(params) != 2:
-                raise ValueError(
-                    f"Custom evaluation function must have exactly 2 parameters (y_true, y_pred), "
-                    f"but got {len(params)} parameters: {params}"
-                )
-        except Exception as e:
-            raise ValueError(f"Error validating custom evaluation function: {str(e)}")
+        sig = inspect.signature(score_func)
+        params = list(sig.parameters.keys())
+        
+        # Check if function has exactly 2 parameters
+        if len(params) != 2:
+            raise ValueError(
+                f"Custom evaluation function must have exactly 2 parameters (y_true, y_pred), "
+                f"but got {len(params)} parameters: {params}"
+            )
 
         self.scorer = make_scorer(
             self.score_func,

From 2abe8a40151d88ffd2f5c710f0658648287cd47d Mon Sep 17 00:00:00 2001
From: Ozgur Aslan <ozguraslank@gmail.com>
Date: Sat, 3 Jan 2026 18:23:06 +0300
Subject: [PATCH 8/8] Passed custom score name as an argument in tests

---
 tests/test_custom_metrics.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_custom_metrics.py b/tests/test_custom_metrics.py
index b16772e..aaf1d98 100644
--- a/tests/test_custom_metrics.py
+++ b/tests/test_custom_metrics.py
@@ -490,6 +490,7 @@ def invalid_metric(y_true, y_pred, extra_param):
             model.start_experiment(
                 experiment_size='quick',
                 eval_metric=invalid_metric,
+                custom_metric_name='Invalid Metric',
                 custom_metric_direction='maximize',
                 custom_metric_needs_proba=False,
                 cv_method='kfold',