From c8953da52dac56737ca26b6fe58e4b50b82b84e2 Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:06:13 +0100
Subject: [PATCH 01/10] refactor(templates): document public
 fit/predict/transform wrappers

---
 .../custom_classifier_mixin_estimator.py      | 44 +++++++++++++++++++
 .../custom_transformer_mixin_estimator.py     | 27 ++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/scikit_longitudinal/templates/custom_classifier_mixin_estimator.py b/scikit_longitudinal/templates/custom_classifier_mixin_estimator.py
index 0c22181..b8284cf 100644
--- a/scikit_longitudinal/templates/custom_classifier_mixin_estimator.py
+++ b/scikit_longitudinal/templates/custom_classifier_mixin_estimator.py
@@ -62,6 +62,25 @@ def wrapper(X: np.ndarray, *args, **kwargs) -> Any:
     def fit(
         self, X: np.ndarray, y: np.ndarray = None, sample_weight: np.ndarray = None
     ) -> "CustomClassifierMixinEstimator":
+        """Fit the classifier to the training data.
+
+        Validates ``X`` (and ``y`` when provided) with scikit-learn's
+        ``check_X_y`` / ``check_array`` and then delegates to the subclass
+        implementation in ``_fit``. ``sample_weight`` is forwarded only when
+        the subclass's ``_fit`` declares it.
+
+        Args:
+            X (np.ndarray):
+                Training input samples of shape ``(n_samples, n_features)``.
+            y (np.ndarray, optional):
+                Target class labels of shape ``(n_samples,)``.
+            sample_weight (np.ndarray, optional):
+                Per-sample weights of shape ``(n_samples,)``. Forwarded to
+                ``_fit`` only when ``y`` is given and ``_fit`` declares it.
+
+        Returns:
+            CustomClassifierMixinEstimator: The fitted estimator (``self``).
+        """
         if y is None:
             return self._check_array_decorator(self._fit)(X)
         _fit_sig = inspect.signature(self._fit)
@@ -74,10 +93,35 @@ def fit(
 
     @final
     def predict(self, X: np.ndarray) -> np.ndarray:
+        """Predict class labels for the input samples.
+
+        Validates ``X`` with scikit-learn's ``check_array`` and delegates to
+        the subclass implementation in ``_predict``.
+
+        Args:
+            X (np.ndarray):
+                Input samples of shape ``(n_samples, n_features)``.
+
+        Returns:
+            np.ndarray: Predicted class labels of shape ``(n_samples,)``.
+        """
         return self._check_array_decorator(self._predict)(X)
 
     @final
     def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """Predict class probabilities for the input samples.
+
+        Validates ``X`` with scikit-learn's ``check_array`` and delegates to
+        the subclass implementation in ``_predict_proba``.
+
+        Args:
+            X (np.ndarray):
+                Input samples of shape ``(n_samples, n_features)``.
+
+        Returns:
+            np.ndarray: Class probabilities of shape ``(n_samples, n_classes)``,
+            with columns ordered as in ``self.classes_``.
+        """
         return self._check_array_decorator(self._predict_proba)(X)
 
     def _fit(
diff --git a/scikit_longitudinal/templates/custom_transformer_mixin_estimator.py b/scikit_longitudinal/templates/custom_transformer_mixin_estimator.py
index a0c0841..dd6b296 100644
--- a/scikit_longitudinal/templates/custom_transformer_mixin_estimator.py
+++ b/scikit_longitudinal/templates/custom_transformer_mixin_estimator.py
@@ -53,12 +53,39 @@ def wrapper(X: np.ndarray, *args, **kwargs) -> Any:
     def fit(
         self, X: np.ndarray, y: np.ndarray = None
     ) -> "CustomTransformerMixinEstimator":
+        """Fit the transformer to the input data.
+
+        Validates ``X`` with scikit-learn's ``check_array`` when ``y`` is
+        ``None``, or ``X`` and ``y`` together with ``check_X_y`` otherwise,
+        and then delegates to the subclass implementation in ``_fit``.
+
+        Args:
+            X (np.ndarray):
+                Training input samples of shape ``(n_samples, n_features)``.
+            y (np.ndarray, optional):
+                Target values of shape ``(n_samples,)``.
+
+        Returns:
+            CustomTransformerMixinEstimator: The fitted transformer (``self``).
+        """
         if y is None:
             return self._check_array_decorator(self._fit)(X)
         return self._check_X_y_decorator(self._fit)(X, y)
 
     @final
     def transform(self, X: np.ndarray) -> np.ndarray:
+        """Apply the transformation to the input data.
+
+        Validates ``X`` with scikit-learn's ``check_array`` and delegates to
+        the subclass implementation in ``_transform``.
+
+        Args:
+            X (np.ndarray):
+                Input samples of shape ``(n_samples, n_features)``.
+
+        Returns:
+            np.ndarray: Transformed data produced by ``_transform``.
+        """
         return self._check_array_decorator(self._transform)(X)
 
     def _fit(

From b82d1d8c4411d7c78709d7199e9552a88fa90be1 Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:06:18 +0100
Subject: [PATCH 02/10] refactor(docs): restructure API reference nav and
 landing pages

---
 .../data_preparation/aggregation_function.md  | 16 ++++--
 .../data_preparation/longitudinal_dataset.md  | 11 ----
 .../API/data_preparation/merwav_time_minus.md | 16 ++++--
 docs/API/data_preparation/merwav_time_plus.md | 16 ++++--
 docs/API/data_preparation/sepwav.md           | 53 ++++++++++++++++---
 .../ensemble/longitudinal_stacking.md         | 20 -------
 .../ensemble/longitudinal_voting.md           | 23 --------
 docs/API/index.md                             |  3 --
 docs/API/pipeline/longitudinal_pipeline.md    | 11 ----
 ...correlation_feature_selection_per_group.md | 30 +++++++----
 zensical.toml                                 | 20 ++++---
 11 files changed, 112 insertions(+), 107 deletions(-)
 delete mode 100644 docs/API/estimators/ensemble/longitudinal_stacking.md
 delete mode 100644 docs/API/estimators/ensemble/longitudinal_voting.md

diff --git a/docs/API/data_preparation/aggregation_function.md b/docs/API/data_preparation/aggregation_function.md
index 0eaf0a2..9fced8b 100644
--- a/docs/API/data_preparation/aggregation_function.md
+++ b/docs/API/data_preparation/aggregation_function.md
@@ -1,13 +1,21 @@
-# Aggregation Function for Longitudinal Data
+# Aggregation Function
 
 ??? tip "What is the AggrFunc module?"
     The `AggrFunc` module facilitates the application of aggregation functions to feature groups within a longitudinal
     dataset, enabling the use of temporal information before applying traditional machine learning algorithms.
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `AggrFunc` module's usage before exploring its API.
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
 
-    [See The Temporal Dependency Guide ](../../tutorials/temporal_dependency.md){ .md-button }
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.data_preparation.aggregation_function.AggrFunc
     options:
diff --git a/docs/API/data_preparation/longitudinal_dataset.md b/docs/API/data_preparation/longitudinal_dataset.md
index 9137dea..446567a 100644
--- a/docs/API/data_preparation/longitudinal_dataset.md
+++ b/docs/API/data_preparation/longitudinal_dataset.md
@@ -1,16 +1,5 @@
 # Longitudinal Dataset
 
-??? tip "What is the LongitudinalDataset module?"
-    The `LongitudinalDataset` module is a comprehensive container designed for managing and preparing longitudinal datasets.
-    It provides essential data management and transformation capabilities, facilitating the development and application
-    of machine learning algorithms tailored to longitudinal data classification tasks. Built around a `pandas` DataFrame,
-    it enhances functionality while maintaining a familiar interface.
-
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding
-    of feature groups and the `LongitudinalDataset` module's usage before exploring its API.
-
-    [See the Temporal Dependency Guide](../../tutorials/temporal_dependency.md){ .md-button }
-
 ## ::: scikit_longitudinal.data_preparation.longitudinal_dataset.LongitudinalDataset
     options:
         heading: "LongitudinalDataset"
diff --git a/docs/API/data_preparation/merwav_time_minus.md b/docs/API/data_preparation/merwav_time_minus.md
index 5652cf9..6d72ed2 100644
--- a/docs/API/data_preparation/merwav_time_minus.md
+++ b/docs/API/data_preparation/merwav_time_minus.md
@@ -1,4 +1,4 @@
-# Merging Waves and Discarding Time Indices for Longitudinal Data
+# Merging Waves and Discarding Time Indices
 
 ??? tip "What is the MerWavTimeMinus module?"
     The `MerWavTimeMinus` module transforms longitudinal data by merging all features across waves into a single set,
@@ -6,10 +6,18 @@
     temporal dependencies. It provides methods for data preparation and transformation, including `prepare_data` and
     `transform`.
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `MerWavTimeMinus` module's usage before exploring its API.
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
 
-    [See The Temporal Dependency Guide ](../../tutorials/temporal_dependency.md){ .md-button }
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.data_preparation.merwav_time_minus.MerWavTimeMinus
     options:
diff --git a/docs/API/data_preparation/merwav_time_plus.md b/docs/API/data_preparation/merwav_time_plus.md
index e8a7f6a..916bffe 100644
--- a/docs/API/data_preparation/merwav_time_plus.md
+++ b/docs/API/data_preparation/merwav_time_plus.md
@@ -1,4 +1,4 @@
-# Merging Waves and Keeping Time Indices for Longitudinal Data
+# Merging Waves and Keeping Time Indices
 
 ??? tip "What is the MerWavTimePlus module?"
     The MerWavTimePlus module transforms longitudinal data by merging all features across waves into a single set while
@@ -6,10 +6,18 @@
     leverage temporal dependencies and patterns. It provides methods for data preparation and transformation, including
     prepare_data and transform.
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `MerWavTimePlus` module's usage before exploring its API.
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
 
-    [See The Temporal Dependency Guide ](../../tutorials/temporal_dependency.md){ .md-button }
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.data_preparation.merwav_time_plus.MerWavTimePlus
     options:
diff --git a/docs/API/data_preparation/sepwav.md b/docs/API/data_preparation/sepwav.md
index 5663f46..1461c11 100644
--- a/docs/API/data_preparation/sepwav.md
+++ b/docs/API/data_preparation/sepwav.md
@@ -1,21 +1,58 @@
-# Separate Waves Classifier for Longitudinal Data
+# Separate Waves Classifier
 
-??? tip "What is the SepWav module?"
-    The `SepWav` module implements the Separate Waves strategy for longitudinal data analysis.
-    It trains individual classifiers on each wave (time point) and combines their predictions using ensemble methods
-    like voting or stacking. This approach leverages temporal information for improved model performance.
+??? tip "Abstract of Separate Waves (SepWav)"
+    *Extracted from "A New Longitudinal Classification Method Based on Stacking Predictions for Separate Time Points" (BCS SGAI AI-2025).*
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `SepWav` module's usage before exploring its API.
+    Biomedical research often uses longitudinal data with repeated measurements of variables across time (e.g. cholesterol measured across time), which is challenging for standard machine learning algorithms due to intrinsic temporal dependencies. The Separate Waves (SepWav) data-transformation method trains a base classifier for each time point ("wave") and aggregates their predictions via voting. However, the simplicity of the voting mechanism may not be enough to capture complex patterns of time-dependent interactions involving the base classifiers' predictions. Hence, we propose a novel SepWav method where the simple voting mechanism is replaced by a stacking-based meta-classifier that integrates the base classifiers' wave-specific predictions into a final predicted class label, aiming at improving predictive performance. Experiments with 20 datasets of ageing-related diseases have shown that, overall, the proposed Stacking-based SepWav method achieved significantly better predictive performance than two other methods for longitudinal classification in most cases, when using class-weight adjustment as a class-balancing method.
 
-    [See The Temporal Dependency Guide ](../../tutorials/temporal_dependency.md){ .md-button }
+    [See More In References :fontawesome-solid-book:](../../publications.md){ .md-button }
 
 ## ::: scikit_longitudinal.data_preparation.separate_waves.SepWav
     options:
         heading: "SepWav"
+        inherited_members: true
         members:
             - get_params
             - fit
             - predict
             - predict_proba
             - predict_wave
+
+---
+
+## SepWav ensemble back-ends
+
+`SepWav` delegates the final aggregation of per-wave predictions to one of the
+two classifiers below.
+
+### Longitudinal Voting Classifier
+
+Aggregates per-wave predictions with a configurable voting rule: simple
+majority, linear or exponential recency decay, or cross-validation-weighted
+voting.
+
+#### ::: scikit_longitudinal.estimators.ensemble.longitudinal_voting.longitudinal_voting.LongitudinalVotingClassifier
+    options:
+        heading: "LongitudinalVotingClassifier"
+        inherited_members: true
+        members:
+            - fit
+            - predict
+            - predict_proba
+
+#### ::: scikit_longitudinal.estimators.ensemble.longitudinal_voting.longitudinal_voting.LongitudinalEnsemblingStrategy
+
+### Longitudinal Stacking Classifier
+
+Trains a meta-learner on the class probabilities emitted by the per-wave
+classifiers fitted by `SepWav`.
+
+#### ::: scikit_longitudinal.estimators.ensemble.longitudinal_stacking.longitudinal_stacking.LongitudinalStackingClassifier
+    options:
+        heading: "LongitudinalStackingClassifier"
+        inherited_members: true
+        members:
+            - fit
+            - predict
+            - predict_proba
+
diff --git a/docs/API/estimators/ensemble/longitudinal_stacking.md b/docs/API/estimators/ensemble/longitudinal_stacking.md
deleted file mode 100644
index b9eb088..0000000
--- a/docs/API/estimators/ensemble/longitudinal_stacking.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Longitudinal Stacking Classifier
-
-??? tip "What is the Longitudinal Stacking Classifier?"
-    The Longitudinal Stacking Classifier is a sophisticated ensemble method designed to address the complexities of longitudinal data. It employs a stacking approach, combining predictions from multiple pre-trained base estimators to serve as input features for a meta-learner, which generates the final prediction. This classifier is particularly effective for capturing temporal dependencies and enhancing predictive performance in longitudinal datasets, especially when used with the "SepWav" (Separate Waves) strategy.
-
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of feature groups and the `LongitudinalStackingClassifier`'s usage before exploring its API.
-
-    [See The Temporal Dependency Guide ](../../../tutorials/temporal_dependency.md){ .md-button }
-
-## ::: scikit_longitudinal.estimators.ensemble.longitudinal_stacking.longitudinal_stacking.LongitudinalStackingClassifier
-    options:
-        heading: "LongitudinalStackingClassifier"
-        members:
-            - _fit
-            - _predict
-            - _predict_proba
-
-!!! note "Use of underscore in method names"
-    `_predict` should be called via `predict`; we handle the call to `_predict` in the `predict` method.
-    The same applies to `_predict_proba` and `predict_proba`.
diff --git a/docs/API/estimators/ensemble/longitudinal_voting.md b/docs/API/estimators/ensemble/longitudinal_voting.md
deleted file mode 100644
index dda6a18..0000000
--- a/docs/API/estimators/ensemble/longitudinal_voting.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Longitudinal Voting Classifier
-
-??? tip "What is the Longitudinal Voting Classifier?"
-    The Longitudinal Voting Classifier is a versatile ensemble method designed to handle the unique challenges posed
-    by longitudinal data. It leverages different voting strategies to combine predictions from multiple base estimators,
-    enhancing predictive performance. The base estimators are individually trained, and their predictions are
-    aggregated based on the chosen voting strategy to generate the final prediction.
-
-    Mainly used within SepWav. Relate to this primitive.
-
-## ::: scikit_longitudinal.estimators.ensemble.longitudinal_voting.longitudinal_voting.LongitudinalVotingClassifier
-    options:
-        heading: "LongitudinalVotingClassifier"
-        members:
-            - _fit
-            - _predict
-            - _predict_proba
-
-!!! note "Use of underscore in method names"
-    `_predict` should be called via `predict` we handle the call to `_predict` in the `predict` method.
-    The same applies to `_predict_proba` and `predict_proba`.
-
-## ::: scikit_longitudinal.estimators.ensemble.longitudinal_voting.longitudinal_voting.LongitudinalEnsemblingStrategy
diff --git a/docs/API/index.md b/docs/API/index.md
index e1000cc..3dad7c8 100644
--- a/docs/API/index.md
+++ b/docs/API/index.md
@@ -49,6 +49,3 @@ Welcome to `Sklong`'s API where you will find all references to each and one of
     [Jump to primitives](pipeline/longitudinal_pipeline.md)
 
 </div>
-
-!!! question "Looking for credits and paper references?"
-    Algorithm-specific citations and contributor attributions live in [Publications](../publications.md).
diff --git a/docs/API/pipeline/longitudinal_pipeline.md b/docs/API/pipeline/longitudinal_pipeline.md
index ee6c05e..96fc555 100644
--- a/docs/API/pipeline/longitudinal_pipeline.md
+++ b/docs/API/pipeline/longitudinal_pipeline.md
@@ -1,16 +1,5 @@
 # Longitudinal Pipeline
 
-??? tip "What is the LongitudinalPipeline module?"
-    The `LongitudinalPipeline` module extends scikit-learn's `Pipeline` to handle longitudinal data, ensuring that
-    the structure of longitudinal features is updated and maintained throughout transformations.
-    It is designed for longitudinal classification tasks, integrating seamlessly with scikit-learn's ecosystem.
-
-    Let's stack your steps and build a nice LongitudinalPipeline!
-
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of feature groups and the `LongitudinalPipeline` module's usage before exploring its API.
-
-    [See The Temporal Dependency Guide ](../../tutorials/temporal_dependency.md){ .md-button }
-
 ## ::: scikit_longitudinal.pipeline.LongitudinalPipeline
     options:
         heading: "LongitudinalPipeline"
diff --git a/docs/API/preprocessors/feature_selection/correlation_feature_selection_per_group.md b/docs/API/preprocessors/feature_selection/correlation_feature_selection_per_group.md
index 44d7ecf..d452af9 100644
--- a/docs/API/preprocessors/feature_selection/correlation_feature_selection_per_group.md
+++ b/docs/API/preprocessors/feature_selection/correlation_feature_selection_per_group.md
@@ -1,22 +1,30 @@
 # Correlation Based Feature Selection Per Group (CFS Per Group)
 
-??? tip "What is the CFS Per Group module?"
-    The `CorrelationBasedFeatureSelectionPerGroup` module implements the CFS-Per-Group algorithm, a longitudinal
-    variant of the standard CFS method. It is designed for feature selection in longitudinal datasets by considering
-    temporal variations across multiple waves (time points).
+??? tip "Abstract of CorrelationBasedFeatureSelectionPerGroup"
+    *Extracted from Pomsuwan & Freitas (2017), "Feature selection for the classification of longitudinal human ageing data".*
 
-    The algorithm operates in two phases: selecting features within each longitudinal group and then refining the
-    selection across all groups and non-longitudinal features.
+    We propose a new variant of the Correlation-based Feature Selection (CFS) method for coping with longitudinal data - where variables are repeatedly measured across different time points. The proposed CFS variant is evaluated on ten datasets created using data from the English Longitudinal Study of Ageing (ELSA), with different age-related diseases used as the class variables to be predicted. The results show that, overall, the proposed CFS variant leads to better predictive performance than the standard CFS and the baseline approach of no feature selection, when using Naïve Bayes and J48 decision tree induction as classification algorithms (although the difference in performance is very small in the results for J4.8). We also report the most relevant features selected by J48 across the datasets.
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding
-    of feature groups and the `CFS Per Group` module's usage before exploring its API.
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
 
-    [See The Temporal Dependency Guide ](../../../tutorials/temporal_dependency.md){ .md-button }
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.preprocessors.feature_selection.correlation_feature_selection.cfs_per_group.CorrelationBasedFeatureSelectionPerGroup
     options:
         heading: "CorrelationBasedFeatureSelectionPerGroup"
+        inherited_members: true
         members:
-            - _fit
-            - _transform
+            - fit
+            - transform
             - apply_selected_features_and_rename
diff --git a/zensical.toml b/zensical.toml
index 4d4831e..0d57e31 100644
--- a/zensical.toml
+++ b/zensical.toml
@@ -46,14 +46,18 @@ nav = [
             { "Correlation Feature Selection" = "API/preprocessors/feature_selection/correlation_feature_selection_per_group.md" },
         ]},
         { "Estimators" = [
-            { "Lexico Decision Tree" = "API/estimators/trees/lexico_decision_tree_classifier.md" },
-            { "Lexico Random Forest" = "API/estimators/ensemble/lexico_random_forest.md" },
-            { "Lexico Deep Forest" = "API/estimators/ensemble/lexico_deep_forest.md" },
-            { "Lexico Gradient Boosting" = "API/estimators/ensemble/lexico_gradient_boosting.md" },
-            { "Nested Trees" = "API/estimators/ensemble/nested_trees.md" },
-            { "TpT Decision Tree" = "API/estimators/trees/tpt_decision_tree_classifier.md" },
-            { "Longitudinal Stacking" = "API/estimators/ensemble/longitudinal_stacking.md" },
-            { "Longitudinal Voting" = "API/estimators/ensemble/longitudinal_voting.md" },
+            { "Classifiers" = [
+                { "Lexico Decision Tree" = "API/estimators/trees/lexico_decision_tree_classifier.md" },
+                { "Lexico Random Forest" = "API/estimators/ensemble/lexico_random_forest.md" },
+                { "Lexico Deep Forest" = "API/estimators/ensemble/lexico_deep_forest.md" },
+                { "Lexico Gradient Boosting" = "API/estimators/ensemble/lexico_gradient_boosting.md" },
+                { "Nested Trees" = "API/estimators/ensemble/nested_trees.md" },
+                { "TpT Decision Tree" = "API/estimators/trees/tpt_decision_tree_classifier.md" },
+            ]},
+            { "Regressors" = [
+                { "Lexico Decision Tree" = "API/estimators/trees/lexico_decision_tree_regressor.md" },
+                { "TpT Decision Tree" = "API/estimators/trees/tpt_decision_tree_regressor.md" },
+            ]},
         ]},
         { "Pipeline" = [
             { "Longitudinal Pipeline" = "API/pipeline/longitudinal_pipeline.md" },

From 83c4881ae27cc6341607579b3b60e08b755bbd0e Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:06:22 +0100
Subject: [PATCH 03/10] refactor(docs): add estimator reference pages for trees
 and ensembles

---
 .../estimators/ensemble/lexico_deep_forest.md | 39 ++++++++++++-------
 .../ensemble/lexico_gradient_boosting.md      | 38 +++++++++++-------
 .../ensemble/lexico_random_forest.md          | 35 ++++++++++-------
 docs/API/estimators/ensemble/nested_trees.md  | 37 ++++++++++--------
 .../trees/lexico_decision_tree_classifier.md  | 37 ++++++++++--------
 .../trees/lexico_decision_tree_regressor.md   | 31 +++++++++++++++
 .../trees/tpt_decision_tree_classifier.md     | 36 ++++++++---------
 .../trees/tpt_decision_tree_regressor.md      | 31 +++++++++++++++
 8 files changed, 189 insertions(+), 95 deletions(-)
 create mode 100644 docs/API/estimators/trees/lexico_decision_tree_regressor.md
 create mode 100644 docs/API/estimators/trees/tpt_decision_tree_regressor.md

diff --git a/docs/API/estimators/ensemble/lexico_deep_forest.md b/docs/API/estimators/ensemble/lexico_deep_forest.md
index b028ffb..70cf937 100644
--- a/docs/API/estimators/ensemble/lexico_deep_forest.md
+++ b/docs/API/estimators/ensemble/lexico_deep_forest.md
@@ -1,26 +1,35 @@
 # Lexico Deep Forest Classifier
 
-??? tip "What is the Lexico Deep Forest Classifier?"
-    The Lexico Deep Forest Classifier is an advanced ensemble learning model designed for longitudinal data analysis.
-    It extends the Deep Forest framework by incorporating longitudinal-adapted base estimators that capture temporal
-    complexities and interdependencies inherent in longitudinal data. The classifier combines accurate learners
-    (longitudinal base estimators) and weak learners (diversity non-longitudinal estimators) to improve robustness
-    and generalization, making it ideal for applications like medical studies or time-series classification.
+??? tip "Abstract of LexicoDeepForestClassifier"
+    *Extracted from Ribeiro & Freitas (2024), "Lexicographical random forests for longitudinal data classification".*
 
-    The classifier uses Lexico Random Forest classifiers as base estimators, which are specialized to handle
-    the temporal structure of longitudinal data.
+    Standard supervised machine learning methods often ignore the temporal information represented in longitudinal data, but that information can lead to more precise predictions in classification tasks. Data preprocessing techniques and classification algorithms can be adapted to cope directly with longitudinal data inputs, making use of temporal information such as the time-index of features and previous measurements of the class variable. In this article, we propose two changes to the classification task of predicting age-related diseases in a real-world dataset created from the English Longitudinal Study of Ageing. First, we explore the addition of previous measurements of the class variable, and estimating the missing data in those added features using intermediate classifiers. Second, we propose a new split-feature selection procedure for a random forest's decision trees, which considers the candidate features' time-indexes, in addition to the information gain ratio. Our experiments compared the proposed approaches to baseline approaches, in 3 prediction scenarios, varying the "time gap" for the prediction - how many years in advance the class (occurrence of an age-related disease) is predicted. The experiments were performed on 10 datasets varying the class variable, and showed that the proposed approaches increased the random forest's predictive accuracy.
+
+    Adapted and integrated into a Deep Forest cascade, this estimator stacks layers of `LexicoRandomForestClassifier`s (and optional diversity learners) so that each layer applies the lexicographic split-selection procedure above while propagating wave-aware predictions through the cascade.
+
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
+
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.estimators.ensemble.lexicographical.lexico_deep_forest.LexicoDeepForestClassifier
     options:
         heading: "LexicoDeepForestClassifier"
+        inherited_members: true
         members:
-            - _fit
-            - _predict
-            - _predict_proba
-
-!!! note "Use of underscore in method names"
-    `_predict` should be called via `predict` we handle the call to `_predict` in the `predict` method.
-    The same applies to `_predict_proba` and `predict_proba`.
+            - fit
+            - predict
+            - predict_proba
 
 ## ::: scikit_longitudinal.estimators.ensemble.lexicographical.lexico_deep_forest.LongitudinalClassifierType
 
diff --git a/docs/API/estimators/ensemble/lexico_gradient_boosting.md b/docs/API/estimators/ensemble/lexico_gradient_boosting.md
index adcd494..cf02ed1 100644
--- a/docs/API/estimators/ensemble/lexico_gradient_boosting.md
+++ b/docs/API/estimators/ensemble/lexico_gradient_boosting.md
@@ -1,23 +1,33 @@
 # Lexico Gradient Boosting Classifier
 
-??? tip "What is the Lexico Gradient Boosting Classifier?"
-    The Lexico Gradient Boosting Classifier is an advanced ensemble learning model tailored for longitudinal data analysis.
-    It combines the power of gradient boosting with a lexicographic optimization approach to prioritize more recent
-    data points (waves) in its decision-making process. This makes it particularly effective for datasets where temporal
-    recency is crucial, such as medical studies or time-series classification.
+??? tip "Abstract of LexicoGradientBoostingClassifier"
+    *Extracted from Ribeiro & Freitas (2024), "Lexicographical random forests for longitudinal data classification".*
 
-    The classifier uses Lexico Decision Tree Regressors as base estimators, which are specialized to handle
-    the temporal structure of longitudinal data.
+    Standard supervised machine learning methods often ignore the temporal information represented in longitudinal data, but that information can lead to more precise predictions in classification tasks. Data preprocessing techniques and classification algorithms can be adapted to cope directly with longitudinal data inputs, making use of temporal information such as the time-index of features and previous measurements of the class variable. In this article, we propose two changes to the classification task of predicting age-related diseases in a real-world dataset created from the English Longitudinal Study of Ageing. First, we explore the addition of previous measurements of the class variable, and estimating the missing data in those added features using intermediate classifiers. Second, we propose a new split-feature selection procedure for a random forest's decision trees, which considers the candidate features' time-indexes, in addition to the information gain ratio. Our experiments compared the proposed approaches to baseline approaches, in 3 prediction scenarios, varying the "time gap" for the prediction - how many years in advance the class (occurrence of an age-related disease) is predicted. The experiments were performed on 10 datasets varying the class variable, and showed that the proposed approaches increased the random forest's predictive accuracy.
+
+    Adapted and integrated into a Gradient Boosting framework, this estimator boosts `LexicoDecisionTreeRegressor`s as base learners, so each successive tree applies the lexicographic split-selection procedure above while fitting the residuals of the previous iterations.
+
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
+
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.estimators.ensemble.lexicographical.lexico_gradient_boosting.LexicoGradientBoostingClassifier
     options:
         heading: "LexicoGradientBoostingClassifier"
+        inherited_members: true
         members:
-            - _fit
-            - _predict
-            - _predict_proba
+            - fit
+            - predict
+            - predict_proba
             - feature_importances_
-
-!!! note "Use of underscore in method names"
-    `_predict` should be called via `predict` we handle the call to `_predict` in the `predict` method.
-    The same applies to `_predict_proba` and `predict_proba`.
diff --git a/docs/API/estimators/ensemble/lexico_random_forest.md b/docs/API/estimators/ensemble/lexico_random_forest.md
index 92f1fef..016a205 100644
--- a/docs/API/estimators/ensemble/lexico_random_forest.md
+++ b/docs/API/estimators/ensemble/lexico_random_forest.md
@@ -1,25 +1,32 @@
 # Lexico Random Forest Classifier
 
-??? tip "What is the Lexico Random Forest Classifier?"
-    The Lexico Random Forest Classifier is an advanced ensemble learning model designed specifically for
-    longitudinal data analysis. It extends the traditional Random Forest algorithm by incorporating a
-    lexicographic optimization approach within each decision tree. This approach prioritizes more recent data points
-    (waves) when selecting splits, based on the premise that recent measurements are more predictive and relevant.
+??? tip "Abstract of LexicoRandomForestClassifier"
+    *Extracted from Ribeiro & Freitas (2024), "Lexicographical random forests for longitudinal data classification".*
 
-    The classifier is optimized for efficiency using a Cython implementation and is particularly suited for applications
-    where temporal recency is critical, such as medical studies or time-series of time-series classification.
+    Standard supervised machine learning methods often ignore the temporal information represented in longitudinal data, but that information can lead to more precise predictions in classification tasks. Data preprocessing techniques and classification algorithms can be adapted to cope directly with longitudinal data inputs, making use of temporal information such as the time-index of features and previous measurements of the class variable. In this article, we propose two changes to the classification task of predicting age-related diseases in a real-world dataset created from the English Longitudinal Study of Ageing. First, we explore the addition of previous measurements of the class variable, and estimating the missing data in those added features using intermediate classifiers. Second, we propose a new split-feature selection procedure for a random forest's decision trees, which considers the candidate features' time-indexes, in addition to the information gain ratio. Our experiments compared the proposed approaches to baseline approaches, in 3 prediction scenarios, varying the "time gap" for the prediction - how many years in advance the class (occurrence of an age-related disease) is predicted. The experiments were performed on 10 datasets varying the class variable, and showed that the proposed approaches increased the random forest's predictive accuracy.
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `LexicoRandomForestClassifier`'s usage before exploring its API.
+    Adapted and integrated into a Random Forest, this estimator builds an ensemble of `LexicoDecisionTreeClassifier`s — each tree applies the lexicographic split-selection procedure above and their predictions are aggregated through the standard random-forest voting scheme.
 
-    [See The Temporal Dependency Guide ](../../../tutorials/temporal_dependency.md){ .md-button }
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
+
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.estimators.ensemble.lexicographical.lexico_random_forest.LexicoRandomForestClassifier
     options:
         heading: "LexicoRandomForestClassifier"
+        inherited_members: true
         members:
             - fit
-
-!!! note "Where are predict? and predict_proba?"
-    The `predict` and `predict_proba` methods are inherited from the `RandomForestClassifier` class in `scikit-learn`.
-    They are not explicitly defined in `LexicoRandomForestClassifier` but can be called directly on the instance.
+            - predict
+            - predict_proba
diff --git a/docs/API/estimators/ensemble/nested_trees.md b/docs/API/estimators/ensemble/nested_trees.md
index 670d6a3..0c62efa 100644
--- a/docs/API/estimators/ensemble/nested_trees.md
+++ b/docs/API/estimators/ensemble/nested_trees.md
@@ -1,26 +1,31 @@
 # Nested Trees Classifier
 
-??? tip "What is the Nested Trees Classifier?"
-    The Nested Trees Classifier is a unique and innovative classification algorithm specifically designed for
-    longitudinal datasets. It enhances traditional decision tree algorithms by embedding smaller decision trees within
-    the nodes of a primary tree structure, leveraging the inherent information in longitudinal data optimally.
+??? tip "Abstract of NestedTreesClassifier"
+    *Extracted from Ovchinnik, Otero & Freitas (2022), "Nested trees for longitudinal classification".*
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `NestedTreesClassifier`'s usage before exploring its API.
+    Longitudinal datasets contain repeated measurements of the same variables at different points in time. Longitudinal data mining algorithms aim to utilize such datasets to extract interesting knowledge and produce useful models. Many existing longitudinal classification methods either dismiss the longitudinal aspect of the data during model construction or produce complex models that are scarcely interpretable. We propose a new longitudinal classification algorithm based on decision trees, named Nested Trees. It utilizes a unique longitudinal model construction method that is fully aware of the longitudinal aspect of the predictive attributes (variables) and constructs tree nodes that make decisions based on a longitudinal attribute as a whole, considering measurements of that attribute across multiple time points. The algorithm was evaluated using 10 classification tasks based on the English Longitudinal Study of Ageing (ELSA) data.
 
-    [See The Temporal Dependency Guide ](../../../tutorials/temporal_dependency.md){ .md-button }
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
+
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.estimators.ensemble.nested_trees.nested_trees.NestedTreesClassifier
     options:
         heading: "NestedTreesClassifier"
+        inherited_members: true
         members:
-            - _fit
-            - _predict
-            - _predict_proba
+            - fit
+            - predict
+            - predict_proba
             - print_nested_tree
-
-!!! note "Inherited Methods"
-    The `predict` and `predict_proba` methods are inherited from the `CustomClassifierMixinEstimator`
-    class and can be called directly on the `NestedTreesClassifier` instance.
-
-    They internally call `_predict` and `_predict_proba` respectively.
diff --git a/docs/API/estimators/trees/lexico_decision_tree_classifier.md b/docs/API/estimators/trees/lexico_decision_tree_classifier.md
index a0b4850..10128c3 100644
--- a/docs/API/estimators/trees/lexico_decision_tree_classifier.md
+++ b/docs/API/estimators/trees/lexico_decision_tree_classifier.md
@@ -1,29 +1,32 @@
 # Lexicographical Decision Tree Classifier
 
-??? tip "What is the Lexicographical Decision Tree Classifier?"
-    The Lexicographical Decision Tree Classifier is a specialized machine learning model designed for
-    analyzing longitudinal data, where measurements are collected over time.
+??? tip "Abstract of LexicoDecisionTreeClassifier"
+    *Extracted from Ribeiro & Freitas (2024), "Lexicographical random forests for longitudinal data classification".*
 
-    Unlike traditional decision trees that select splits based solely on statistical measures like information gain,
-    this classifier incorporates the temporal aspect of the data. It prioritizes more recent measurements when deciding
-    how to split the data, under the assumption that recent information is often more predictive of outcomes.
+    Standard supervised machine learning methods often ignore the temporal information represented in longitudinal data, but that information can lead to more precise predictions in classification tasks. Data preprocessing techniques and classification algorithms can be adapted to cope directly with longitudinal data inputs, making use of temporal information such as the time-index of features and previous measurements of the class variable. In this article, we propose two changes to the classification task of predicting age-related diseases in a real-world dataset created from the English Longitudinal Study of Ageing. First, we explore the addition of previous measurements of the class variable, and estimating the missing data in those added features using intermediate classifiers. Second, we propose a new split-feature selection procedure for a random forest's decision trees, which considers the candidate features' time-indexes, in addition to the information gain ratio. Our experiments compared the proposed approaches to baseline approaches, in 3 prediction scenarios, varying the "time gap" for the prediction - how many years in advance the class (occurrence of an age-related disease) is predicted. The experiments were performed on 10 datasets varying the class variable, and showed that the proposed approaches increased the random forest's predictive accuracy.
 
-    This is achieved through a lexicographic optimization approach that balances statistical purity with temporal
-    relevance. See further details below.
+    Adapted to a single decision tree, this estimator implements the lexicographic split-selection procedure above directly inside `DecisionTreeClassifier`'s splitter, yielding a longitudinal-aware tree that prefers more recent waves whenever competing splits have comparable gain ratios.
 
-    We highly recommend reviewing the `Temporal Dependency` page in the documentation for a deeper understanding of
-    feature groups and the `Lexicographical Decision Tree` classifier's usage before exploring its API.
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
 
-    [See The Temporal Dependency Guide ](../../../tutorials/temporal_dependency.md){ .md-button }
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.estimators.trees.lexicographical.lexico_decision_tree.LexicoDecisionTreeClassifier
     options:
         heading: "LexicoDecisionTreeClassifier"
+        inherited_members: true
         members:
             - fit
-
-!!! note "Where are predict? and predict_proba?"
-    The `predict` and `predict_proba` methods are inherited from the `DecisionTreeClassifier` class, which is a part of the `scikit-learn` library.
-    Therefore, the `LexicoDecisionTreeClassifier` does not explicitly define these methods. Instead, it inherits them from its parent class.
-
-    Feel free to call them directly on the `LexicoDecisionTreeClassifier` instance.
+            - predict
+            - predict_proba
diff --git a/docs/API/estimators/trees/lexico_decision_tree_regressor.md b/docs/API/estimators/trees/lexico_decision_tree_regressor.md
new file mode 100644
index 0000000..89a5686
--- /dev/null
+++ b/docs/API/estimators/trees/lexico_decision_tree_regressor.md
@@ -0,0 +1,31 @@
+# Lexicographical Decision Tree Regressor
+
+??? tip "Abstract of LexicoDecisionTreeRegressor"
+    *Extracted from Ribeiro & Freitas (2024), "Lexicographical random forests for longitudinal data classification".*
+
+    Standard supervised machine learning methods often ignore the temporal information represented in longitudinal data, but that information can lead to more precise predictions in classification tasks. Data preprocessing techniques and classification algorithms can be adapted to cope directly with longitudinal data inputs, making use of temporal information such as the time-index of features and previous measurements of the class variable. In this article, we propose two changes to the classification task of predicting age-related diseases in a real-world dataset created from the English Longitudinal Study of Ageing. First, we explore the addition of previous measurements of the class variable, and estimating the missing data in those added features using intermediate classifiers. Second, we propose a new split-feature selection procedure for a random forest's decision trees, which considers the candidate features' time-indexes, in addition to the information gain ratio. Our experiments compared the proposed approaches to baseline approaches, in 3 prediction scenarios, varying the "time gap" for the prediction - how many years in advance the class (occurrence of an age-related disease) is predicted. The experiments were performed on 10 datasets varying the class variable, and showed that the proposed approaches increased the random forest's predictive accuracy.
+
+    Adapted to regression, this estimator applies the same lexicographic split-selection procedure inside `DecisionTreeRegressor`, replacing the information-gain ratio with variance reduction (`friedman_mse`) as the primary objective while still preferring more recent waves on near-ties.
+
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
+
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
+
+## ::: scikit_longitudinal.estimators.trees.lexicographical.lexico_decision_tree_regressor.LexicoDecisionTreeRegressor
+    options:
+        heading: "LexicoDecisionTreeRegressor"
+        inherited_members: true
+        members:
+            - fit
+            - predict
diff --git a/docs/API/estimators/trees/tpt_decision_tree_classifier.md b/docs/API/estimators/trees/tpt_decision_tree_classifier.md
index 5adeffc..bc4be11 100644
--- a/docs/API/estimators/trees/tpt_decision_tree_classifier.md
+++ b/docs/API/estimators/trees/tpt_decision_tree_classifier.md
@@ -1,32 +1,30 @@
 # Time-penalised Trees Decision Tree Classifier
 
-??? tip "What is the Time-penalised Trees (TpT) Decision Tree Classifier?"
-    The `TpTDecisionTreeClassifier` is a longitudinal-aware decision tree that extends the standard CART algorithm with
-    a **time-penalised split gain**. At a parent node observed at time $t_p$, a candidate split evaluated at time
-    $t_c$ has its information gain $\Delta I$ scaled by an exponential penalty $e^{-\gamma\,(t_c - t_p)}$. The
-    splitter therefore prefers earlier waves unless later observations bring a substantially stronger signal, which
-    yields sparse-in-time and interpretable trees.
+??? tip "Abstract of TpTDecisionTreeClassifier"
+    *Extracted from Valla (2024), "Time-penalised trees (TpT): introducing a new tree-based data mining algorithm for time-varying covariates".*
 
-    TpT can consume both `wide` longitudinal matrices (with `features_group`) and LONG-format dataframes (one row per
-    `(subject, time)` observation) by setting `assume_long_format=True` and providing `id_col`, `time_col`, and
-    `duration_col`.
+    This article introduces a new decision tree algorithm that accounts for time-varying covariates in the decision-making process. Traditional decision tree algorithms assume that the covariates are static and do not change over time, which can lead to inaccurate predictions in dynamic environments. Other existing methods suggest workaround solutions such as the pseudo-subject approach. The proposed algorithm utilises a different structure and a time-penalised splitting criterion that allows a recursive partitioning of both the covariates space and time. Relevant historical trends are then inherently involved in the construction of a tree, and are visible and interpretable once it is fit. This approach allows for innovative and highly interpretable analysis in settings where the covariates are subject to change over time. The effectiveness of the algorithm is demonstrated through a real-world data application in life insurance. The results presented in this article can be seen as an introduction or proof-of-concept of the time-penalised approach, and the algorithm's theoretical properties and comparison against existing approaches on datasets from various fields will be explored in forthcoming work.
 
-    We highly recommend reading the `Temporal Dependency` page before exploring the TpT API.
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
 
-    [See The Temporal Dependency Guide ](../../../tutorials/temporal_dependency.md){ .md-button }
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
 
 ## ::: scikit_longitudinal.estimators.trees.TpT.TpT_decision_tree.TpTDecisionTreeClassifier
     options:
         heading: "TpTDecisionTreeClassifier"
+        inherited_members: true
         members:
             - fit
             - predict
             - predict_proba
-
-!!! note "Where do `predict` and `predict_proba` come from?"
-    Both methods are inherited from scikit-learn's `DecisionTreeClassifier`. `TpTDecisionTreeClassifier` only overrides
-    them to handle the optional LONG→wide conversion; otherwise the standard scikit-learn behaviour applies.
-
-!!! warning "`gamma` vs. `threshold_gain`"
-    `threshold_gain` is kept as a backward-compatible alias for `gamma` (both control the time-penalty rate
-    $\gamma$). Prefer the explicit `gamma` keyword in new code; if both are provided, `gamma` takes precedence.
diff --git a/docs/API/estimators/trees/tpt_decision_tree_regressor.md b/docs/API/estimators/trees/tpt_decision_tree_regressor.md
new file mode 100644
index 0000000..9d1a439
--- /dev/null
+++ b/docs/API/estimators/trees/tpt_decision_tree_regressor.md
@@ -0,0 +1,31 @@
+# Time-penalised Trees Decision Tree Regressor
+
+??? tip "Abstract of TpTDecisionTreeRegressor"
+    *Extracted from Valla (2024), "Time-penalised trees (TpT): introducing a new tree-based data mining algorithm for time-varying covariates".*
+
+    This article introduces a new decision tree algorithm that accounts for time-varying covariates in the decision-making process. Traditional decision tree algorithms assume that the covariates are static and do not change over time, which can lead to inaccurate predictions in dynamic environments. Other existing methods suggest workaround solutions such as the pseudo-subject approach. The proposed algorithm utilises a different structure and a time-penalised splitting criterion that allows a recursive partitioning of both the covariates space and time. Relevant historical trends are then inherently involved in the construction of a tree, and are visible and interpretable once it is fit. This approach allows for innovative and highly interpretable analysis in settings where the covariates are subject to change over time. The effectiveness of the algorithm is demonstrated through a real-world data application in life insurance. The results presented in this article can be seen as an introduction or proof-of-concept of the time-penalised approach, and the algorithm's theoretical properties and comparison against existing approaches on datasets from various fields will be explored in forthcoming work.
+
+    Adapted to regression, this estimator applies the time-penalised splitting criterion described above inside `DecisionTreeRegressor`, replacing the classification impurity improvement with variance reduction (MSE) before applying the exponential time penalty.
+
+    [See More In References :fontawesome-solid-book:](../../../publications.md){ .md-button }
+
+??? question "What are features_group and non_longitudinal_features?"
+    Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+    temporal structure of longitudinal data.
+
+    - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+      waves, ordered from oldest to most recent. This captures temporal dependencies.
+    - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+      temporal matrix.
+
+    Proper setup of these attributes is critical for leveraging temporal patterns effectively.
+
+    [See More In Temporal Dependency Guide :fontawesome-solid-timeline:](../../../tutorials/temporal_dependency.md){ .md-button }
+
+## ::: scikit_longitudinal.estimators.trees.TpT.TpT_decision_tree_regressor.TpTDecisionTreeRegressor
+    options:
+        heading: "TpTDecisionTreeRegressor"
+        inherited_members: true
+        members:
+            - fit
+            - predict

From 7d6c13d71cb4eb757d02bed7199b9f7fdc47f103 Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:06:29 +0100
Subject: [PATCH 04/10] refactor(docs): tidy data_preparation and pipeline
 docstrings

---
 .../data_preparation/aggregation_function.py  | 20 +++-------
 .../data_preparation/elsa_handler.py          |  2 +-
 .../data_preparation/longitudinal_dataset.py  | 39 ++++++-------------
 .../data_preparation/merwav_time_minus.py     | 13 -------
 .../data_preparation/merwav_time_plus.py      | 18 +--------
 .../data_preparation/separate_waves.py        | 16 ++------
 scikit_longitudinal/pipeline.py               | 23 +----------
 7 files changed, 25 insertions(+), 106 deletions(-)

diff --git a/scikit_longitudinal/data_preparation/aggregation_function.py b/scikit_longitudinal/data_preparation/aggregation_function.py
index 41ad7d8..f6c62db 100644
--- a/scikit_longitudinal/data_preparation/aggregation_function.py
+++ b/scikit_longitudinal/data_preparation/aggregation_function.py
@@ -177,19 +177,9 @@ class AggrFunc(DataPreparationMixin):
     those in Scikit-Learn or any other alike machine learning-based libarires.
 
 
-    !!! question "What is a feature group?"
-        In a nutshell, a feature group is a collection of features sharing a common base longitudinal attribute
-        across different waves of data collection (e.g., "income_wave1", "income_wave2", "income_wave3"). Note that
-        aggregation reduces the dataset's temporal information significantly.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
-
     The aggregation function is applied iteratively across waves for each feature group, producing a single aggregated
-    feature per group (e.g., "mean_income" from "income_wave1", "income_wave2", "income_wave3" using the "mean"
-    function). Supported aggregation functions include "mean", "median", "mode", and custom callable functions that
+    feature per group (e.g., `mean_income` from `income_wave1`, `income_wave2`, `income_wave3` using the `mean`
+    function). Supported aggregation functions include `mean`, `median`, `mode`, and custom callable functions that
     take a pandas Series as input and return a single value. Parallel processing is also supported via the Ray library
     for enhanced efficiency on large datasets.
 
@@ -216,7 +206,7 @@ class AggrFunc(DataPreparationMixin):
         Below are examples demonstrating the usage of the `AggrFunc` class with the "stroke.csv" dataset.
         Please, note that "stroke.csv" is a placeholder and should be replaced with the actual path to your dataset.
 
-        !!! example "Basic Usage with Mean Aggregation"
+        !!! example "Basic Usage"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
             from scikit_longitudinal.data_preparation.aggregation_function import AggrFunc
@@ -241,7 +231,7 @@ class AggrFunc(DataPreparationMixin):
             transformed_dataset, _, _, _ = agg_func._transform()
             ```
 
-        !!! example "Using Custom Aggregation Function"
+        !!! example "Advanced: custom aggregation function"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
             from scikit_longitudinal.data_preparation.aggregation_function import AggrFunc
@@ -269,7 +259,7 @@ class AggrFunc(DataPreparationMixin):
             transformed_dataset, _, _, _ = agg_func._transform()
             ```
 
-        !!! example "Using Parallel Processing"
+        !!! example "Advanced: parallel processing"
             ```python
             # ... similar to the previous example, prepare data and transform ...
 
diff --git a/scikit_longitudinal/data_preparation/elsa_handler.py b/scikit_longitudinal/data_preparation/elsa_handler.py
index 76c6a1f..79e8de8 100644
--- a/scikit_longitudinal/data_preparation/elsa_handler.py
+++ b/scikit_longitudinal/data_preparation/elsa_handler.py
@@ -31,7 +31,7 @@ class ElsaDataHandler:
             A dictionary containing datasets for each unique class.
 
     Examples:
-        !!! example "Basic Usage with ELSA Core Dataset"
+        !!! example "Basic Usage"
             ```python
             from elsadatahandler import ElsaDataHandler  # Replace with actual import path
 
diff --git a/scikit_longitudinal/data_preparation/longitudinal_dataset.py b/scikit_longitudinal/data_preparation/longitudinal_dataset.py
index 9adb33b..98b5879 100644
--- a/scikit_longitudinal/data_preparation/longitudinal_dataset.py
+++ b/scikit_longitudinal/data_preparation/longitudinal_dataset.py
@@ -132,21 +132,18 @@ class LongitudinalDataset:
     Therefore, the class is designed to manage this temporal information and provide a clean interface for
     machine learning tasks throughout the `Sklong` library.
 
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
+    !!! question "What are features_group and non_longitudinal_features?"
+        Two key attributes, `features_group` and `non_longitudinal_features`, enable algorithms to interpret the
+        temporal structure of longitudinal data.
 
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
+        - **features_group**: A list of lists where each sublist contains indices of a longitudinal attribute's
+          waves, ordered from oldest to most recent. This captures temporal dependencies.
+        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the
+          temporal matrix.
 
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
+        Proper setup of these attributes is critical for leveraging temporal patterns effectively.
 
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
+        [See more in the Temporal Dependency Guide :fontawesome-solid-timeline:](../../tutorials/temporal_dependency.md){ .md-button }
 
     Args:
         file_path (Union[str, Path]): Path to the dataset file (supports ARFF and CSV formats).
@@ -164,7 +161,7 @@ class LongitudinalDataset:
     Examples:
         Below are examples illustrating the class's usage.
 
-        !!! example "Loading and Preparing Data"
+        !!! example "Basic Usage"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
 
@@ -184,7 +181,7 @@ class LongitudinalDataset:
             dataset.load_train_test_split(test_size=0.2, random_state=42)
             ```
 
-        !!! example "Using Custom Feature Groups"
+        !!! example "Advanced: custom feature groups"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
 
@@ -201,7 +198,7 @@ class LongitudinalDataset:
             dataset.setup_features_group(custom_groups)
             ```
 
-        !!! example "Converting File Formats"
+        !!! example "Advanced: converting file formats"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
 
@@ -550,18 +547,6 @@ def setup_features_group(
     ) -> None:
         """Configure feature groups and non-longitudinal features for longitudinal analysis.
 
-        !!! question "What is a feature group? What's the structure really?"
-            In a nutshell, a feature group is a collection of features sharing a common base longitudinal attribute
-            across different waves of data collection (e.g., "income_wave1", "income_wave2", "income_wave3"). Note that
-            aggregation reduces the dataset's temporal information significantly.
-
-            Each sublist in `feature_groups` represents a longitudinal attribute across waves, ordered oldest to most
-            recent (e.g., `[index_w1, index_w2]`). Use -1 for missing waves to align groups.
-
-            To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-
-            [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
         This method defines how features are grouped to capture temporal dependencies across waves. It supports three
         distinct input types, each suited to different use cases, with detailed examples and explanations below.
 
diff --git a/scikit_longitudinal/data_preparation/merwav_time_minus.py b/scikit_longitudinal/data_preparation/merwav_time_minus.py
index 3d371b6..0dd3e10 100644
--- a/scikit_longitudinal/data_preparation/merwav_time_minus.py
+++ b/scikit_longitudinal/data_preparation/merwav_time_minus.py
@@ -18,19 +18,6 @@ class MerWavTimeMinus(DataPreparationMixin):
     feature as distinct features, simplifying the dataset for traditional machine learning algorithms but losing temporal
     dependencies.
 
-    !!! note "Purpose of this Class"
-        This class is useful for understanding the data preparation step before applying preprocessors or classifiers.
-        It does not reduce or augment features but provides a clear view of the merged dataset.
-
-    !!! question "What is a feature group?"
-        In a nutshell, a feature group is a collection of features sharing a common base longitudinal attribute
-        across different waves of data collection (e.g., "income_wave1", "income_wave2", "income_wave3"). Note that
-        aggregation reduces the dataset's temporal information significantly.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
     Args:
         features_group (List[List[int]], optional): A temporal matrix representing the temporal dependency of a
             longitudinal dataset. Each sublist contains indices of a longitudinal attribute's waves. Defaults to None.
diff --git a/scikit_longitudinal/data_preparation/merwav_time_plus.py b/scikit_longitudinal/data_preparation/merwav_time_plus.py
index a7a8887..fbc917f 100644
--- a/scikit_longitudinal/data_preparation/merwav_time_plus.py
+++ b/scikit_longitudinal/data_preparation/merwav_time_plus.py
@@ -15,22 +15,8 @@ class MerWavTimePlus(DataPreparationMixin):
 
     The `MerWavTimePlus` class transforms longitudinal data by merging all features across waves into a single set
     while preserving their time indices. This maintains the temporal structure, enabling longitudinal machine learning
-    methods to leverage temporal dependencies and patterns.
-
-    !!! quote "MerWavTime(+)? Usefulness?"
-        In longitudinal studies, data is collected across multiple waves (time points), resulting in features that
-        capture temporal information. This method merges all features from all waves into a single set while preserving
-        their time indices, facilitating the use of time-aware machine learning techniques.
-
-    !!! question "What is a feature group?"
-        In a nutshell, a feature group is a collection of features sharing a common base longitudinal attribute
-        across different waves of data collection (e.g., "income_wave1", "income_wave2", "income_wave3"). Note that
-        aggregation reduces the dataset's temporal information significantly.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
+    methods to leverage temporal dependencies and patterns. See all
+    [longitudinal-data-aware machine learning estimators](../estimators/trees/lexico_decision_tree_classifier.md).
 
     Args:
         features_group (List[List[int]], optional): A temporal matrix representing the temporal dependency of a
diff --git a/scikit_longitudinal/data_preparation/separate_waves.py b/scikit_longitudinal/data_preparation/separate_waves.py
index c151db0..55cbafb 100644
--- a/scikit_longitudinal/data_preparation/separate_waves.py
+++ b/scikit_longitudinal/data_preparation/separate_waves.py
@@ -258,16 +258,6 @@ class SepWav(BaseEstimator, ClassifierMixin, DataPreparationMixin):
     selected, the base wave estimators must implement `predict_proba`, because the meta-learner is trained on
     wave-level class-probability outputs.
 
-    !!! question "What is a feature group?"
-        In a nutshell, a feature group is a collection of features sharing a common base longitudinal attribute
-        across different waves of data collection (e.g., "income_wave1", "income_wave2", "income_wave3").
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](
-        https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/
-        ){ .md-button }
-
     !!! note "Ensemble Strategies"
         Supported ensemble methods include:
 
@@ -311,7 +301,7 @@ class SepWav(BaseEstimator, ClassifierMixin, DataPreparationMixin):
     Examples:
         Below are examples using the "stroke.csv" dataset. Replace "stroke.csv" with your actual dataset path.
 
-        !!! example "Basic Usage with Majority Voting"
+        !!! example "Basic Usage"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
             from scikit_longitudinal.data_preparation import SepWav
@@ -349,7 +339,7 @@ class SepWav(BaseEstimator, ClassifierMixin, DataPreparationMixin):
             print(f"Accuracy: {accuracy}")
             ```
 
-        !!! example "Using Stacking Ensemble"
+        !!! example "Advanced: stacking ensemble"
             ```python
             from scikit_longitudinal.data_preparation import LongitudinalDataset
             from scikit_longitudinal.data_preparation import SepWav
@@ -390,7 +380,7 @@ class SepWav(BaseEstimator, ClassifierMixin, DataPreparationMixin):
             print(f"Accuracy: {accuracy}")
             ```
 
-        !!! example "Using Parallel Processing"
+        !!! example "Advanced: parallel processing"
             ```python
             # ... Similar to the previous example, but with parallel processing enabled ...
 
diff --git a/scikit_longitudinal/pipeline.py b/scikit_longitudinal/pipeline.py
index 35aa230..1d06bf9 100644
--- a/scikit_longitudinal/pipeline.py
+++ b/scikit_longitudinal/pipeline.py
@@ -30,25 +30,6 @@ class LongitudinalPipeline(Pipeline):
     with longitudinal data. It ensures that the structure of longitudinal features is updated and maintained throughout
     the pipeline's transformations, making it ideal for longitudinal classification tasks.
 
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        These attributes are updated dynamically as data passes through the pipeline, ensuring that temporal relationships
-        are preserved.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
     !!! note "Extension of scikit-learn's Pipeline"
         While maintaining the interface of scikit-learn's `Pipeline`, this class includes additional validations and
         methods to ensure the correct processing of longitudinal data. It integrates seamlessly with scikit-learn's
@@ -158,7 +139,7 @@ def custom_callback(step_idx, dataset, y, name, transformer):
     Examples:
         Below are examples demonstrating the usage of the `LongitudinalPipeline` class.
 
-        !!! example "Basic Usage with a Classifier"
+        !!! example "Basic Usage"
             ```python
             from scikit_longitudinal.pipeline import LongitudinalPipeline
             from scikit_longitudinal.data_preparation import LongitudinalDataset
@@ -203,7 +184,7 @@ def custom_callback(step_idx, dataset, y, name, transformer):
             print(f"Predictions: {y_pred}")
             ```
 
-        !!! example "Using a Custom Callback"
+        !!! example "Advanced: custom callback"
             ```python
             from scikit_longitudinal.pipeline import LongitudinalPipeline
 

From 2e4aaa441234b802dd055baaf555d26fe8f22a0d Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:06:32 +0100
Subject: [PATCH 05/10] refactor(docs): streamline estimator docstrings and
 unify examples

---
 .../lexicographical/lexico_deep_forest.py     |  58 +---
 .../lexico_gradient_boosting.py               |  66 +----
 .../lexicographical/lexico_random_forest.py   |  94 +++----
 .../longitudinal_stacking.py                  |  77 +-----
 .../longitudinal_voting.py                    | 100 +------
 .../ensemble/nested_trees/nested_trees.py     |  43 +--
 .../estimators/trees/TpT/TpT_decision_tree.py |  55 +---
 .../trees/TpT/TpT_decision_tree_regressor.py  | 256 +++++++-----------
 .../lexicographical/lexico_decision_tree.py   | 103 +++----
 .../lexico_decision_tree_regressor.py         |  62 ++---
 10 files changed, 256 insertions(+), 658 deletions(-)

diff --git a/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_deep_forest.py b/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_deep_forest.py
index c76f4c7..bf83dab 100644
--- a/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_deep_forest.py
+++ b/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_deep_forest.py
@@ -111,39 +111,12 @@ class LexicoDeepForestClassifier(CustomClassifierMixinEstimator):
     """
     Lexico Deep Forest Classifier for longitudinal data analysis.
 
-    The Lexico Deep Forest Classifier is an advanced ensemble algorithm designed specifically for longitudinal data
-    analysis. It extends the fundamental principles of the Deep Forest framework by incorporating longitudinal-adapted
-    base estimators to capture the temporal complexities and interdependencies inherent in longitudinal data. The
-    classifier combines accurate learners (longitudinal base estimators) and weak learners (diversity estimators) to
-    improve robustness and generalization. It supports binary and multiclass targets through the standard classifier
-    surface: `fit`, `predict`, `predict_proba`, and `classes_`.
-
-    !!! tip "Why Use LexicoDeepForestClassifier?"
-        This classifier is ideal for longitudinal datasets where temporal structure is crucial. By leveraging a deep
-        forest architecture with longitudinal-adapted estimators, it captures complex patterns and temporal dependencies
-        effectively—perfect for applications like medical studies or time-series classification.
-
-    !!! question "How Does It Work?"
-        The classifier builds a cascade of forests, where each layer uses the predictions from the previous layer as
-        additional features. The base estimators are longitudinal-adapted classifiers like `LexicoRandomForestClassifier`,
-        which use lexicographic optimization to prioritize recent data points. Diversity estimators (weak learners) are
-        optionally included to enhance the ensemble's diversity and predictive performance.
-
-    !!! note "Performance Boost with Cython"
-        The underlying decision trees use a Cython-optimized splitter (`node_lexicoRF_split`) for faster computation.
-        See the [Cython implementation](https://github.com/simonprovost/scikit-lexicographical-trees/blob/21443b9dce51434b3198ccabac8bafc4698ce953/sklearn/tree/_splitter.pyx#L695)
-        for details.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes define the temporal structure:
-
-        - **features_group**: A list of lists, each sublist containing indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent (e.g., `[[0,1], [2,3]]` for two attributes with two waves each).
-        - **non_longitudinal_features**: Indices of static features (not used in lexicographic optimization but included
-          in standard splits).
-
-        Accurate configuration is essential for leveraging temporal patterns. See the
-        [Temporal Dependency Guide](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/) for more.
+    This classifier extends the Deep Forest framework for longitudinal data by stacking layers of
+    longitudinal-adapted base estimators (typically `LexicoRandomForestClassifier`) so each layer's predictions
+    become additional features for the next. Every base tree applies a lexicographic split-selection rule: the
+    primary objective maximises the information-gain ratio (entropy criterion), and the secondary objective
+    favours features from more recent waves whenever competing gain ratios are within `threshold_gain`. For more
+    information on Deep Forest, see [DF21](https://deep-forest.readthedocs.io/en/stable/).
 
     Args:
         features_group (List[List[int]], optional):
@@ -176,7 +149,7 @@ class LexicoDeepForestClassifier(CustomClassifierMixinEstimator):
             The class labels.
 
     Examples:
-        !!! example "Basic Usage with LexicoRandomForestClassifier"
+        !!! example "Basic Usage"
 
             ```python
             from scikit_longitudinal.estimators.ensemble.lexicographical.lexico_deep_forest import LexicoDeepForestClassifier, \
@@ -209,7 +182,7 @@ class LexicoDeepForestClassifier(CustomClassifierMixinEstimator):
             print(f"Predictions: {y_pred}")
             ```
 
-        !!! example "Using Multiple Estimator Types"
+        !!! example "Advanced: multiple estimator types"
 
             ```python
             # ... Similar setup as above ...
@@ -227,7 +200,7 @@ class LexicoDeepForestClassifier(CustomClassifierMixinEstimator):
             # ... Similar prediction and evaluation as above ...
             ```
 
-        !!! example "Disabling Diversity Estimators"
+        !!! example "Advanced: disabling diversity estimators"
 
             ```python
             # ... Similar setup as above ...
@@ -241,12 +214,6 @@ class LexicoDeepForestClassifier(CustomClassifierMixinEstimator):
 
             # ... Similar prediction and evaluation as above ...
             ```
-
-    Notes:
-        - **References**:
-
-          - Zhou, Z.H. and Feng, J., 2019. "Deep forest." *National Science Review*, 6(1), pp.74-86.
-          - [Deep Forest GitHub](https://github.com/LAMDA-NJU/Deep-Forest)
     """
 
     # pylint: disable=too-many-arguments,invalid-name,signature-differs,no-member
@@ -349,9 +316,6 @@ def _fit(
         !!! tip "Configuration Tip"
             Experiment with different combinations of `longitudinal_base_estimators` and `diversity_estimators` to
             find the optimal balance between accuracy and diversity for your dataset.
-
-        !!! note
-            Ensure `features_group` accurately maps your data's temporal structure for optimal performance.
         """
         if self.single_classifier_type is not None and self.single_count is not None:
             self.longitudinal_base_estimators = [
@@ -404,9 +368,5 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
         Returns:
             np.ndarray:
                 The predicted class probabilities for each input sample.
-
-        !!! question "When to Use Probabilities?"
-            Use `predict_proba` instead of `predict` when you need to assess confidence levels or apply custom
-            decision thresholds rather than relying on the default class assignment.
         """
         return self._deep_forest.predict_proba(X)
diff --git a/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_gradient_boosting.py b/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_gradient_boosting.py
index 9def93d..c618543 100644
--- a/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_gradient_boosting.py
+++ b/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_gradient_boosting.py
@@ -60,47 +60,11 @@ class LexicoGradientBoostingClassifier(CustomClassifierMixinEstimator):
     """
     Lexico Gradient Boosting Classifier for longitudinal data analysis.
 
-    The Lexico Gradient Boosting Classifier is an advanced ensemble algorithm designed specifically for longitudinal
-    datasets. It incorporates the fundamental principles of the Gradient Boosting framework while utilizing
-    longitudinal-adapted base estimators to capture the temporal complexities and interdependencies intrinsic to
-    longitudinal data. The base estimators are Lexico Decision Tree Regressors, which are specialized decision tree
-    models capable of handling longitudinal data through a lexicographic optimization approach. The classifier follows
-    the familiar binary and multiclass interface of `sklearn.ensemble.GradientBoostingClassifier`.
-
-    !!! tip "Why Use LexicoGradientBoostingClassifier?"
-        This classifier is ideal for longitudinal datasets where temporal recency is crucial. By leveraging lexicographic
-        optimization within a boosting framework, it iteratively improves predictions while prioritizing recent
-        measurements—perfect for applications like patient health monitoring or financial forecasting.
-
-    !!! question "How Does Lexicographic Optimization Work?"
-        The base estimators (Lexico Decision Tree Regressors) use a bi-objective split selection strategy:
-
-        1. **Primary**: Minimize the loss (using "friedman_mse" criterion).
-        2. **Secondary**: Favor features from more recent waves when loss reductions are similar (within `threshold_gain`).
-
-        This ensures both statistical accuracy and temporal relevance are optimized, with boosting aggregating these
-        decisions for enhanced predictive power.
-
-    !!! note "Performance Boost with Cython"
-        The underlying splitter (`node_lexicoRF_split`) is optimized in Cython for faster computation. See the
-        [Cython implementation](https://github.com/simonprovost/scikit-lexicographical-trees/blob/21443b9dce51434b3198ccabac8bafc4698ce953/sklearn/tree/_splitter.pyx#L695)
-        for details.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
+    This classifier extends scikit-learn's `GradientBoostingClassifier` for longitudinal data by integrating a
+    lexicographic optimisation approach within each base learner (a `LexicoDecisionTreeRegressor`). Splits are
+    evaluated with a bi-objective rule: the primary objective minimises the loss (`friedman_mse` criterion), and
+    the secondary objective favours features from more recent waves whenever competing loss reductions are within
+    `threshold_gain`. Boosting aggregates these decisions over successive iterations by fitting residuals.
 
     Args:
         threshold_gain (float, default=0.0015):
@@ -144,7 +108,7 @@ class LexicoGradientBoostingClassifier(CustomClassifierMixinEstimator):
             The class labels.
 
     Examples:
-        !!! example "Basic Usage with Dummy Longitudinal Data"
+        !!! example "Basic Usage"
 
             ```python
             from sklearn.metrics import accuracy_score
@@ -165,7 +129,7 @@ class LexicoGradientBoostingClassifier(CustomClassifierMixinEstimator):
             print(f"Accuracy: {accuracy_score(dataset.y_test, y_pred)}")
             ```
 
-        !!! example "Tuning Learning Rate and Threshold Gain"
+        !!! example "Advanced: tuning learning rate and threshold gain"
 
             ```python
             # ... Similar setup as above ...
@@ -182,15 +146,6 @@ class LexicoGradientBoostingClassifier(CustomClassifierMixinEstimator):
 
             # ... Similar evaluation as above ...
             ```
-
-    Notes:
-        - **References**:
-
-          - Ribeiro, C. and Freitas, A., 2020. "A new random forest method for longitudinal data classification using a
-            lexicographic bi-objective approach." *2020 IEEE Symposium Series on Computational Intelligence (SSCI)*,
-            pp. 806-813.
-          - Ribeiro, C. and Freitas, A.A., 2024. "A lexicographic optimisation approach to promote more recent features
-            on longitudinal decision-tree-based classifiers." *Artificial Intelligence Review*, 57(4), p.84.
     """
 
     def __init__(
@@ -256,9 +211,6 @@ def _fit(
         !!! tip "Tuning Tip"
             Adjust `n_estimators` and `learning_rate` to balance model complexity and convergence speed. A lower
             `learning_rate` with more `n_estimators` can improve generalization but increases computation time.
-
-        !!! note
-            Ensure `features_group` accurately maps your data's temporal structure for optimal performance.
         """
         self._lexico_gradient_boosting = GradientBoostingClassifier(
             splitter=self.splitter,
@@ -302,10 +254,6 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
         Returns:
             np.ndarray:
                 The predicted class probabilities for each input sample.
-
-        !!! question "When to Use Probabilities?"
-            Use `predict_proba` instead of `predict` when you need to assess confidence levels or apply custom
-            decision thresholds rather than relying on the default class assignment.
         """
         return self._lexico_gradient_boosting.predict_proba(X)
 
diff --git a/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_random_forest.py b/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_random_forest.py
index b05e280..0421afc 100644
--- a/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_random_forest.py
+++ b/scikit_longitudinal/estimators/ensemble/lexicographical/lexico_random_forest.py
@@ -9,47 +9,12 @@ class LexicoRandomForestClassifier(RandomForestClassifier):
     """
     Lexico Random Forest Classifier for longitudinal data classification.
 
-    The Lexico Random Forest Classifier is an advanced ensemble algorithm tailored for longitudinal data analysis. It
-    extends the traditional Random Forest by integrating a lexicographic optimization approach within each decision tree,
-    prioritizing more recent data points (waves) for splits. This is based on the premise that recent measurements are more
-    predictive and relevant, making it ideal for applications like medical studies or time-series classification. The
-    implementation leverages a Cython-optimized fork of scikit-learn's decision tree for enhanced efficiency and supports
-    both binary and multiclass targets through the standard `fit`, `predict`, `predict_proba`, and `classes_` API.
-
-    !!! tip "Why Use LexicoRandomForestClassifier?"
-        This classifier excels with longitudinal datasets where temporal recency is key. By combining lexicographic
-        optimization with the ensemble strength of random forests, it captures evolving patterns while minimizing
-        overfitting—perfect for robust predictive modeling.
-
-    !!! question "How Does Lexicographic Optimization Work?"
-        Each tree in the forest employs a bi-objective split selection strategy:
-
-        1. **Primary**: Maximize the information gain ratio using the "entropy" criterion.
-        2. **Secondary**: Favor features from more recent waves when gain ratios are similar (within `threshold_gain`).
-
-        This ensures both statistical purity and temporal relevance are optimized, with the ensemble aggregating these
-        decisions for improved accuracy.
-
-    !!! note "Performance Boost with Cython"
-        The splitter (`node_lexicoRF_split`) is optimized in Cython for faster computation. See the
-        [Cython implementation](https://github.com/simonprovost/scikit-lexicographical-trees/blob/21443b9dce51434b3198ccabac8bafc4698ce953/sklearn/tree/_splitter.pyx#L695)
-        for details.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
+    This classifier extends scikit-learn's `RandomForestClassifier` for longitudinal data by integrating a
+    lexicographic optimisation approach within each tree of the forest, based on the premise that recent
+    measurements are more predictive and relevant. Splits are evaluated with a bi-objective rule: the primary
+    objective maximises the information-gain ratio (entropy criterion), and the secondary objective favours
+    features from more recent waves whenever competing gain ratios are within `threshold_gain`. The ensemble
+    aggregates these temporally aware trees to reduce overfitting while preserving recency-driven decisions.
 
     Args:
         n_estimators (int, default=100):
@@ -100,7 +65,7 @@ class LexicoRandomForestClassifier(RandomForestClassifier):
             Fitted tree ensemble.
 
     Examples:
-        !!! example "Basic Usage with Dummy Longitudinal Data"
+        !!! example "Basic Usage"
 
             ```python
             from sklearn.metrics import accuracy_score
@@ -121,7 +86,7 @@ class LexicoRandomForestClassifier(RandomForestClassifier):
             print(f"Accuracy: {accuracy_score(dataset.y_test, y_pred)}")
             ```
 
-        !!! example "Tuning Threshold Gain"
+        !!! example "Advanced: tuning threshold gain"
 
             ```python
             from sklearn.metrics import accuracy_score
@@ -174,15 +139,6 @@ class LexicoRandomForestClassifier(RandomForestClassifier):
                 grid_search.fit(X, y)
                 print(f"Best parameters: {grid_search.best_params_}")
                 ```
-
-    Notes:
-        - **References**:
-
-          - Ribeiro, C. and Freitas, A., 2020. "A new random forest method for longitudinal data classification using a
-            lexicographic bi-objective approach." *2020 IEEE Symposium Series on Computational Intelligence (SSCI)*,
-            pp. 806-813.
-          - Ribeiro, C. and Freitas, A.A., 2024. "A lexicographic optimisation approach to promote more recent features
-            on longitudinal decision-tree-based classifiers." *Artificial Intelligence Review*, 57(4), p.84.
     """
 
     def __init__(
@@ -279,11 +235,39 @@ def fit(self, X, y, sample_weight=None, *args, **kwargs):
         !!! tip "Tuning Tip"
             Adjust `n_estimators` and `threshold_gain` to balance accuracy and computation time. Start with defaults
             and refine based on your dataset.
-
-        !!! note
-            Ensure `features_group` accurately maps your data's temporal structure for optimal performance.
         """
         if self.features_group is None:
             raise ValueError("The features_group parameter must be provided.")
 
         return super().fit(X, y, sample_weight=sample_weight, *args, **kwargs)
+
+    def predict(self, X):
+        """Predict class labels for the input samples.
+
+        Delegates to scikit-learn's `RandomForestClassifier`. The predicted class is the one with the
+        highest mean probability estimate across the trees (a probability-weighted vote).
+
+        Args:
+            X (array-like of shape (n_samples, n_features)):
+                Input samples.
+
+        Returns:
+            np.ndarray: Predicted class labels of shape `(n_samples,)`.
+        """
+        return super().predict(X)
+
+    def predict_proba(self, X):
+        """Predict class probabilities for the input samples.
+
+        Inherited from scikit-learn's `RandomForestClassifier`. Probabilities are the mean of the
+        probabilistic predictions of the individual trees.
+
+        Args:
+            X (array-like of shape (n_samples, n_features)):
+                Input samples.
+
+        Returns:
+            np.ndarray: Class probabilities of shape `(n_samples, n_classes)`, with columns ordered
+            as in `self.classes_`.
+        """
+        return super().predict_proba(X)
diff --git a/scikit_longitudinal/estimators/ensemble/longitudinal_stacking/longitudinal_stacking.py b/scikit_longitudinal/estimators/ensemble/longitudinal_stacking/longitudinal_stacking.py
index f09c9d5..de395b8 100644
--- a/scikit_longitudinal/estimators/ensemble/longitudinal_stacking/longitudinal_stacking.py
+++ b/scikit_longitudinal/estimators/ensemble/longitudinal_stacking/longitudinal_stacking.py
@@ -76,29 +76,10 @@ def _extract_wave(self, X: np.ndarray) -> np.ndarray:
 
 class LongitudinalStackingClassifier(CustomClassifierMixinEstimator):
     """
-    Longitudinal Stacking Classifier for ensemble learning on longitudinal data.
-
-    The Longitudinal Stacking Classifier is a sophisticated ensemble method designed to handle the unique challenges
-    posed by longitudinal data. It leverages a stacking approach where multiple base estimators are trained, and their
-    predicted class probabilities are used as input features for a meta-learner, which generates the final
-    prediction. This method excels at capturing complex temporal patterns by learning from the combined strengths of
-    diverse base models, and supports both binary and multiclass targets.
-
-    !!! warning "When to Use?"
-        This classifier is primarily used with the "SepWav" (Separate Waves) strategy but can also be applied with
-        longitudinal-based estimators that do not follow the SepWav approach if preferred.
-
-    !!! info "SepWav (Separate Waves) Strategy"
-        The SepWav strategy involves training separate classifiers for each wave's features and the class variable.
-        The class-probability outputs from these classifiers are then combined using stacking, where a meta-learner
-        (e.g., Logistic Regression, Decision Tree, or Random Forest) learns to make the final prediction based on the
-        base classifiers' outputs.
-
-    !!! info "Wrapper Around Sklearn StackingClassifier"
-        This class wraps the `sklearn` StackingClassifier, offering a familiar interface while incorporating
-        enhancements for longitudinal data. As in scikit-learn, base estimators are cloned and refitted during
-        stacking unless a prefit workflow is explicitly requested. When `extract_wave` is provided, those internal
-        refits remain wave-specific.
+    Trains a meta-learner on the class-probability outputs of the pre-trained base estimators. Each base estimator
+    must implement `predict_proba`; the meta-learner is then fitted on the stacked probabilities to produce the
+    final prediction. Supports both binary and multiclass targets, and wraps scikit-learn's `StackingClassifier`
+    under the hood. When `extract_wave` is provided, its internal refits of base estimators stay wave-specific.
 
     Args:
         estimators (List[CustomClassifierMixinEstimator]):
@@ -120,56 +101,6 @@ class LongitudinalStackingClassifier(CustomClassifierMixinEstimator):
         ValueError: If no base estimators are provided, if a base estimator does not implement `predict_proba`, or if
             the meta-learner is not suitable.
         NotFittedError: If attempting to predict or predict_proba before fitting the model.
-
-    Examples:
-        !!! example "Basic Usage with Dummy Longitudinal Data"
-
-            ```python
-            from scikit_longitudinal.estimators.ensemble.longitudinal_stacking import LongitudinalStackingClassifier
-            from sklearn.ensemble import RandomForestClassifier
-            from scikit_longitudinal.estimators.ensemble.lexicographical import LexicoRandomForestClassifier
-            from sklearn.linear_model import LogisticRegression
-            import numpy as np
-
-            # Dummy data
-            X = np.array([[0, 1, 0, 1, 45, 1], [1, 1, 1, 1, 50, 0], [0, 0, 0, 0, 55, 1]])
-            y = np.array([0, 1, 2])
-            features_group = [[0, 1], [2, 3]]
-
-            # Train base estimators
-            rf = RandomForestClassifier().fit(X, y)
-            lexico_rf = LexicoRandomForestClassifier(features_group=features_group).fit(X, y)
-
-            # Create and fit the stacking classifier
-            clf = LongitudinalStackingClassifier(
-                estimators=[('rf', rf), ('lexico_rf', lexico_rf)],
-                meta_learner=LogisticRegression()
-            )
-            clf.fit(X, y)
-            y_pred = clf.predict(X)
-            print(f"Predictions: {y_pred}")
-            ```
-
-        !!! example "Using a Decision Tree as Meta-Learner with Parallel Processing"
-
-            ```python
-            from sklearn.tree import DecisionTreeClassifier
-            clf = LongitudinalStackingClassifier(
-                estimators=[('rf', rf), ('lexico_rf', lexico_rf)],
-                meta_learner=DecisionTreeClassifier(),
-                n_jobs=-1  # Use all available CPUs
-            )
-            clf.fit(X, y)
-            y_pred = clf.predict(X)
-            print(f"Predictions: {y_pred}")
-            ```
-
-    Notes:
-        - **References**:
-
-          - Ribeiro, C. and Freitas, A.A., 2019. "A mini-survey of supervised machine learning approaches for coping
-            with ageing-related longitudinal datasets." *3rd Workshop on AI for Aging, Rehabilitation and Independent
-            Assisted Living (ARIAL)*, held as part of IJCAI-2019.
     """
 
     def __init__(
diff --git a/scikit_longitudinal/estimators/ensemble/longitudinal_voting/longitudinal_voting.py b/scikit_longitudinal/estimators/ensemble/longitudinal_voting/longitudinal_voting.py
index 992ac36..eaf025e 100644
--- a/scikit_longitudinal/estimators/ensemble/longitudinal_voting/longitudinal_voting.py
+++ b/scikit_longitudinal/estimators/ensemble/longitudinal_voting/longitudinal_voting.py
@@ -17,9 +17,6 @@ class LongitudinalEnsemblingStrategy(Enum):
     """
     An enum for the different longitudinal voting strategies.
 
-    !!! note "Math Plugin Seems Capricious"
-        We will sometime not "interpret" the math on purpose to avoid yielding `math error plugin`.
-
     Attributes:
         MAJORITY_VOTING (int):
             Simple consensus voting where the most frequent prediction is selected.
@@ -27,25 +24,19 @@ class LongitudinalEnsemblingStrategy(Enum):
             Weights each classifier's vote based on the recency of its wave using a linear decay.
             Weight formula:
 
-            ```math
-            ( w_i = \\frac{i}{\sum_{j=1}^{N} j} )
-            ```
+            $$w_i = \\frac{i}{\\sum_{j=1}^{N} j}$$
 
         DECAY_EXPONENTIAL_VOTING (int):
             Weights each classifier's vote based on the recency of its wave using an exponential decay.
             Weight formula:
 
-            ```math
-            ( w_i = \\frac{e^{i}}{\sum_{j=1}^{N} e^{j}} )
-            ```
+            $$w_i = \\frac{e^{i}}{\\sum_{j=1}^{N} e^{j}}$$
 
         CV_BASED_VOTING (int):
             Weights each classifier based on its cross-validation accuracy on the training data.
             Weight formula:
 
-            ```math
-            ( w_i = \\frac{A_i}{\sum_{j=1}^{N} A_j} )
-            ```
+            $$w_i = \\frac{A_i}{\\sum_{j=1}^{N} A_j}$$
 
         STACKING (int):
             Stacking ensemble strategy uses a meta-learner to combine predictions of base classifiers.
@@ -53,11 +44,11 @@ class LongitudinalEnsemblingStrategy(Enum):
             probabilities.
             This approach is suitable when the cardinality of meta-features is smaller than the original feature set.
 
-            In stacking, for each wave \\( i \\) (\\( i \\in \\{1, 2, \\ldots, N\\} \\)), a base classifier \\( C_i \\)
-            is trained on \\( (X_i, T_N) \\). The class-probability output from \\( C_i \\) is denoted as \\( V_i \\),
-            forming the meta-features \\( \\mathbf{V} = [V_1, V_2, ..., V_N] \\). The meta-learner \\( M \\) is then
-            trained on \\( (\\mathbf{V}, T_N) \\), and for a new instance \\( x \\), the final prediction is
-            \\( P(x) = M(\\mathbf{V}(x)) \\).
+            In stacking, for each wave $i$ ($i \\in \\{1, 2, \\ldots, N\\}$), a base classifier $C_i$
+            is trained on $(X_i, T_N)$. The class-probability output from $C_i$ is denoted as $V_i$,
+            forming the meta-features $\\mathbf{V} = [V_1, V_2, ..., V_N]$. The meta-learner $M$ is then
+            trained on $(\\mathbf{V}, T_N)$, and for a new instance $x$, the final prediction is
+            $P(x) = M(\\mathbf{V}(x))$.
 
     """
 
@@ -70,31 +61,9 @@ class LongitudinalEnsemblingStrategy(Enum):
 
 class LongitudinalVotingClassifier(CustomClassifierMixinEstimator):
     """
-    Longitudinal Voting Classifier for ensemble learning on longitudinal data.
-
-    The Longitudinal Voting Classifier is a versatile ensemble method designed to handle the unique challenges posed by
-    longitudinal data. It leverages different voting strategies to combine predictions from multiple base estimators,
-    enhancing predictive performance. The base estimators are individually trained, and their predictions are
-    aggregated based on the chosen voting strategy to generate the final prediction. The classifier supports both
-    binary and multiclass targets.
-
-    !!! warning "When to Use?"
-        This classifier is primarily used when the "SepWav" (Separate Waves) strategy is employed. However, it can also
-        be applied with only longitudinal-based estimators that do not follow the SepWav approach if desired.
-
-    !!! info "SepWav (Separate Waves) Strategy"
-
-        The SepWav strategy involves considering each wave's features and the class variable as a separate dataset,
-        then learning a classifier for each dataset. The class labels predicted by these classifiers are combined into
-        a final predicted class label. This combination can be achieved using various approaches: simple majority
-        voting, weighted voting with weights decaying linearly or exponentially for older waves, weights optimised by
-        cross-validation on the training set (current class), and stacking methods that use the classifiers' predicted
-        class probabilities as input for learning a meta-classifier (see LongitudinalStacking).
-
-    !!! info "Wrapper Around Sklearn VotingClassifier"
-
-        This class wraps the `sklearn` VotingClassifier, offering a familiar interface while incorporating enhancements
-        for longitudinal data.
+    Aggregates predictions from pre-trained base estimators using the voting rule specified by
+    `LongitudinalEnsemblingStrategy` (majority, linear or exponential decay, or cross-validation-weighted). Supports
+    both binary and multiclass targets, and wraps scikit-learn's `VotingClassifier` under the hood.
 
     Args:
         voting (LongitudinalEnsemblingStrategy, default=LongitudinalEnsemblingStrategy.MAJORITY_VOTING):
@@ -116,54 +85,7 @@ class probabilities as input for learning a meta-classifier (see LongitudinalSta
         ValueError: If no estimators are provided or if an invalid voting strategy is specified.
         NotFittedError: If attempting to predict or predict_proba before fitting the model.
 
-    Examples:
-        !!! example "Basic Usage with Dummy Longitudinal Data"
-            ```python
-            from scikit_longitudinal.estimators.ensemble.longitudinal_voting import (
-                LongitudinalVotingClassifier,
-                LongitudinalEnsemblingStrategy
-            )
-            from sklearn.ensemble import RandomForestClassifier
-            from scikit_longitudinal.estimators.ensemble.lexicographical import LexicoRandomForestClassifier
-            import numpy as np
-
-            # Dummy data
-            X = np.array([[0, 1, 0, 1, 45, 1], [1, 1, 1, 1, 50, 0], [0, 0, 0, 0, 55, 1]])
-            y = np.array([0, 1, 2])
-            features_group = [[0, 1], [2, 3]]
-
-            # Train estimators
-            rf = RandomForestClassifier().fit(X, y)
-            lexico_rf = LexicoRandomForestClassifier(features_group=features_group).fit(X, y)
-
-            # Create and fit the voting classifier
-            clf = LongitudinalVotingClassifier(
-                voting=LongitudinalEnsemblingStrategy.MAJORITY_VOTING,
-                estimators=[('rf', rf), ('lexico_rf', lexico_rf)],
-            )
-            clf.fit(X, y)
-            y_pred = clf.predict(X)
-            print(f"Predictions: {y_pred}")
-            ```
-
-        !!! example "Using Cross-Validation-Based Weighted Voting"
-            ```python
-            clf = LongitudinalVotingClassifier(
-                voting=LongitudinalEnsemblingStrategy.CV_BASED_VOTING,
-                estimators=[('rf', rf), ('lexico_rf', lexico_rf)],
-            )
-            clf.fit(X, y)
-            y_pred = clf.predict(X)
-            print(f"Predictions: {y_pred}")
-            ```
-
     Notes:
-        - **References**:
-
-          - Ribeiro, C. and Freitas, A.A., 2019. "A mini-survey of supervised machine learning approaches for coping
-            with ageing-related longitudinal datasets." *3rd Workshop on AI for Aging, Rehabilitation and Independent
-            Assisted Living (ARIAL)*, held as part of IJCAI-2019.
-
         - `predict_proba` returns normalised vote shares across classes. These are consistent with the hard-voting
           decision returned by `predict`, but they are not calibrated probabilities.
     """
diff --git a/scikit_longitudinal/estimators/ensemble/nested_trees/nested_trees.py b/scikit_longitudinal/estimators/ensemble/nested_trees/nested_trees.py
index b264082..0e4a3f6 100644
--- a/scikit_longitudinal/estimators/ensemble/nested_trees/nested_trees.py
+++ b/scikit_longitudinal/estimators/ensemble/nested_trees/nested_trees.py
@@ -20,28 +20,13 @@ class NestedTreesClassifier(CustomClassifierMixinEstimator):
     """
     Nested Trees Classifier for longitudinal data classification.
 
-    The Nested Trees Classifier is a unique and innovative algorithm tailored for longitudinal datasets. It enhances
-    traditional decision tree methods by embedding smaller decision trees within the nodes of a primary tree
-    structure, optimally leveraging the temporal information inherent in longitudinal data. This hierarchical approach
-    excels at capturing complex temporal patterns and dependencies, and supports both binary and multiclass labels.
-
-    !!! info "Structure Overview"
-        The outer tree uses a custom algorithm to select longitudinal attributes (groups of time-specific features).
-        Each node hosts an inner `DecisionTreeClassifier` from scikit-learn, partitioning data based on the selected
-        attribute, creating a nested decision-making process.
-
-        We highly  recommend to read the paper to better understand the primitive.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes define the temporal structure:
-
-        - **features_group**: A list of lists, each sublist containing indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent (e.g., `[[0,1], [2,3]]` for two attributes with two waves each).
-        - **non_longitudinal_features**: Indices of static features
-          (not used in temporal modelling but included in splits).
-
-        Accurate configuration is essential. See the
-        [Temporal Dependency Guide](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/).
+    The Nested Trees Classifier enhances traditional decision tree methods with a two-level, longitudinal-aware
+    construction: the outer tree picks splits on a **whole longitudinal attribute** (the group of time-specific
+    features that represent repeated measurements of the same variable across waves) instead of on a single
+    feature, and each outer node hosts an inner `DecisionTreeClassifier` from scikit-learn that partitions the
+    data using only the measurements of that selected attribute across time. This preserves the longitudinal
+    structure during model construction, keeps decisions interpretable (each outer node is labelled by one
+    attribute), and naturally captures temporal patterns and dependencies.
 
     Args:
         features_group (List[List[int]], optional):
@@ -73,7 +58,7 @@ class NestedTreesClassifier(CustomClassifierMixinEstimator):
             Unique class labels, set during fitting.
 
     Examples:
-        !!! example "Basic Usage with Dummy Longitudinal Data"
+        !!! example "Basic Usage"
 
             ```python
             from sklearn.metrics import accuracy_score
@@ -94,7 +79,7 @@ class NestedTreesClassifier(CustomClassifierMixinEstimator):
             print(f"Accuracy: {accuracy_score(dataset.y_test, y_pred)}")
             ```
 
-        !!! example "Customizing Inner Tree Hyperparameters"
+        !!! example "Advanced: customising inner tree hyperparameters"
 
             ```python
             # ... Similar setup as above ...
@@ -109,12 +94,6 @@ class NestedTreesClassifier(CustomClassifierMixinEstimator):
 
             # ... Similar prediction and evaluation as above ...
             ```
-
-    Notes:
-        - Requires accurate `features_group` and `non_longitudinal_features` setup for optimal temporal modeling.
-        - References: Ovchinnik, S., Otero, F., & Freitas, A.A. (2022). *Nested trees for longitudinal classification.*
-          ACM/SIGAPP Symposium on Applied Computing, 441-444.
-        - Original Java implementation: [Nested Trees GitHub](https://github.com/NestedTrees/NestedTrees).
     """
 
     # pylint: disable=too-many-arguments,too-many-positional-arguments,invalid-name,signature-differs,no-member
@@ -304,10 +283,6 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
 
         Raises:
             ValueError: If the classifier isn’t fitted.
-
-        !!! question "When to Use Probabilities?"
-            Use `predict_proba` instead of `predict` when you need confidence scores or custom thresholds, such as in
-            medical diagnostics.
         """
         if self.root is None:
             raise ValueError("The classifier must be fitted before making predictions.")
diff --git a/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree.py b/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree.py
index f071f66..eca9331 100644
--- a/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree.py
+++ b/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree.py
@@ -18,44 +18,21 @@ class TpTDecisionTreeClassifier(DecisionTreeClassifier):
     Time-penalised Trees (TpT) Decision Tree Classifier for longitudinal data classification.
 
     This classifier extends the standard Decision Tree algorithm to handle longitudinal data by incorporating a
-    **time-penalized split gain**. At a parent node time $t_p$, a candidate split at time $t_c$ has gain
-    $\\Delta I$ which is penalized as $\\Delta I \\cdot e^{-\\gamma (t_c - t_p)}$. In this Phase-1
+    **time-penalised split gain**. At a parent node time $t_p$, a candidate split at time $t_c$ has gain
+    $\\Delta I$ which is penalised as $\\Delta I \\cdot e^{-\\gamma (t_c - t_p)}$. In this Phase-1
     implementation, $t_c$ is proxied by the **wave index** of the splitting feature; in a later step we will
     propagate the true parent time through the builder to compute $t_c - t_p$ exactly.
 
-    !!! tip "Why Use TpTDecisionTreeClassifier?"
-        This classifier is ideal when working with longitudinal datasets where temporal dependency matters. By balancing
-        information gain with a penalty for later observations, it captures evolving patterns effectively—perfect for
-        applications like medical studies or time-series within time-series classification.
+    ??? note "LONG vs wide input — *[Soon To Be Deprecated](https://github.com/simonprovost/scikit-longitudinal/issues/64)*"
+        TpT internally operates on a **wide** matrix (features expanded over waves). If `assume_long_format=True`,
+        the classifier can accept a LONG-format dataframe and will convert it to the expected wide representation
+        before fitting (using `id_col`, `time_col`, `duration_col`, `time_step`, and `max_horizon`).
 
-    !!! question "How does TpT work?"
-        At each node, we evaluate candidate splits by a **time-penalized impurity improvement**:
-        $G_\\gamma = \\Delta I \\cdot e^{-\\gamma \\Delta t}$.
-        In this phase, $\\Delta t$ is approximated by the wave index of the splitting feature. A split is chosen
-        if its penalized gain is maximal among candidates. (We temporarily reuse the Cython parameter
-        `threshold_gain` as $\\gamma$.)
-
-    !!! note "Performance Boost with Cython"
-        The underlying splitter (`node_TpT_split`) is optimized in Cython for faster computation. Check out the
-        [Cython implementation](https://github.com/simonprovost/scikit-lexicographical-trees/blob/21443b9dce51434b3198ccabac8bafc4698ce953/sklearn/tree/_splitter.pyx#L695)
-        for a deep dive into the performance enhancements.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest//temporal_dependency/){ .md-button }
+        - LONG-format: one row per (subject, time) observation.
+        - wide format: one row per subject, with features duplicated across waves.
 
+        The conversion fills feature values up to each subject's duration/horizon and leaves NaNs beyond,
+        enabling "duration leaves" in the TpT logic.
 
     Args:
         gamma (float, optional):
@@ -122,7 +99,7 @@ class TpTDecisionTreeClassifier(DecisionTreeClassifier):
     Examples:
         Below are examples demonstrating the usage of the `TpTDecisionTreeClassifier` class.
 
-        !!! example "Basic Usage with Iris Dataset"
+        !!! example "Basic Usage"
 
             Please note that the Iris is not longitudinal data, but this example is for demonstration purposes only.
             We could not publicly use the dataset we use for our various papers without user registering
@@ -154,7 +131,7 @@ class TpTDecisionTreeClassifier(DecisionTreeClassifier):
             print(f"Accuracy: {accuracy}")
             ```
 
-        !!! example "Using with LongitudinalPipeline"
+        !!! example "Advanced: using with LongitudinalPipeline"
 
             ```python
             from scikit_longitudinal.pipeline import LongitudinalPipeline
@@ -190,14 +167,6 @@ class TpTDecisionTreeClassifier(DecisionTreeClassifier):
             y_pred = pipeline.predict(dataset.X_test)
             print(f"Predictions: {y_pred}")
             ```
-
-    Notes:
-        - Contributors of the code are: Mathias VALLA, Esteban MAUBOUSSIN, Alae KHIDOUR, Berkehan KOCAK and Sonny MUPFUNI
-        - The `features_group` parameter is essential for longitudinal data and must reflect the dataset's temporal structure.
-        - For non-longitudinal datasets, this classifier may not outperform the standard `DecisionTreeClassifier`.
-        - References:
-              - [1] Valla, M. Time-penalised trees (TpT): introducing a new tree-based data mining algorithm for time-varying covariates. Ann Math Artif Intell 92, 1609–1661 (2024). https://doi.org/10.1007/s10472-024-09950-w
-              - [2] Mathias Valla, Xavier Milhaud. Consistent Time-Aware Trees for Longitudinal Data: The Time-Penalized Tree. 2026. ⟨hal-05022929v2⟩ https://cnrs.hal.science/hal-05022929
     """
 
     _parameter_constraints = {
diff --git a/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree_regressor.py b/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree_regressor.py
index b397bc9..d581d45 100644
--- a/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree_regressor.py
+++ b/scikit_longitudinal/estimators/trees/TpT/TpT_decision_tree_regressor.py
@@ -16,38 +16,14 @@ class TpTDecisionTreeRegressor(DecisionTreeRegressor):
     """
     Time-penalised Trees (TpT) Decision Tree Regressor for longitudinal data regression.
 
-    This regressor extends standard CART regression trees to handle longitudinal covariates by incorporating
-    a **time-penalized split gain**. At a node associated with a parent time $t_p$, a candidate split
-    evaluated at time $t_c$ yields an impurity improvement $\\Delta I$ (typically based on variance
-    reduction / MSE), which is penalized as:
-
-    $G_\\gamma = \\Delta I \\cdot e^{-\\gamma (t_c - t_p)}$.
-
-    In the current implementation, $t_c$ is represented by the **wave index** of the splitting feature and
-    $t_p$ is propagated by the tree builder, so that the penalty depends on the *time distance* between
-    successive splits.
-
-    !!! tip "Why use TpTDecisionTreeRegressor?"
-        TpT is useful when:
-        - covariates are observed at multiple time points (waves),
-        - later observations can be informative but should be “paid for” (regularization in time),
-        - you want interpretable, sparse-in-time regression rules.
-
-        By balancing variance reduction with an exponential penalty for using later waves,
-        TpT tends to prefer earlier splits unless later waves provide substantially better predictive signal.
-
-    !!! question "How does TpT work in regression?"
-        At each node, for each candidate split, the regressor computes a variance-based improvement
-        (e.g., MSE reduction) and applies the time penalty:
-
-        $G_\\gamma = \\Delta I \\cdot e^{-\\gamma \\Delta t}$
-
-        where $\\Delta t = \\max(0, t_c - t_p)$.
-
-        The split maximizing $G_\\gamma$ is chosen, subject to classical CART constraints
-        (min samples per leaf, max depth, etc.).
-
-    !!! note "LONG vs wide input"
+    This regressor extends scikit-learn's `DecisionTreeRegressor` for longitudinal data by incorporating a
+    **time-penalised split gain**. At a node associated with a parent time $t_p$, a candidate split evaluated
+    at time $t_c$ yields an impurity improvement $\\Delta I$ (typically based on variance reduction / MSE),
+    which is penalised as $G_\\gamma = \\Delta I \\cdot e^{-\\gamma (t_c - t_p)}$. In the current implementation,
+    $t_c$ is represented by the **wave index** of the splitting feature and $t_p$ is propagated by the tree
+    builder, so that the penalty depends on the *time distance* between successive splits.
+
+    ??? note "LONG vs wide input — *[Soon To Be Deprecated](https://github.com/simonprovost/scikit-longitudinal/issues/64)*"
         TpT internally operates on a **wide** matrix (features expanded over waves). If `assume_long_format=True`,
         the regressor can accept a LONG-format dataframe and will convert it to the expected wide representation
         before fitting (using `id_col`, `time_col`, `duration_col`, `time_step`, and `max_horizon`).
@@ -55,130 +31,98 @@ class TpTDecisionTreeRegressor(DecisionTreeRegressor):
         - LONG-format: one row per (subject, time) observation.
         - wide format: one row per subject, with features duplicated across waves.
 
-        The conversion fills feature values up to each subject’s duration/horizon and leaves NaNs beyond,
-        enabling “duration leaves” in the TpT logic.
-
-    !!! question "Feature groups and temporal structure"
-        The parameter `features_group` encodes the temporal layout of longitudinal covariates:
-
-        - **features_group**: list of lists; each inner list contains the column indices corresponding to the same
-          covariate across waves, ordered from oldest to most recent.
-
-        Providing correct feature groups is essential to ensure that:
-        - the splitter can map features to wave indices (for time penalization),
-        - the model remains consistent with the dataset temporal structure.
-
-        To see more, we recommend the `Temporal Dependency` page:
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest//temporal_dependency/){ .md-button }
-
-    !!! note "Cython implementation"
-        The core split search is implemented in Cython for performance (TpT splitter).
-        This regressor is a thin sklearn-compatible wrapper around that optimized implementation.
-
-    Parameters
-    ----------
-    gamma : float, optional
-        Time-penalty rate $\\gamma$ in $e^{-\\gamma \\Delta t}$.
-        If not provided, falls back to `threshold_gain` for backward compatibility.
-    threshold_gain : float, optional
-        Backward-compatible alias for `gamma`. If both are provided, `gamma` takes precedence.
-        (Internally reused to match existing Cython parameter naming.)
-    features_group : list[list[int]], optional
-        Temporal grouping of feature indices (waves per covariate). Required when using wide format input.
-        If `assume_long_format=True`, this can be inferred/constructed during preprocessing depending on
-        how wide features are generated.
-    criterion : {"squared_error"}, default="squared_error"
-        Split criterion for regression. In this TpT regressor, the intended criterion is MSE / variance reduction.
-        (Other criteria may not be supported depending on the current Cython implementation.)
-    splitter : str, default="TpT"
-        Split strategy identifier. Must match the TpT splitter name exposed by the underlying Cython backend.
-    max_depth : int, optional
-        Maximum depth of the tree. If None, the tree expands until other stopping criteria apply.
-    min_samples_split : int, default=2
-        Minimum number of samples required to split an internal node.
-    min_samples_leaf : int, default=1
-        Minimum number of samples required to be at a leaf node.
-    min_weight_fraction_leaf : float, default=0.0
-        Minimum weighted fraction of the sum of weights required in each leaf.
-    max_features : int, float, {"sqrt", "log2"}, optional
-        Number of features to consider at each split.
-    random_state : int, RandomState instance, optional
-        Controls randomness of feature sampling and tie-breaking.
-    max_leaf_nodes : int, optional
-        Grow a tree with at most `max_leaf_nodes` leaves (best-first strategy when supported).
-    min_impurity_decrease : float, default=0.0
-        Minimum (unpenalized) impurity decrease required to split.
-    ccp_alpha : float, default=0.0
-        Complexity parameter used for Minimal Cost-Complexity Pruning.
-    store_leaf_values : bool, default=False
-        Whether to store the samples that fall into leaves in the `tree_` attribute.
-    monotonic_cst : list[int], optional
-        Monotonic constraints for features (if supported by the underlying sklearn tree code and compatible
-        with missing values / regression settings).
-
-    Long-format preprocessing parameters
-    -------------------------------
-    assume_long_format : bool, default=False
-        If True, interpret `X` as LONG-format and convert to wide prior to fitting.
-    id_col : str, default="id"
-        Subject identifier column name in LONG-format.
-    time_col : str, default="time_point"
-        Observation time column name in LONG-format.
-    duration_col : str, default="duration"
-        Subject-specific horizon/duration column name in LONG-format.
-    time_step : float, default=1.0
-        Temporal discretization step used to map times to wave indices.
-    max_horizon : float, optional
-        Optional cap for the horizon considered during preprocessing.
-
-    Attributes
-    ----------
-    n_features_in_ : int
-        Number of features seen during fit (wide representation).
-    tree_ : sklearn.tree._tree.Tree
-        The underlying fitted tree structure.
-    feature_importances_ : ndarray of shape (n_features,)
-        Impurity-based feature importances (variance reduction based).
-    _wide_feature_names_ : list[str]
-        Names of generated wide features (when LONG-format preprocessing is enabled).
-    _subject_ids_ : list[str]
-        Subject ids aligned with the wide matrix rows (when LONG-format preprocessing is enabled).
-
-    Notes
-    -----
-    - Contributors: Mathias VALLA, Esteban MAUBOUSSIN, Alae KHIDOUR, Berkehan KOCAK, Sonny MUPFUNI
-    - The intended regression criterion is MSE (variance reduction). If you enable different regression criteria,
-      ensure the Cython backend supports them.
-    - References:
-        - [1] Valla, M. *Time-penalised trees (TpT): introducing a new tree-based data mining algorithm for
-          time-varying covariates.* Ann Math Artif Intell 92, 1609–1661 (2024).
-        - [2] Valla, M., Milhaud, X. *Consistent Time-Aware Trees for Longitudinal Data: The Time-Penalized Tree.* 2026.
-          ⟨hal-05022929v2⟩ https://cnrs.hal.science/hal-05022929
-
-    Examples
-    --------
-    !!! example "Regression with LONG-format input"
-        ```python
-        import pandas as pd
-        from scikit_longitudinal.estimators.trees import TpTDecisionTreeRegressor
-
-        df_long = pd.read_csv("my_longitudinal_dataset.csv")
-        y = df_long["target"]
-        X = df_long.drop(columns=["target"])
-
-        reg = TpTDecisionTreeRegressor(
-            gamma=0.01,
-            assume_long_format=True,
-            id_col="id",
-            time_col="time_point",
-            duration_col="duration",
-            time_step=1.0,
-            max_depth=4,
-            random_state=0,
-        )
-        reg.fit(X, y)
-        preds = reg.predict(X)
-        ```
+        The conversion fills feature values up to each subject's duration/horizon and leaves NaNs beyond,
+        enabling "duration leaves" in the TpT logic.
+
+    Args:
+        gamma (float, optional):
+            Time-penalty rate $\\gamma$ in $e^{-\\gamma \\Delta t}$. If not provided, falls back to
+            `threshold_gain` for backward compatibility.
+        threshold_gain (float, optional):
+            Backward-compatible alias for `gamma`. If both are provided, `gamma` takes precedence. (Internally
+            reused to match existing Cython parameter naming.)
+        features_group (List[List[int]], optional):
+            Temporal grouping of feature indices (waves per covariate). Required when using wide-format input.
+            If `assume_long_format=True`, this can be inferred/constructed during preprocessing depending on
+            how wide features are generated.
+        max_horizon (int, optional):
+            Optional cap for the horizon considered during LONG-format preprocessing.
+        id_col (str, optional):
+            Subject identifier column name in LONG-format.
+        time_col (str, optional):
+            Observation time column name in LONG-format.
+        duration_col (str, optional):
+            Subject-specific horizon/duration column name in LONG-format.
+        time_step (float, default=1.0):
+            Temporal discretisation step used to map times to wave indices.
+        assume_long_format (bool, default=False):
+            If True, interpret `X` as LONG-format and convert to wide prior to fitting.
+        long_feature_columns (List[str], optional):
+            Subset of LONG-format columns to treat as features.
+        criterion (str, default="friedman_mse"):
+            Split criterion for regression. The intended criterion is MSE / variance reduction. (Other criteria
+            may not be supported depending on the current Cython implementation.)
+        splitter (str, default="TpT"):
+            Split strategy identifier. Must match the TpT splitter name exposed by the underlying Cython backend.
+        max_depth (Optional[int], default=None):
+            Maximum depth of the tree. If None, the tree expands until other stopping criteria apply.
+        min_samples_split (int, default=2):
+            Minimum number of samples required to split an internal node.
+        min_samples_leaf (int, default=1):
+            Minimum number of samples required to be at a leaf node.
+        min_weight_fraction_leaf (float, default=0.0):
+            Minimum weighted fraction of the sum of weights required in each leaf.
+        max_features (Optional[Union[int, float, str]], default=None):
+            Number of features to consider at each split.
+        random_state (Optional[int], default=None):
+            Controls randomness of feature sampling and tie-breaking.
+        max_leaf_nodes (Optional[int], default=None):
+            Grow a tree with at most `max_leaf_nodes` leaves (best-first strategy when supported).
+        min_impurity_decrease (float, default=0.0):
+            Minimum (unpenalised) impurity decrease required to split.
+        ccp_alpha (float, default=0.0):
+            Complexity parameter used for Minimal Cost-Complexity Pruning.
+        store_leaf_values (bool, default=False):
+            Whether to store the samples that fall into leaves in the `tree_` attribute.
+        monotonic_cst (Optional[List[int]], default=None):
+            Monotonic constraints for features (if supported by the underlying sklearn tree code and compatible
+            with missing values / regression settings).
+
+    Attributes:
+        n_features_in_ (int):
+            Number of features seen during fit (wide representation).
+        tree_ (sklearn.tree._tree.Tree):
+            The underlying fitted tree structure.
+        feature_importances_ (ndarray of shape (n_features,)):
+            Impurity-based feature importances (variance reduction based).
+        _wide_feature_names_ (List[str]):
+            Names of generated wide features (when LONG-format preprocessing is enabled).
+        _subject_ids_ (List[str]):
+            Subject ids aligned with the wide matrix rows (when LONG-format preprocessing is enabled).
+
+    Examples:
+        !!! example "Basic Usage"
+            ```python
+            import pandas as pd
+            from scikit_longitudinal.estimators.trees import TpTDecisionTreeRegressor
+
+            df_long = pd.read_csv("my_longitudinal_dataset.csv")
+            y = df_long["target"]
+            X = df_long.drop(columns=["target"])
+
+            reg = TpTDecisionTreeRegressor(
+                gamma=0.01,
+                assume_long_format=True,
+                id_col="id",
+                time_col="time_point",
+                duration_col="duration",
+                time_step=1.0,
+                max_depth=4,
+                random_state=0,
+            )
+            reg.fit(X, y)
+            preds = reg.predict(X)
+            ```
     """
 
     _parameter_constraints = {
diff --git a/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree.py b/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree.py
index 27afd1f..3d24c7e 100644
--- a/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree.py
+++ b/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree.py
@@ -9,46 +9,11 @@ class LexicoDecisionTreeClassifier(DecisionTreeClassifier):
     """
     Lexico Decision Tree Classifier for longitudinal data classification.
 
-    This classifier extends the standard Decision Tree algorithm to handle longitudinal data by incorporating a
-    lexicographic optimization approach. It prioritizes more recent data points (waves) when determining splits,
-    based on the premise that recent measurements are more predictive and relevant. The implementation leverages a
-    Cython-optimized fork of scikit-learn's decision tree for improved efficiency, and supports the same binary and
-    multiclass classification workflow as `sklearn.tree.DecisionTreeClassifier`.
-
-    !!! tip "Why Use LexicoDecisionTreeClassifier?"
-        This classifier is ideal when working with longitudinal datasets where temporal recency matters. By balancing
-        information gain with a preference for recent features, it captures evolving patterns effectively—perfect for
-        applications like medical studies or time-series within time-series classification.
-
-    !!! question "How Does Lexicographic Optimization Work?"
-        The algorithm evaluates splits using two objectives:
-
-        1. **Primary**: Maximize the information gain ratio (how much a split reduces uncertainty).
-        2. **Secondary**: Favor features from more recent waves when gain ratios are similar (within `threshold_gain`).
-
-        This dual approach ensures that the tree leverages both statistical purity and temporal relevance.
-
-    !!! note "Performance Boost with Cython"
-        The underlying splitter (`node_lexicoRF_split`) is optimized in Cython for faster computation. Check out the
-        [Cython implementation](https://github.com/simonprovost/scikit-lexicographical-trees/blob/21443b9dce51434b3198ccabac8bafc4698ce953/sklearn/tree/_splitter.pyx#L695)
-        for a deep dive into the performance enhancements.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
+    This classifier extends scikit-learn's `DecisionTreeClassifier` for longitudinal data by integrating a
+    lexicographic optimisation approach that prioritises more recent waves during split selection, based on the
+    premise that recent measurements are more predictive and relevant. Splits are evaluated with a bi-objective
+    rule: the primary objective maximises the information-gain ratio (entropy criterion), and the secondary
+    objective favours features from more recent waves whenever competing gain ratios are within `threshold_gain`.
 
     Args:
         threshold_gain (float, default=0.0015):
@@ -108,7 +73,7 @@ class LexicoDecisionTreeClassifier(DecisionTreeClassifier):
     Examples:
         Below are examples demonstrating the usage of the `LexicoDecisionTreeClassifier` class.
 
-        !!! example "Basic Usage with Iris Dataset"
+        !!! example "Basic Usage"
 
             Please note that the Iris is not longitudinal data, but this example is for demonstration purposes only.
             We could not publicly use the dataset we use for our various papers without user registering
@@ -140,7 +105,7 @@ class LexicoDecisionTreeClassifier(DecisionTreeClassifier):
             print(f"Accuracy: {accuracy}")
             ```
 
-        !!! example "Using with LongitudinalPipeline"
+        !!! example "Advanced: using with LongitudinalPipeline"
 
             ```python
             from scikit_longitudinal.pipeline import LongitudinalPipeline
@@ -176,17 +141,6 @@ class LexicoDecisionTreeClassifier(DecisionTreeClassifier):
             y_pred = pipeline.predict(dataset.X_test)
             print(f"Predictions: {y_pred}")
             ```
-
-    Notes:
-        - The `features_group` parameter is essential for longitudinal data and must reflect the dataset's temporal structure.
-        - For non-longitudinal datasets, this classifier may not outperform the standard `DecisionTreeClassifier`.
-        - References:
-              - Ribeiro, C. and Freitas, A., 2020. A new random forest method for longitudinal data classification using a
-                lexicographic bi-objective approach. In 2020 IEEE Symposium Series on Computational Intelligence (SSCI)
-                (pp. 806-813). IEEE.
-              - Ribeiro, C. and Freitas, A.A., 2024. A lexicographic optimisation approach to promote more recent features
-                on longitudinal decision-tree-based classifiers: applications to the English Longitudinal Study of Ageing.
-                Artificial Intelligence Review, 57(4), p.84.
     """
 
     def __init__(
@@ -254,16 +208,43 @@ def fit(self, X, y, sample_weight=None, *args, **kwargs):
         Raises:
             ValueError:
                 If `features_group` is not provided, as it is required for longitudinal functionality.
-
-        !!! tip "Preparing Your Data"
-            Ensure your input `X` aligns with the `features_group` structure—features should be ordered consistently
-            with the temporal sequence defined in `features_group`.
-
-        !!! note
-            The `fit` method relies heavily on `features_group` to apply the lexicographic optimization. Missing this
-            parameter will halt execution, so double-check it’s set before calling `fit`.
         """
         if self.features_group is None:
             raise ValueError("The features_group parameter must be provided.")
 
         return super().fit(X, y, sample_weight=sample_weight, *args, **kwargs)
+
+    def predict(self, X, check_input=True):
+        """Predict class labels for the input samples.
+
+        Inherited from scikit-learn's `DecisionTreeClassifier`. The Lexico tree only customises
+        split selection at fit time; prediction is the standard tree-traversal routine.
+
+        Args:
+            X (array-like of shape (n_samples, n_features)):
+                Input samples.
+            check_input (bool, default=True):
+                Whether to validate `X`; set to False to bypass input validation. Forwarded to scikit-learn.
+
+        Returns:
+            np.ndarray: Predicted class labels of shape `(n_samples,)`.
+        """
+        return super().predict(X, check_input=check_input)
+
+    def predict_proba(self, X, check_input=True):
+        """Predict class probabilities for the input samples.
+
+        Inherited from scikit-learn's `DecisionTreeClassifier`. Probabilities are the fraction of
+        training samples of each class in the leaf reached by each input sample.
+
+        Args:
+            X (array-like of shape (n_samples, n_features)):
+                Input samples.
+            check_input (bool, default=True):
+                Whether to validate `X`; set to False to bypass input validation. Forwarded to scikit-learn.
+
+        Returns:
+            np.ndarray: Class probabilities of shape `(n_samples, n_classes)`, with columns ordered
+            as in `self.classes_`.
+        """
+        return super().predict_proba(X, check_input=check_input)
diff --git a/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree_regressor.py b/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree_regressor.py
index e6579c9..440afb6 100644
--- a/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree_regressor.py
+++ b/scikit_longitudinal/estimators/trees/lexicographical/lexico_decision_tree_regressor.py
@@ -9,40 +9,13 @@ class LexicoDecisionTreeRegressor(DecisionTreeRegressor):
     """
     Lexico Decision Tree Regressor for longitudinal data regression.
 
-    The `LexicoDecisionTreeRegressor` is a specialized regression model designed for longitudinal data. It builds
-    upon scikit-learn's `DecisionTreeRegressor` by integrating a lexicographic optimization strategy. This approach
-    prioritizes recent data points (waves) during split selection, optimizing both statistical accuracy and temporal
-    relevance—a powerful tool for modeling time-dependent phenomena like patient health trends or economic forecasts.
+    This regressor extends scikit-learn's `DecisionTreeRegressor` for longitudinal data by integrating a
+    lexicographic optimisation approach that prioritises more recent waves during split selection. Splits are
+    evaluated with a bi-objective rule: the primary objective maximises the variance-reduction information gain
+    (`friedman_mse` criterion), and the secondary objective favours features from more recent waves whenever
+    competing gains are within `threshold_gain`. This is a powerful tool for modelling time-dependent phenomena
+    like patient health trends or economic forecasts.
 
-    !!! question "How Does Lexicographic Optimization Work?"
-        This regressor adapts the traditional decision tree algorithm for longitudinal data by considering two objectives:
-
-        1. **Primary**: Maximize the information gain ratio (using "friedman_mse" criterion).
-        2. **Secondary**: Favor features from more recent waves when gain ratios are comparable (within `threshold_gain`).
-
-        This dual approach ensures that the tree leverages both statistical purity and temporal relevance.
-
-
-    !!! note "Performance Boost with Cython"
-        The underlying splitter (`node_lexicoRF_split`) is optimized in Cython for faster computation. Check out the
-        [Cython implementation](https://github.com/simonprovost/scikit-lexicographical-trees/blob/21443b9dce51434b3198ccabac8bafc4698ce953/sklearn/tree/_splitter.pyx#L695)
-        for a deep dive into the performance enhancements.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
 
     Args:
         threshold_gain (float, default=0.0015):
@@ -90,12 +63,6 @@ class LexicoDecisionTreeRegressor(DecisionTreeRegressor):
         While `Sklong` focussed classification tasks only as of now. This regressor model is used by
         our LexicographicalGradientBoosting primitive. Feel free to experiment with it in your own
         longitudinal regression tasks but we do not guarantee its performance.
-
-    Notes:
-        - **Performance**: Best suited for longitudinal data; may not outperform standard regressors on non-temporal data.
-        - **Reference**: Ribeiro, C. and Freitas, A., 2020. "A new random forest method for longitudinal data regression
-          using a lexicographic bi-objective approach." In *2020 IEEE Symposium Series on Computational Intelligence
-          (SSCI)* (pp. 806-813).
     """
 
     def __init__(
@@ -160,3 +127,20 @@ def fit(self, X, y, *args, **kwargs):
             raise ValueError("The features_group parameter must be provided.")
 
         return super().fit(X, y, *args, **kwargs)
+
+    def predict(self, X, check_input=True):
+        """Predict regression target values for the input samples.
+
+        Inherited from scikit-learn's `DecisionTreeRegressor`. The Lexico tree only customises
+        split selection at fit time; prediction is the standard tree-traversal routine.
+
+        Args:
+            X (array-like of shape (n_samples, n_features)):
+                Input samples.
+            check_input (bool, default=True):
+                Whether to validate `X`; set to False to bypass input validation. Forwarded to scikit-learn.
+
+        Returns:
+            np.ndarray: Predicted target values of shape `(n_samples,)`.
+        """
+        return super().predict(X, check_input=check_input)

From 3fe70845974379cc4d758e2dc4bde6508d078208 Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:06:35 +0100
Subject: [PATCH 06/10] refactor(docs): trim CFS and CFS-per-group docstrings

---
 .gitignore                                    |  1 +
 .../correlation_feature_selection/cfs.py      |  9 ----
 .../cfs_per_group.py                          | 49 ++-----------------
 3 files changed, 4 insertions(+), 55 deletions(-)

diff --git a/.gitignore b/.gitignore
index e6fca2d..30b0f86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,4 @@ node_modules/
 *.ipr
 *.iws
 .idea/
+/AGENTS.md
diff --git a/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs.py b/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs.py
index 21db7b7..64be1f6 100644
--- a/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs.py
+++ b/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs.py
@@ -23,8 +23,6 @@ class CorrelationBasedFeatureSelection(CustomTransformerMixinEstimator):
     exhaustiveSearch, or greedySearch. This implementation concern the support for CFS only. For the CFS per group
     (longitudinal component), refer to the CorrelationBasedFeatureSelectionPerGroup class.
 
-    Read more in the Notes below for implementation details.
-
     Args:
         search_method : str, default="greedySearch"
             The search method to use. Options are "exhaustiveSearch", and "greedySearch".
@@ -45,13 +43,6 @@ class CorrelationBasedFeatureSelection(CustomTransformerMixinEstimator):
         >>> X_selected.shape
         >>> # (100, N) ; N is the number of selected features
 
-    Notes:
-        The improved CFS algorithm is based on the following references:
-
-        * Zixiao. S. (2019, August 11). GitHub - ZixiaoShen
-        /Correlation-based-Feature-Selection, available at:
-        https://github.com/ZixiaoShen/Correlation-based-Feature-Selection
-
     See also:
         * CustomTransformerMixinEstimator: Base class for all Transformer Mixin estimators in scikit-learn that we
         customed so that the original scikit-learn "check_x_y" is performed all the time.
diff --git a/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs_per_group.py b/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs_per_group.py
index abbe0f3..d27a8bf 100644
--- a/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs_per_group.py
+++ b/scikit_longitudinal/preprocessors/feature_selection/correlation_feature_selection/cfs_per_group.py
@@ -38,33 +38,6 @@ class CorrelationBasedFeatureSelectionPerGroup(CustomTransformerMixinEstimator):
     2. **Phase 2**: The aggregated features from Phase 1 are combined with non-longitudinal features, and a standard CFS
        is applied to further refine the selection by removing redundant features.
 
-    !!! quote "CFS-Per-Group: A Longitudinal Variation of CFS"
-        CFS-Per-Group, also known as `Exh-CFS-Gr` in the literature, adapts the standard CFS method to longitudinal data.
-        It is particularly useful for datasets where features are collected over multiple time points, such as in ageing
-        studies or health monitoring.
-
-        For scientific references, see the Notes section below.
-
-    !!! note "Standard CFS Implementation"
-        For the standard CFS algorithm without the longitudinal component, refer to the `CorrelationBasedFeatureSelection`
-        class.
-
-    !!! question "Feature Groups and Non-Longitudinal Features"
-        Two key attributes, `feature_groups` and `non_longitudinal_features`, enable algorithms to interpret the temporal
-        structure of longitudinal data, we try to build those as much as possible for users, while allowing
-        users to also define their own feature groups if needed. As follows:
-
-        - **feature_groups**: A list of lists where each sublist contains indices of a longitudinal attribute's waves,
-          ordered from oldest to most recent. This captures temporal dependencies.
-        - **non_longitudinal_features**: A list of indices for static, non-temporal features excluded from the temporal
-          matrix.
-
-        Proper setup of these attributes is critical for leveraging temporal patterns effectively, and effectively
-        use the primitives that follow.
-
-        To see more, we highly recommend visiting the `Temporal Dependency` page in the documentation.
-        [Temporal Dependency Guide :fontawesome-solid-timeline:](https://scikit-longitudinal.readthedocs.io/latest/tutorials/temporal_dependency/){ .md-button }
-
     Args:
         non_longitudinal_features (Optional[List[int]], optional): List of indices for non-longitudinal features.
             These features are not part of the temporal matrix and are treated separately. Defaults to None.
@@ -87,7 +60,7 @@ class CorrelationBasedFeatureSelectionPerGroup(CustomTransformerMixinEstimator):
     Examples:
         Below are examples demonstrating the usage of the `CorrelationBasedFeatureSelectionPerGroup` class.
 
-        !!! example "Basic Usage with Longitudinal Component"
+        !!! example "Basic Usage"
             ```python
             from scikit_longitudinal.preprocessors.feature_selection.correlation_feature_selection import CorrelationBasedFeatureSelectionPerGroup
             from scikit_longitudinal.data_preparation import LongitudinalDataset
@@ -115,7 +88,7 @@ class CorrelationBasedFeatureSelectionPerGroup(CustomTransformerMixinEstimator):
             print(X_selected)
             ```
 
-        !!! example "Using Parallel Processing"
+        !!! example "Advanced: parallel processing"
             ```python
             # ... Same as above, but with parallel processing enabled ...
 
@@ -130,7 +103,7 @@ class CorrelationBasedFeatureSelectionPerGroup(CustomTransformerMixinEstimator):
             # ... Same as above, but with parallel processing enabled ...
             ```
 
-        !!! example "Using Version 2 with Outer Search"
+        !!! example "Advanced: version 2 with outer search"
             ```python
             # ... Same as above, but with parallel processing enabled ...
 
@@ -145,22 +118,6 @@ class CorrelationBasedFeatureSelectionPerGroup(CustomTransformerMixinEstimator):
 
             # ... Same as above, but with parallel processing enabled ...
             ```
-
-    Notes:
-        The CFS-Per-Group algorithm is based on the following references:
-
-        - **Zixiao Shen's CFS Implementation**:
-          - *Zixiao. S.* (2019, August 11). GitHub - ZixiaoShen/Correlation-based-Feature-Selection. Available at: [GitHub](https://github.com/ZixiaoShen/Correlation-based-Feature-Selection)
-        - **Mastervii's CFS 2-Phase Variant**:
-          - *Pomsuwan, T.* (2023, February 24). GitHub - mastervii/CSF_2-phase-variant. Available at: [GitHub](https://github.com/mastervii/CSF_2-phase-variant)
-        - **Longitudinal Component References**:
-          - **Version 1**:
-            - *Pomsuwan, T. and Freitas, A.A.* (2017, November). Feature selection for the classification of longitudinal human ageing data. In *2017 IEEE International Conference on Data Mining Workshops (ICDMW)* (pp. 739-746). IEEE.
-          - **Version 2**:
-            - *Pomsuwan, T. and Freitas, A.A.* (2018, February). Feature selection for the classification of longitudinal human ageing data. Master's thesis, University of Kent. Available at: [University of Kent](https://kar.kent.ac.uk/66568/)
-
-    See also:
-        - `CorrelationBasedFeatureSelection`: For the standard CFS algorithm without the longitudinal component.
     """
 
     # pylint: disable=too-many-arguments,invalid-name,signature-differs,no-member

From bacd8d45d0220fe28b15d4bae5a67003cd88245c Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:13:08 +0100
Subject: [PATCH 07/10] refactor(docs): refresh community hub pages for current
 state

---
 docs/developers.md   | 4 ++--
 docs/faq.md          | 2 +-
 docs/publications.md | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/developers.md b/docs/developers.md
index 275ce0c..68ed239 100644
--- a/docs/developers.md
+++ b/docs/developers.md
@@ -102,7 +102,7 @@ After pinning the x86_64 interpreter, `uv` will automatically use it for future
 ---
 
 ## Linting and Formatting
-We use **Ruff** to maintain code quality.
+We use **Ruff** (with default rules — no custom configuration) to keep code quality in check.
 
 - **Check Issues**:
  ```bash
@@ -251,7 +251,7 @@ Scikit-longitudinal currently exposes shared extension templates for three compo
 ## Running Tests
 Validate your changes:
 ```bash
-uv run pytest -sv tests/
+uv run pytest -sv scikit_longitudinal/tests/
 ```
 
 ---
diff --git a/docs/faq.md b/docs/faq.md
index 1fbd1cb..962cd49 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -96,7 +96,7 @@ icon: lucide/circle-help
 
     The main current boundaries are:
 
-    - no regression support yet, although it is wanted,
+    - no extensive regression support yet, although it is wanted,
     - no longitudinal neural-network family integrated into the core workflow yet, although it is wanted,
     - no attempt to cover full time-series forecasting problems, since `aeon` and similar toolkits are already well shaped for that space,
     - an expectation that temporal structure is provided explicitly through wide-format data and feature-group metadata.
diff --git a/docs/publications.md b/docs/publications.md
index 259b1f3..2c05766 100644
--- a/docs/publications.md
+++ b/docs/publications.md
@@ -67,7 +67,7 @@ This section keeps track of the research papers behind the implemented primitive
     - Valla, M. (2024). *Time-penalised trees (TpT): introducing a new tree-based data mining algorithm for time-varying covariates*. *Annals of Mathematics and Artificial Intelligence* 92, 1609–1661. [Read the paper (DOI)](https://doi.org/10.1007/s10472-024-09950-w)
     - Valla, M., Milhaud, X. (2026). *Consistent Time-Aware Trees for Longitudinal Data: The Time-Penalized Tree*. ⟨hal-05022929v2⟩. [Read the preprint](https://cnrs.hal.science/hal-05022929)
 
-    **API Reference:** [TpT Decision Tree Classifier](API/estimators/trees/tpt_decision_tree_classifier.md)
+    **API Reference:** [TpT Decision Tree Classifier](API/estimators/trees/tpt_decision_tree_classifier.md), [TpT Decision Tree Regressor](API/estimators/trees/tpt_decision_tree_regressor.md)
 
     **Credits:** Original author: Mathias Valla. Implementation: [Mathias Valla](https://github.com/MathiasValla), Esteban Mauboussin, Alae Khidour, Berkehan Kocak, and Sonny Mupfuni, with the `Sklong` team.
 

From 1a51c10df49b424fb0afc7599c66dc20aa41e7a7 Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:17:51 +0100
Subject: [PATCH 08/10] refactor(docs): refresh getting-started install steps
 for 0.1.8

---
 docs/getting-started/index.md | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md
index dec8fa1..85c173e 100644
--- a/docs/getting-started/index.md
+++ b/docs/getting-started/index.md
@@ -43,7 +43,7 @@ Choose the installation method that best suits your workflow.
     You can also pin a specific version if desired:
 
     ```bash
-    uv add "scikit-longitudinal==0.0.8"
+    uv add "scikit-longitudinal==0.1.8"
     ```
 
     After adding, run `uv sync` to materialise the lockfile.
@@ -69,7 +69,7 @@ Choose the installation method that best suits your workflow.
     You can also pin a specific version if desired:
 
     ```bash
-    pip install scikit-longitudinal==0.0.8
+    pip install scikit-longitudinal==0.1.8
     ```
 
 === "Conda <span class='tab-badge'>CondaForge</span>"
@@ -98,7 +98,7 @@ Choose the installation method that best suits your workflow.
     You can also pin a specific version if desired:
 
     ```bash
-    pip install scikit-longitudinal==0.0.8
+    pip install scikit-longitudinal==0.1.8
     ```
 
     This installs `Scikit-longitudinal` in your newly created Conda environment.
@@ -115,23 +115,17 @@ Choose the installation method that best suits your workflow.
 
 === "Google Colab <span class='tab-badge'>4 lines</span>"
 
-    1. Open a new Colab notebook (Python 3.10+).
-    2. Install `Sklong`:
+    1. Open a new Colab notebook (Python 3.10–3.13).
+    2. Remove the preinstalled stock `scikit-learn` (Sklong relies on the `scikit-lexicographical-trees` fork, which ships its own `sklearn` package):
 
     ```bash
-    !pip install scikit-longitudinal
-    ```
-
-    3. Ensure the compatible `scikit-lexicographical-trees` dependency is present:
-
-    ```bash
-    !pip install scikit-lexicographical-trees
+    !pip uninstall scikit-learn -y
     ```
 
-    4. Remove conflicting `scikit-learn` if preinstalled:
+    3. Install `Sklong` (this pulls in `scikit-lexicographical-trees` automatically):
 
     ```bash
-    !pip uninstall scikit-learn -y
+    !pip install scikit-longitudinal
     ```
 
 === "Marimo"

From cebd7b0107f958197943e87be4c900cea008a91c Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:20:57 +0100
Subject: [PATCH 09/10] fix(docs): drop removed 'auto' max_features option in
 tuning tutorial

---
 docs/tutorials/sklong_hyperparameter_tuning.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/sklong_hyperparameter_tuning.md b/docs/tutorials/sklong_hyperparameter_tuning.md
index aa2d301..069e6d0 100644
--- a/docs/tutorials/sklong_hyperparameter_tuning.md
+++ b/docs/tutorials/sklong_hyperparameter_tuning.md
@@ -53,7 +53,7 @@ param_distributions = {
  'threshold_gain': loguniform(1e-4, 1e-1),
  'n_estimators': randint(50, 300),
  'max_depth': [None, 5, 10, 15],
- 'max_features': ['auto', 'sqrt', 0.8],
+ 'max_features': ['sqrt', 'log2', 0.8],
 }
 
 random_search = RandomizedSearchCV(

From db1f6c7e82bdb33e0d9b4a33d089db176e2cad15 Mon Sep 17 00:00:00 2001
From: Provost Simon <simon1.provost@epitech.eu>
Date: Sun, 19 Apr 2026 02:39:35 +0100
Subject: [PATCH 10/10] refactor(docs): add PyPI badge, retitle landing hero
 [cd tests]

---
 docs/overrides/home.html | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/docs/overrides/home.html b/docs/overrides/home.html
index 66f095c..d2c9820 100644
--- a/docs/overrides/home.html
+++ b/docs/overrides/home.html
@@ -482,14 +482,21 @@
     .joss-link {
         display: flex;
         flex-direction: column;
-        align-items: flex-start;
-        justify-content: left;
+        align-items: center;
+        justify-content: center;
         text-decoration: none;
         width: fit-content;
         outline: none;
         transition: all 0.3s ease;
     }
 
+    .badge-row {
+        display: flex;
+        flex-direction: row;
+        align-items: flex-start;
+        gap: 2.25rem;
+    }
+
     .joss-link,
     .joss-link:hover,
     .joss-link:focus,
@@ -504,6 +511,7 @@
         font-weight: 700;
         color: var(--homepage-icon-quiet);
         margin: 0 0 5px;
+        text-align: center;
         transition: color 0.3s ease;
     }
 
@@ -1373,9 +1381,9 @@
                 <h1 class="title-klack-style">
                     <mark>Scikit-Longitudinal</mark>
                     <br/>
-                    <marker> efficient & open-source</marker>
+                    <marker>Your Longitudinal-Data-Aware</marker>
                     <br/>
-                    <marker>Longitudinal ML</marker>
+                    <marker>Machine Learning</marker>
                 </h1>
                 <div class="tx-landing__actions">
                     <a class="klack-button group" href="getting-started/" aria-label="Get Started">
@@ -1405,6 +1413,16 @@ <h1 class="title-klack-style">
                             <path d="M11.867 24.571c-.703.047-1.511.387-2.027.868-.176.164-.164.363.023.492.575.433 1.43.656 2.145.562.68-.047 1.488-.41 2.027-.89.211-.188.176-.41-.07-.551-.574-.352-1.418-.54-2.098-.48Zm-2.46-3.96c-.153-.223-.364-.235-.54-.024-.468.54-.808 1.348-.855 2.027-.07.727.176 1.582.633 2.121.152.188.34.188.492.012.457-.504.785-1.324.82-2.039.035-.68-.187-1.523-.55-2.098Zm5.765.785c-.656.222-1.348.761-1.723 1.359-.129.21-.07.387.153.469.668.281 1.558.281 2.215.011.644-.234 1.335-.785 1.722-1.382.14-.235.07-.434-.187-.516-.657-.2-1.535-.176-2.18.059Zm-3.375-3.188c-.21-.2-.422-.164-.527.094-.317.633-.434 1.512-.305 2.18.129.726.586 1.488 1.148 1.898.188.140.387.105.48-.117.317-.598.411-1.477.27-2.18-.14-.668-.562-1.418-1.066-1.875Zm8.309-1.348c.046-.27-.094-.421-.375-.386-.68.082-1.454.457-1.957.937-.504.457-.915 1.219-1.032 1.934-.035.234.094.387.328.375.727-.024 1.536-.375 2.028-.89.504-.493.914-1.29 1.008-1.97Zm-6.727-1.007c-.258-.094-.434.023-.434.304-.046.692.2 1.524.598 2.121.375.61 1.113 1.114 1.817 1.278.222.035.386-.082.398-.305.035-.727-.223-1.559-.633-2.11a3.843 3.843 0 0 0-1.746-1.288Zm1.957-2.742c-.504-.446-1.312-.75-2.016-.81-.27-.01-.398.165-.363.423.152.68.61 1.441 1.149 1.898.515.469 1.359.785 2.086.762.21-.012.34-.176.304-.387a3.44 3.44 0 0 0-1.16-1.886Zm4.758-1.536c-.035-.28-.223-.386-.469-.28-.633.245-1.3.808-1.664 1.394-.375.609-.586 1.464-.504 2.132.024.235.2.352.422.282.68-.188 1.371-.739 1.723-1.371a3.69 3.69 0 0 0 .492-2.157Zm-5.39-2.402c-.634-.234-1.5-.246-2.169-.059-.258.082-.328.282-.199.493.352.597 1.043 1.16 1.734 1.418.657.27 1.547.27 2.215 0 .211-.094.27-.293.153-.480-.375-.61-1.067-1.138-1.735-1.372Zm3.925-3.07c-.117-.235-.34-.282-.539-.118-.48.47-.926 1.22-1.066 1.899-.153.691-.059 1.559.257 2.168.106.21.305.27.48.129.575-.422 1.032-1.172 1.161-1.887.14-.68.023-1.547-.293-2.191Zm-6.059-.305c-.68-.14-1.535-.035-2.156.246-.246.106-.304.316-.129.527.457.516 1.22.973 1.899 1.125.703.188 1.582.07 2.191-.293.2-.128.235-.328.094-.492a3.425 3.425 0 0 0-1.899-1.113Zm3.47-3.61c-.153-.21-.376-.234-.552-.023-.445.55-.773 1.348-.808 2.016-.07.68.164 1.535.562 2.133.13.187.328.21.504.046.516-.515.867-1.324.88-2.039.046-.68-.188-1.535-.587-2.132ZM9.991 1.006c-.28-.035-.422.117-.398.387.105.703.492 1.488.96 1.98.493.551 1.29.926 2.005.961.234.012.363-.129.34-.363-.082-.680-.47-1.488-.973-1.969-.48-.48-1.254-.89-1.934-.996Z"/>
                         </svg>
                     </a>
+                    <div class="badge-row">
+                    <a class="joss-link pypi-link" href="https://pypi.org/project/Scikit-longitudinal/" target="_blank" aria-label="Available on PyPI">
+                        <p class="joss-top-text">Available on</p>
+                        <div class="joss-content">
+                            <svg class="joss-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" aria-hidden="true">
+                                <path d="M23.922 13.58v3.912L20.55 18.72l-.078.055.052.037 3.45-1.256.026-.036v-3.997l-.053-.036-.025.092z M23.621 5.618l-3.04 1.107v3.912l3.339-1.215V5.509zM23.92 13.457V9.544l-3.336 1.215v3.913zM20.47 14.71V10.8L17.17 12v3.913zM17.034 19.996v-3.912l-3.313 1.206v3.912zM17.17 16.057v3.868l3.314-1.206V14.85l-3.314 1.206zm2.093 1.882c-.367.134-.663-.074-.663-.463s.296-.814.663-.947c.365-.133.662.075.662.464s-.297.814-.662.946z M13.225 9.315l.365-.132-3.285-1.197-3.323 1.21.102.037 3.184 1.16zM20.507 10.664V6.751L17.17 7.965v3.913zM17.058 11.918V8.005l-3.302 1.202v3.912zM13.643 9.246l-3.336 1.215v3.913l3.336-1.215zM6.907 13.165l3.322 1.209v-3.913L6.907 9.252z M10.34 7.873l3.281 1.193V5.198l-3.28-1.193zM20.507 2.715L17.19 3.922v3.913l3.317-1.207zM16.95 3.903L13.724 2.73l-3.269 1.19 3.225 1.174zM15.365 4.606l-1.624.592v3.868l3.317-1.207V3.991l-1.693.615zm-.391 2.778c-.367.134-.662-.074-.662-.464s.295-.813.662-.946c.366-.133.663.074.663.464s-.297.813-.663.946z M10.229 18.41v-3.914l-3.322-1.209V17.2zM13.678 17.182v-3.913l-3.371 1.227v3.913z M13.756 17.154l3.3-1.2V12.04l-3.3 1.2zM13.678 21.217l-3.371 1.227v-3.912h-.078v3.912l-3.322-1.209v-3.913l-.053-.058-.025-.06-3.336-1.21v-3.948l.034.013 3.287 1.196.015-.078-3.261-1.187 3.26-1.187v-.109L3.876 9.62l-.307-.112 3.26-1.188v.877l.079-.055V6.769l3.257 1.185.058-.061L7.084 6.75l-.102-.037 3.24-1.179v-.083L6.854 6.677v.018l-.025.018v1.523L3.44 9.47v.02l-.025.017v4.007l-3.39 1.233v.019L0 14.784v3.995l.025.037 3.4 1.237.008-.006.007.01 3.4 1.238.008-.006.006.01 3.4 1.237.014-.009.012.01 3.45-1.256.026-.037-.078-.027zM3.493 9.563l3.257 1.185-3.257 1.187V9.562zM3.4 19.96L.078 18.752v-3.913l2.361.86.96.349v3.913zm.015-3.99L.335 14.85l-.182-.066 3.262-1.187v2.374zm3.399 5.231l-3.321-1.209v-3.912l3.321 1.209v3.912zM23.791 5.434l-3.21-1.17v2.338zM20.387 2.643l-3.24-1.18-3.27 1.19 3.247 1.182z"/>
+                            </svg>
+                            <span class="joss-text">PyPI</span>
+                        </div>
+                    </a>
 
                     <a class="joss-link" href="https://doi.org/10.21105/joss.08481" target="_blank" aria-label="Published in JOSS">
                         <p class="joss-top-text">Published in</p>
@@ -1415,6 +1433,8 @@ <h1 class="title-klack-style">
                             <span class="joss-text">JOSS</span>
                         </div>
                     </a>
+                    </div>
+
                 </div>
             </div>
             <div class="tx-landing__image">