vansteensellab · luciabarb · Apr 1, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/PARM/PARM_model_evaluation.py b/PARM/PARM_model_evaluation.py
diff --git a/PARM/PARM_mutagenesis.py b/PARM/PARM_mutagenesis.py
@@ -1069,6 +1069,10 @@ def run_motif_scanning(
                         if append:
                             start_motif_name = start_motif - 5
                             end_motif_name = end_motif - 5
+
+                        else:
+                            start_motif_name = start_motif
+                            end_motif_name = end_motif
 
                         if split_pos_neg:
                             if rho_pos_prob > threshold:
@@ -1776,7 +1780,6 @@ def find_hits_and_make_logo(
 
     ##add selected hits
     hits_to_plot_row1, hits_to_plot_row2 = [], []
-
     for it_rows, motif in hits.iterrows():
         start_motif, end_motif = motif['start'], motif['end']
 

diff --git a/PARM/PARM_predict.py b/PARM/PARM_predict.py
@@ -118,7 +118,7 @@ def PARM_predict(
             predictions_all_folds = []
             for model in list_of_models:
                 predictions_all_folds.append(
-                    get_prediction(tmp.sequence.to_list(), model, L_max=L_max)
+                    get_prediction(tmp.sequence.to_list(), model, L_max = L_max)
                 )
                 pbar.update(1)
             # Now, take the average of the predictions and add to the tmp[model_name]
@@ -151,7 +151,12 @@ def PARM_predict(
 
 
 def get_test_fold_predictions(
-    test_fold_path, list_of_models, cell_type, output_directory
+    test_fold_path, 
+    list_of_models, 
+    cell_type, 
+    output_directory, 
+    features_fragments_selection=False,
+    normalization_method="Log2RPM"
 ):
     """
     Perform predictions on test fold data and create measured vs predicted plot.
@@ -167,154 +172,199 @@ def get_test_fold_predictions(
 
     log(f"Loading test fold data from {test_fold_path}")
 
-    # Load HDF5 file directly
-    with h5py.File(test_fold_path, "r") as f:
-        # Load sequences (one-hot encoded)
-        sequences = f["X"]["sequence"]["OneHotEncoding"][:]
-        # Load measured values
-        measured = f["Y"][f"Log2RPM_{cell_type}"][:]
-        # Load the feature names of each fragment
-        try: 
-            feature_names = f["FEAT"]["FEATname"][:].astype(str)
-        except: 
-            feat_type = f["FEAT/FEATtype"][:].astype(str)
-            feat_start = f["FEAT/FEATstart"][:].astype(str)
-            feat_end = f["FEAT/FEATend"][:].astype(str)
-            feature_names = [
-                f"{t}_{s}_{e}"
-                for t, s, e in zip(feat_type, feat_start, feat_end)
-            ]
-
-    log(f"Loaded {len(sequences)} test fragments")
-
-    # Make predictions with each model
-    all_predictions = []
-    batch_size = 32
-    for i, model in enumerate(list_of_models):
-        log(f"Making predictions with model fold {i}")
-        model.eval()
-        predictions = []
-
-        with torch.no_grad():
-            for start_idx in range(0, len(sequences), batch_size):
-                end_idx = min(start_idx + batch_size, len(sequences))
-                batch_sequences = sequences[start_idx:end_idx]
-
-                # Convert to tensor and move to GPU if available
-                X = torch.tensor(batch_sequences, dtype=torch.float32).permute(0, 2, 1)
-                if torch.cuda.is_available():
-                    X = X.cuda()
-                    model = model.cuda()
-
-                # Make predictions
-                pred = model(X).cpu().detach().numpy()
-                predictions.append(pred)
-
-        # Concatenate all batch predictions
-        model_predictions = np.concatenate(predictions, axis=0)
-        all_predictions.append(model_predictions)
-
-    # Average predictions across all models
-    avg_predictions = np.mean(all_predictions, axis=0).flatten()
-    measured_flat = measured.flatten()
-
-    # Now, make a dataframe with the predictions and measured values, and the feature names
-    results_df = pd.DataFrame(
-        {
-            "measured_Log2PM": measured_flat,
-            "predicted_Log2RPM": avg_predictions,
-            "feature": feature_names,
-        }
-    )
-
-    # Calculate correlation
-    pearson_r, _ = pearsonr(measured_flat, avg_predictions)
-
-    log(f"Pearson correlation (fragment level): {pearson_r:.3f}")
-
-    # Create scatter plot
-    fig, ax = plt.subplots(figsize=(8, 7))
-
-    h = ax.hist2d(
-        avg_predictions, measured_flat, bins=100, norm=colors.LogNorm(), cmap="viridis"
-    )
-
-    # Add correlation annotation
-    ax.annotate(
-        f"R = {pearson_r:.3f}",
-        xy=(0.05, 0.95),
-        xycoords="axes fraction",
-        fontsize=12,
-        verticalalignment="top",
-        bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
-    )
-
-    # Add diagonal line
-    min_val = min(avg_predictions.min(), measured_flat.min())
-    max_val = max(avg_predictions.max(), measured_flat.max())
-    ax.plot([min_val, max_val], [min_val, max_val], "r--", alpha=0.8, linewidth=2)
-
-    ax.set_xlabel("Predicted Log2RPM", fontsize=12)
-    ax.set_ylabel("Measured Log2RPM", fontsize=12)
-    ax.set_title(f"Test Fold Results - {cell_type}", fontsize=14)
-
-    plt.colorbar(h[3], ax=ax, label="Fragment count")
-
-    # Save plot
-    plot_path = os.path.join(
-        output_directory, f"test_fold_scatter_{cell_type}_fragment_level.svg"
-    )
-    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
-    plt.close()
-
-    results_path = os.path.join(
-        output_directory, f"test_fold_predictions_{cell_type}.tsv"
-    )
-    results_df.to_csv(results_path, sep="\t", index=False)
-
-    # Now, group prediction by feature and plot the measured vs. predicted values
-    grouped_results = (
-        results_df.groupby("feature")
-        .agg({"measured_Log2PM": "mean", "predicted_Log2RPM": "mean"})
-        .reset_index()
-    )
-
-    # plot the grouped results
-    fig, ax = plt.subplots(figsize=(8, 7))
-    h = ax.hist2d(
-        grouped_results["predicted_Log2RPM"],
-        grouped_results["measured_Log2PM"],
-        bins=100,
-        norm=colors.LogNorm(),
-        cmap="viridis",
-    )
-    # Add correlation annotation
-    pearson_r_grouped, _ = pearsonr(
-        grouped_results["measured_Log2PM"], grouped_results["predicted_Log2RPM"]
-    )
-
-    log(f"Pearson correlation (feature level): {pearson_r_grouped:.3f}")
-
-    ax.annotate(
-        f"R = {pearson_r_grouped:.3f}",
-        xy=(0.05, 0.95),
-        xycoords="axes fraction",
-        fontsize=12,
-        verticalalignment="top",
-        bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
-    )
-    # Add diagonal line
-    ax.plot([min_val, max_val], [min_val, max_val], "r--", alpha=0.8, linewidth=2)
-    ax.set_xlabel("Predicted Log2RPM", fontsize=12)
-    ax.set_ylabel("Measured Log2RPM", fontsize=12)
-    ax.set_title(f"Test Fold Results - {cell_type} (Grouped by Feature)", fontsize=14)
-    plt.colorbar(h[3], ax=ax, label="Feature count")
-    # Save plot
-    plot_grouped_path = os.path.join(
-        output_directory, f"test_fold_scatter_{cell_type}_feature_level.svg"
-    )
-    plt.savefig(plot_grouped_path, dpi=300, bbox_inches="tight")
-    plt.close()
+    #Split in cell
+
+    for i_cell, cell in enumerate(cell_type.split("__")):
+
+        # Load HDF5 file directly
+        with h5py.File(test_fold_path, "r") as f:
+            if features_fragments_selection is not False:
+                dict_features_int = {
+                    "TSS": 0,
+                    "EnhA": 1,
+                    "peaks": 2,
+                    "EnhAmany": 3,
+                    "EnhAstrong": 4,
+                    "MichielEnhA": 5,
+                    "others": 99,
+                }
+
+                # Workaround: allow selecting both TSS and EnhA fragments together.
+                if features_fragments_selection == "TSS_EnhA":
+                    selected_feature_types = np.array([0, 1])
+                else:
+                    if features_fragments_selection not in dict_features_int:
+                        valid = list(dict_features_int.keys()) + ["TSS_EnhA"]
+                        raise ValueError(
+                            f"Unknown features_fragments_selection '{features_fragments_selection}'. "
+                            f"Expected one of: {valid}."
+                        )
+                    selected_feature_types = np.array(
+                        [dict_features_int[features_fragments_selection]]
+                    )
+
+                feature_index = np.array(f["FEAT"]["FEATtype"][:])
+                index = np.arange(len(feature_index))
+                index = index[np.isin(feature_index, selected_feature_types)]
+
+
+            # Load sequences (one-hot encoded)
+            sequences = f["X"]["sequence"]["OneHotEncoding"][:]
+            if features_fragments_selection is not False: sequences = sequences[index]
+            # Load measured values
+            measured = f["Y"][f"{normalization_method}_{cell}"][:]
+            if features_fragments_selection is not False: measured = measured[index]
+            # Load the feature names of each fragment
+            try: 
+                feature_names = np.array(f["FEAT"]["FEATname"][:].astype(str))
+                if features_fragments_selection is not False: feature_names = feature_names[index]
+
+            except: 
+                feat_type = f["FEAT/FEATtype"][:].astype(str)
+                feat_start = f["FEAT/FEATstart"][:].astype(str)
+                feat_end = f["FEAT/FEATend"][:].astype(str)
+                feature_names = np.array([
+                    f"{t}_{s}_{e}"
+                    for t, s, e in zip(feat_type, feat_start, feat_end)
+                ])
+                if features_fragments_selection is not False: feature_names = feature_names[index]
+
+        log(f"Loaded {len(sequences)} test fragments")
+
+        # Make predictions with each model
+        all_predictions = []
+        batch_size = 32
+        for i, model in enumerate(list_of_models):
+            log(f"Making predictions with model fold {i}")
+            model.eval()
+            predictions = []
+
+            with torch.no_grad():
+                for start_idx in range(0, len(sequences), batch_size):
+                    end_idx = min(start_idx + batch_size, len(sequences))
+                    batch_sequences = sequences[start_idx:end_idx]
+
+                    # Convert to tensor and move to GPU if available
+                    X = torch.tensor(batch_sequences, dtype=torch.float32).permute(0, 2, 1)
+                    if torch.cuda.is_available():
+                        X = X.cuda()
+                        model = model.cuda()
+
+                    # Make predictions
+                    pred = model(X).cpu().detach().numpy()
+                    predictions.append(pred)
+
+            # Concatenate all batch predictions
+            model_predictions = np.concatenate(predictions, axis=0)
+            all_predictions.append(model_predictions)
+
+        # Average predictions across all models
+        avg_predictions = np.mean(all_predictions, axis=0)
+
+        if '__' in cell_type:
+            avg_predictions = avg_predictions[:,i_cell]
+
+        avg_predictions = avg_predictions.flatten()
+
+        measured_flat = measured.flatten()
+
+        # Now, make a dataframe with the predictions and measured values, and the feature names
+        results_df = pd.DataFrame(
+            {
+                f"measured_{normalization_method}": measured_flat,
+                f"predicted_{normalization_method}": avg_predictions,
+                "feature": feature_names,
+            }
+        )
+
+        # Calculate correlation
+        pearson_r, _ = pearsonr(measured_flat, avg_predictions)
+
+        log(f"Pearson correlation (fragment level): {pearson_r:.3f}")
+
+        # Create scatter plot
+        fig, ax = plt.subplots(figsize=(8, 7))
+
+        h = ax.hist2d(
+            avg_predictions, measured_flat, bins=100, norm=colors.LogNorm(), cmap="viridis"
+        )
+
+        # Add correlation annotation
+        ax.annotate(
+            f"R = {pearson_r:.3f}",
+            xy=(0.05, 0.95),
+            xycoords="axes fraction",
+            fontsize=12,
+            verticalalignment="top",
+            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
+        )
+
+        # Add diagonal line
+        min_val = min(avg_predictions.min(), measured_flat.min())
+        max_val = max(avg_predictions.max(), measured_flat.max())
+        ax.plot([min_val, max_val], [min_val, max_val], "r--", alpha=0.8, linewidth=2)
+
+        ax.set_xlabel("Predicted Log2RPM", fontsize=12)
+        ax.set_ylabel("Measured Log2RPM", fontsize=12)
+        ax.set_title(f"Test Fold Results - {cell}", fontsize=14)
+
+        plt.colorbar(h[3], ax=ax, label="Fragment count")
+
+        # Save plot
+        plot_path = os.path.join(
+            output_directory, f"test_fold_scatter_{cell}_fragment_level.svg"
+        )
+        plt.savefig(plot_path, dpi=300, bbox_inches="tight")
+        plt.close()
+
+        results_path = os.path.join(
+            output_directory, f"test_fold_predictions_{cell}.tsv"
+        )
+        results_df.to_csv(results_path, sep="\t", index=False)
+
+        # Now, group prediction by feature and plot the measured vs. predicted values
+        grouped_results = (
+            results_df.groupby("feature")
+            .agg({f"measured_{normalization_method}": "mean", f"predicted_{normalization_method}": "mean"})
+            .reset_index()
+        )
+
+        # plot the grouped results
+        fig, ax = plt.subplots(figsize=(8, 7))
+        h = ax.hist2d(
+            grouped_results[f"predicted_{normalization_method}"],
+            grouped_results[f"measured_{normalization_method}"],
+            bins=100,
+            norm=colors.LogNorm(),
+            cmap="viridis",
+        )
+        # Add correlation annotation
+        pearson_r_grouped, _ = pearsonr(
+            grouped_results[f"measured_{normalization_method}"], grouped_results[f"predicted_{normalization_method}"]
+        )
+
+        log(f"Pearson correlation (feature level): {pearson_r_grouped:.3f}")
+
+        ax.annotate(
+            f"R = {pearson_r_grouped:.3f}",
+            xy=(0.05, 0.95),
+            xycoords="axes fraction",
+            fontsize=12,
+            verticalalignment="top",
+            bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
+        )
+        # Add diagonal line
+        ax.plot([min_val, max_val], [min_val, max_val], "r--", alpha=0.8, linewidth=2)
+        ax.set_xlabel(f"Predicted {normalization_method}", fontsize=12)
+        ax.set_ylabel(f"Measured {normalization_method}", fontsize=12)
+        ax.set_title(f"Test Fold Results - {cell} (Grouped by Feature)", fontsize=14)
+        plt.colorbar(h[3], ax=ax, label="Feature count")
+        # Save plot
+        plot_grouped_path = os.path.join(
+            output_directory, f"test_fold_scatter_{cell}_feature_level.svg"
+        )
+        plt.savefig(plot_grouped_path, dpi=300, bbox_inches="tight")
+        plt.close()
 
 
 def get_prediction(sequence, complete_model, L_max=600):