From 48efda2606a25dac30b6fe607f5c0ffbe2bf7146 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 16:37:38 +0200 Subject: [PATCH 01/16] units added to shannon computation. --- src/dynsight/_internal/analysis/entropy.py | 28 ++++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 30790072..5000b611 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: from numpy.typing import NDArray @@ -15,6 +15,7 @@ def compute_shannon( data: NDArray[np.float64], data_range: tuple[float, float], n_bins: int, + units: Literal["bit", "nat", "frac"] = "frac", ) -> float: """Compute the Shannon entropy of a univariate data distribution. @@ -31,6 +32,10 @@ def compute_shannon( n_bins: The number of bins with which the data histogram must be computed. + units: + The units of measure of the output entropy. If "frac", entropy is + normalized between 0 and 1 by dividing by log(n_bins). + Returns: The value of the normalized Shannon entropy of the dataset. @@ -67,11 +72,19 @@ def compute_shannon( ) probs = counts / np.sum(counts) # Data probabilities are needed entropy = -np.sum([p * np.log2(p) for p in probs if p > 0.0]) - entropy /= np.log2(n_bins) - return entropy + + if units == "bit": + return entropy + if units == "nat": + return entropy * np.log(2) + return entropy / np.log2(n_bins) -def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float: +def compute_kl_entropy( + data: NDArray[np.float64], + n_neigh: int = 1, + units: Literal["bit", "nat"] = "bit", +) -> float: """Estimate Shannon differential entropy using Kozachenko-Leonenko. The Kozachenko-Leonenko k-nearest neighbors method approximates @@ -86,6 +99,9 @@ def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float: n_neigh: The number of neighbors considered in the KL estimator. + units: + The units of measure of the output entropy. + Returns: The Shannon differential entropy of the dataset, in bits. @@ -112,7 +128,9 @@ def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float: eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances eps = np.clip(eps, 1e-10, None) # avoid log(0) const = digamma(n_data) - digamma(n_neigh) + 1 - return const + np.mean(np.log2(eps)) + if units == "bit": + return const + np.mean(np.log2(eps)) + return const + np.mean(np.log2(eps)) * np.log(2) def compute_negentropy(data: NDArray[np.float64]) -> float: From 29cf3b6095b0cf4b58645da85ad5eade34782303 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 16:42:03 +0200 Subject: [PATCH 02/16] units added to negentropy computation. --- src/dynsight/_internal/analysis/entropy.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 5000b611..8a54ef40 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -133,7 +133,10 @@ def compute_kl_entropy( return const + np.mean(np.log2(eps)) * np.log(2) -def compute_negentropy(data: NDArray[np.float64]) -> float: +def compute_negentropy( + data: NDArray[np.float64], + units: Literal["bit", "nat"] = "bit", +) -> float: """Estimate negentropy of a dataset. 
Negentropy is a measure of non-Gaussianity representing the distance @@ -149,8 +152,11 @@ def compute_negentropy(data: NDArray[np.float64]) -> float: data: The dataset for which the entropy is to be computed. + units: + The units of measure of the output negentropy. + Returns: - The negentropy of the dataset, in bits. + The negentropy of the dataset. Example: @@ -176,8 +182,8 @@ def compute_negentropy(data: NDArray[np.float64]) -> float: data_norm = (data - np.mean(data)) / np.std(data, ddof=1) sigma = np.std(data_norm, ddof=1) data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size) - h_gauss = compute_kl_entropy(data_gauss) - h_data = compute_kl_entropy(data_norm) + h_gauss = compute_kl_entropy(data_gauss, units=units) + h_data = compute_kl_entropy(data_norm, units=units) return h_gauss - h_data From b44e1f20136596b66ac017a25918bd0e35ec1cce Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 16:49:04 +0200 Subject: [PATCH 03/16] units added to shannon_multi computation. --- src/dynsight/_internal/analysis/entropy.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 8a54ef40..586cf6fe 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -191,6 +191,7 @@ def compute_shannon_multi( data: NDArray[np.float64], data_ranges: list[tuple[float, float]], n_bins: list[int], + units: Literal["bit", "nat", "frac"] = "frac", ) -> float: """Compute the Shannon entropy of a multivariate data distribution. @@ -210,6 +211,10 @@ def compute_shannon_multi( A list of integers specifying the number of bins for each dimension. + units: + The units of measure of the output entropy. If "frac", entropy is + normalized between 0 and 1 by dividing by log(n_bins). + Returns: The value of the normalized Shannon entropy of the dataset. @@ -248,9 +253,12 @@ def compute_shannon_multi( counts, _ = np.histogramdd(data, bins=n_bins, range=data_ranges) probs = counts / np.sum(counts) # Probability distribution entropy = -np.sum(probs[probs > 0] * np.log2(probs[probs > 0])) - entropy /= np.log2(np.prod(n_bins)) # Normalization - return entropy + if units == "bit": + return entropy + if units == "nat": + return entropy * np.log(2) + return entropy / np.log2(np.prod(n_bins)) # Normalization def compute_entropy_gain( From 5075169d9070794eeec79056f1f9a14300814ecb Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 17:05:39 +0200 Subject: [PATCH 04/16] Added function compute_kl_entropy_multi(). --- src/dynsight/_internal/analysis/entropy.py | 65 +++++++++++++++++++++- src/dynsight/analysis.py | 2 + 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 586cf6fe..9f7110e1 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -7,8 +7,9 @@ import numpy as np import numpy.typing as npt +from scipy.spatial import cKDTree from scipy.spatial.distance import cdist -from scipy.special import digamma +from scipy.special import digamma, gamma def compute_shannon( @@ -261,6 +262,68 @@ def compute_shannon_multi( return entropy / np.log2(np.prod(n_bins)) # Normalization +def compute_kl_entropy_multi( + data: NDArray[np.float64], + n_neigh: int = 1, + units: Literal["bit", "nat"] = "bit", +) -> float: + """Estimate Shannon differential entropy using Kozachenko-Leonenko. 
+ + This function works for multivariate distribution. + The Kozachenko-Leonenko k-nearest neighbors method approximates + differential entropy based on distances to nearest neighbors + in the sample space. It's main advantage is being parameter-free. + + Parameters: + data: + The dataset for which the entropy is to be computed. + Shape (n_data, n_dims) + + n_neigh: + The number of neighbors considered in the KL estimator. + + units: + The units of measure of the output entropy. + + Returns: + The Shannon differential entropy of the dataset, in bits. + + Example: + + .. testcode:: klm-entropy-test + + import numpy as np + from dynsight.analysis import compute_kl_entropy_multi + + np.random.seed(1234) + data = np.random.rand(10000, 2) + + data_entropy = compute_kl_entropy_multi(data) + + .. testcode:: klm-entropy-test + :hide: + + assert np.isclose(data_entropy, -4.319358938644518) + + """ + n_samples, dim = data.shape + tree = cKDTree(data) + eps, _ = tree.query(data, k=n_neigh + 1, p=2) + eps = eps[:, -1] # distance to the n_neigh-th neighbor + eps = np.clip(eps, 1e-10, None) # avoid log(0) + unit_ball_volume = (np.pi ** (dim / 2)) / gamma(dim / 2 + 1) + entropy = ( + digamma(n_samples) + - digamma(n_neigh) + + np.log2(unit_ball_volume) + + (dim / n_samples) * np.sum(np.log2(eps)) + ) + + if units == "bit": + return entropy + return entropy * np.log(2) + + def compute_entropy_gain( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], diff --git a/src/dynsight/analysis.py b/src/dynsight/analysis.py index 3b59bcfa..4f08dc8a 100644 --- a/src/dynsight/analysis.py +++ b/src/dynsight/analysis.py @@ -4,6 +4,7 @@ compute_entropy_gain, compute_entropy_gain_multi, compute_kl_entropy, + compute_kl_entropy_multi, compute_negentropy, compute_shannon, compute_shannon_multi, @@ -22,6 +23,7 @@ "compute_entropy_gain", "compute_entropy_gain_multi", "compute_kl_entropy", + "compute_kl_entropy_multi", "compute_negentropy", "compute_rdf", "compute_shannon", From d2f3c54effeb4ea450d577df36e307a105bd6cc2 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 18:56:47 +0200 Subject: [PATCH 05/16] Solved bug in math formulas. --- src/dynsight/_internal/analysis/entropy.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 9f7110e1..4dd7fcff 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -121,17 +121,19 @@ def compute_kl_entropy( .. testcode:: kl-entropy-test :hide: - assert np.isclose(data_entropy, -3.3437736767342194) + assert np.isclose(data_entropy, -3.650626496174274) """ data = np.sort(data.flatten()) n_data = len(data) eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances eps = np.clip(eps, 1e-10, None) # avoid log(0) - const = digamma(n_data) - digamma(n_neigh) + 1 + const = digamma(n_data) - digamma(n_neigh) + np.log(2) # 1D volume + h_bits = const + np.mean(np.log2(eps)) if units == "bit": - return const + np.mean(np.log2(eps)) - return const + np.mean(np.log2(eps)) * np.log(2) + return h_bits + # nat + return h_bits * np.log(2) def compute_negentropy( From efec760e7662ec84f23708c15bbc521b47fe8cdc Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 18:57:15 +0200 Subject: [PATCH 06/16] Adding recipe for entropy calculations. 
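The recipe (docs/source/entropy.rst) walks through Shannon entropy for
discrete data with compute_shannon() and differential entropy for
continuous data with the Kozachenko-Leonenko estimator in
compute_kl_entropy(). Condensed, the discrete example amounts to the
sketch below (the exact doctest values live in the .rst file):

    import numpy as np
    import dynsight

    np.random.seed(42)
    rolls = np.random.randint(1, 7, size=10000)

    # With units="bit" the estimate approaches log2(6) ~ 2.585 bits
    # for a fair six-sided die.
    dice_entropy = dynsight.analysis.compute_shannon(
        data=rolls,
        data_range=(1, 6),
        n_bins=6,
        units="bit",
    )

The continuous example checks that the entropy difference between two
Gaussians with standard deviations 1 and 2 comes out close to 1 bit.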
--- docs/source/entropy.rst | 72 +++++++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 3 +- 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 docs/source/entropy.rst diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst new file mode 100644 index 00000000..e109a596 --- /dev/null +++ b/docs/source/entropy.rst @@ -0,0 +1,72 @@ +Entropy calculations +==================== + +This recipe explains how to compute Shannon entropy for different types of +datasets using the functions in the `dynsight.analysis` module. + +First of all, we import all the packages and objects we'll need: + +.. testcode:: recipe4-test + + import numpy as np + import dynsight + import matplotlib.pyplot as plt + + np.random.seed(42) # set the random seed + + +Entropy of a discrete variable +------------------------------ + +Let's compute the Shanon entropy of rolling a dice ``n_sample`` times, which +should be equal to log2(6) bit. + +.. testcode:: recipe4-test + + n_sample = 10000 + rolls = np.random.randint(1, 7, size=n_sample) + + dice_entropy = dynsight.analysis.compute_shannon( + data=rolls, + data_range=(1,6), + n_bins=6, + units="bit", + ) + # dice_entropy = 2.584832195231254 + + +Entropy of a continuous variable +--------------------------------- + +Shannon entropy is not univocally defined for continuous variables, but the +difference between the entropy of different distribution is. Let's compute the +difference between the Shannon entropy of two Gaussian distributions, with +standard deviations respectively equal to 1 and 2, which should be 1 bit. + +.. testcode:: recipe4-test + + n_sample = 10000000 + data_1 = np.random.normal(loc=0.0, scale=1.0, size=n_sample) + data_2 = np.random.normal(loc=0.0, scale=2.0, size=n_sample) + + gauss_entropy_1 = dynsight.analysis.compute_kl_entropy( + data=data_1, + units="bit", + ) + gauss_entropy_2 = dynsight.analysis.compute_kl_entropy( + data=data_2, + units="bit", + ) + diff = gauss_entropy_2 - gauss_entropy_1 + # diff = 1.0010395631476854 + + +%.. raw:: html +% +% ⬇️ Download Python Script + +.. testcode:: recipe4-test + :hide: + + assert np.isclose(dice_entropy, np.log2(6), rtol=1e-3) + assert np.isclose(diff, 1, rtol=1e-3, atol=1e-4) diff --git a/docs/source/index.rst b/docs/source/index.rst index a52ae8ac..f3a61eb7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,6 +33,7 @@ Descriptors from a Trj Dimensionality reduction methods + Entropy calculations Information gain analysis .. toctree:: @@ -81,7 +82,7 @@ How to get started ------------------ We suggest you give a read to the ``dynsight.trajectory`` module documentation, -which offers a compact and easy way of using most of the ``dynsight`` tools. +which offers a compact and easy way of using most of the ``dynsight`` tools. Also, the documentation offers some copiable Recipes and Examples for the most common analyses. From 2f0233a4ee806b27557c7d154f05d281995e1e95 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 19:07:53 +0200 Subject: [PATCH 07/16] Added discrete multivariate case. --- docs/source/entropy.rst | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst index e109a596..641bc513 100644 --- a/docs/source/entropy.rst +++ b/docs/source/entropy.rst @@ -32,7 +32,27 @@ should be equal to log2(6) bit. 
n_bins=6, units="bit", ) - # dice_entropy = 2.584832195231254 + # dice_entropy = 2.584832195231254 ~ log2(6) + + +Entropy of a discrete multivariate variable +------------------------------------------- + +Let's compute the Shanon entropy of rolling `two` dices ``n_sample`` times, +which should be equal to log2(36) bit. + +.. testcode:: recipe4-test + + n_sample = 10000 + rolls = np.random.randint(1, 7, size=(n_sample, 2)) + + dices_entropy = dynsight.analysis.compute_shannon_multi( + data=rolls, + data_ranges=[(1,6), (1,6)], + n_bins=[6, 6], + units="bit", + ) + # dices_entropy = 5.168428344754391 ~ log2(36) Entropy of a continuous variable @@ -61,6 +81,9 @@ standard deviations respectively equal to 1 and 2, which should be 1 bit. # diff = 1.0010395631476854 + + + %.. raw:: html % % ⬇️ Download Python Script @@ -69,4 +92,5 @@ standard deviations respectively equal to 1 and 2, which should be 1 bit. :hide: assert np.isclose(dice_entropy, np.log2(6), rtol=1e-3) + assert np.isclose(dices_entropy, np.log2(36), rtol=1e-3) assert np.isclose(diff, 1, rtol=1e-3, atol=1e-4) From c532cd1a58d5629c5ad22baa7f5fe60e2f8e9d58 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 19:17:59 +0200 Subject: [PATCH 08/16] Added continuous multivariate case. --- docs/source/entropy.rst | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst index 641bc513..125e9bf5 100644 --- a/docs/source/entropy.rst +++ b/docs/source/entropy.rst @@ -19,7 +19,7 @@ Entropy of a discrete variable ------------------------------ Let's compute the Shanon entropy of rolling a dice ``n_sample`` times, which -should be equal to log2(6) bit. +should be equal to log2(6) bits. .. testcode:: recipe4-test @@ -39,7 +39,7 @@ Entropy of a discrete multivariate variable ------------------------------------------- Let's compute the Shanon entropy of rolling `two` dices ``n_sample`` times, -which should be equal to log2(36) bit. +which should be equal to log2(36) bits. .. testcode:: recipe4-test @@ -81,7 +81,40 @@ standard deviations respectively equal to 1 and 2, which should be 1 bit. # diff = 1.0010395631476854 +Entropy of a continuous multivariate variable +--------------------------------------------- +And the same is true for multivariate distributions. Let's compute the +difference between the Shannon entropy of two bivariate Gaussian +distributions, with standard deviations respectively equal to 1 and 2, +which should be 2 bits. + +.. testcode:: recipe4-test + + n_sample = 100000 + mean = [1, 1] + cov = np.array([[1, 0], [0, 1]]) + data_1 = np.random.multivariate_normal( + mean=mean, + cov=cov, + size=n_sample, + ) + data_2 = np.random.multivariate_normal( + mean=mean, + cov=cov * 4.0, + size=n_sample, + ) + + gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi( + data=data_1, + units="bit", + ) + gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi( + data=data_2, + units="bit", + ) + diff_2d = gauss_entropy_2 - gauss_entropy_1 + # diff_2d = 1.9983384346024948 %.. raw:: html @@ -94,3 +127,4 @@ standard deviations respectively equal to 1 and 2, which should be 1 bit. 
assert np.isclose(dice_entropy, np.log2(6), rtol=1e-3) assert np.isclose(dices_entropy, np.log2(36), rtol=1e-3) assert np.isclose(diff, 1, rtol=1e-3, atol=1e-4) + assert np.isclose(diff_2d, 2, rtol=1e-3, atol=1e-4) From 1b2e2c16f11e06d038c12c0fe84a7b8e65e7f641 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 19:34:34 +0200 Subject: [PATCH 09/16] Added downloadable version. --- docs/source/_static/recipes/entropy.py | 69 ++++++++++++++++++++++++++ docs/source/entropy.rst | 25 +++++----- docs/source/index.rst | 2 +- 3 files changed, 82 insertions(+), 14 deletions(-) create mode 100644 docs/source/_static/recipes/entropy.py diff --git a/docs/source/_static/recipes/entropy.py b/docs/source/_static/recipes/entropy.py new file mode 100644 index 00000000..bc12e793 --- /dev/null +++ b/docs/source/_static/recipes/entropy.py @@ -0,0 +1,69 @@ +"""Copiable code from Recipe #4.""" + +import numpy as np + +import dynsight + +rng = np.random.default_rng(42) # set the random seed + +# Entropy of a discrete variable +n_sample = 10000 +rolls = rng.integers(1, 7, size=n_sample) +dice_entropy = dynsight.analysis.compute_shannon( + data=rolls.astype(float), + data_range=(1, 6), + n_bins=6, + units="bit", +) + +# Entropy of a discrete multivariate variable +n_sample = 10000 +rolls = rng.integers(1, 7, size=(n_sample, 2)) +dices_entropy = dynsight.analysis.compute_shannon_multi( + data=rolls.astype(float), + data_ranges=[(1, 6), (1, 6)], + n_bins=[6, 6], + units="bit", +) + + +# Entropy of a continuous variable +n_sample = 10000000 +data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample) +data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample) + +gauss_entropy_1 = dynsight.analysis.compute_kl_entropy( + data=data_1, + units="bit", +) +gauss_entropy_2 = dynsight.analysis.compute_kl_entropy( + data=data_2, + units="bit", +) +diff = gauss_entropy_2 - gauss_entropy_1 + + +# Entropy of a continuous multivariate variable +n_sample = 100000 +mean = [1, 1] +cov = np.array([[1, 0], [0, 1]]) +data_1 = rng.multivariate_normal( + mean=mean, + cov=cov, + size=n_sample, +) +data_2 = rng.multivariate_normal( + mean=mean, + cov=cov * 4.0, + size=n_sample, +) + +gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi( + data=data_1, + units="bit", +) +gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi( + data=data_2, + units="bit", +) +diff_2d = gauss_entropy_2 - gauss_entropy_1 diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst index 125e9bf5..c1eb58dd 100644 --- a/docs/source/entropy.rst +++ b/docs/source/entropy.rst @@ -10,9 +10,8 @@ First of all, we import all the packages and objects we'll need: import numpy as np import dynsight - import matplotlib.pyplot as plt - np.random.seed(42) # set the random seed + rng = np.random.default_rng(42) # set the random seed Entropy of a discrete variable @@ -24,10 +23,10 @@ should be equal to log2(6) bits. .. testcode:: recipe4-test n_sample = 10000 - rolls = np.random.randint(1, 7, size=n_sample) + rolls = rng.integers(1, 7, size=n_sample) dice_entropy = dynsight.analysis.compute_shannon( - data=rolls, + data=rolls.astype(float), data_range=(1,6), n_bins=6, units="bit", @@ -44,10 +43,10 @@ which should be equal to log2(36) bits. .. 
testcode:: recipe4-test n_sample = 10000 - rolls = np.random.randint(1, 7, size=(n_sample, 2)) + rolls = rng.integers(1, 7, size=(n_sample, 2)) dices_entropy = dynsight.analysis.compute_shannon_multi( - data=rolls, + data=rolls.astype(float), data_ranges=[(1,6), (1,6)], n_bins=[6, 6], units="bit", @@ -66,8 +65,8 @@ standard deviations respectively equal to 1 and 2, which should be 1 bit. .. testcode:: recipe4-test n_sample = 10000000 - data_1 = np.random.normal(loc=0.0, scale=1.0, size=n_sample) - data_2 = np.random.normal(loc=0.0, scale=2.0, size=n_sample) + data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample) + data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample) gauss_entropy_1 = dynsight.analysis.compute_kl_entropy( data=data_1, @@ -94,12 +93,12 @@ which should be 2 bits. n_sample = 100000 mean = [1, 1] cov = np.array([[1, 0], [0, 1]]) - data_1 = np.random.multivariate_normal( + data_1 = rng.multivariate_normal( mean=mean, cov=cov, size=n_sample, ) - data_2 = np.random.multivariate_normal( + data_2 = rng.multivariate_normal( mean=mean, cov=cov * 4.0, size=n_sample, @@ -117,9 +116,9 @@ which should be 2 bits. # diff_2d = 1.9983384346024948 -%.. raw:: html -% -% ⬇️ Download Python Script +.. raw:: html + + ⬇️ Download Python Script .. testcode:: recipe4-test :hide: diff --git a/docs/source/index.rst b/docs/source/index.rst index f3a61eb7..3ae663ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,8 +33,8 @@ Descriptors from a Trj Dimensionality reduction methods - Entropy calculations Information gain analysis + Entropy calculations .. toctree:: :hidden: From 26c37177eb163ffb95f5206f48cc19b0ae3a9e6c Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 19:44:55 +0200 Subject: [PATCH 10/16] Fixing downloadable recipes. --- docs/source/descr_from_trj.rst | 14 +++++++------- docs/source/entropy.rst | 4 ++-- docs/source/info_gain.rst | 10 +++++----- docs/source/soap_dim_red.rst | 12 ++++++------ 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/source/descr_from_trj.rst b/docs/source/descr_from_trj.rst index b7d21716..213056dc 100644 --- a/docs/source/descr_from_trj.rst +++ b/docs/source/descr_from_trj.rst @@ -1,12 +1,12 @@ -Descriptors from a :class:`.trajectory.Trj` +Descriptors from a :class:`.trajectory.Trj` =========================================== -This recipe explains how to compute descriptors directly from a -:class:`.trajectory.Trj` object. +This recipe explains how to compute descriptors directly from a +:class:`.trajectory.Trj` object. .. warning:: - This code works when run from the ``/docs`` directory of the ``dynsight`` + This code works when run from the ``/docs`` directory of the ``dynsight`` repo. To use it elsewhere, you have to change the ``Path`` variables accordingly. @@ -26,7 +26,7 @@ it's directly calculated by the :class:`.trajectory.Trj.get_soap()` method. .. warning:: Please consider that the SOAP dataset can be very large, due to the high - dimensionality, thus calculations can be expensive, and saving to/loading + dimensionality, thus calculations can be expensive, and saving to/loading from file quite slow. .. testcode:: recipe1-test @@ -83,7 +83,7 @@ calculation can be sped up significantly. ) Notice that, differently from SOAP - which is computed for every frame, LENS -is computed for every pair of frames. Thus, the LENS dataset has shape +is computed for every pair of frames. Thus, the LENS dataset has shape ``(n_particles, n_frames - 1)``. 
Consequently, if you need to match the LENS values with the particles along the trajectory, you will need to use a sliced trajectory (removing the last frame). The easiest way to do this is: @@ -95,7 +95,7 @@ trajectory (removing the last frame). The easiest way to do this is: .. raw:: html - ⬇️ Download Python Script + ⬇️ Download Python Script .. testcode:: recipe1-test :hide: diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst index c1eb58dd..115afadb 100644 --- a/docs/source/entropy.rst +++ b/docs/source/entropy.rst @@ -113,7 +113,7 @@ which should be 2 bits. units="bit", ) diff_2d = gauss_entropy_2 - gauss_entropy_1 - # diff_2d = 1.9983384346024948 + # diff_2d = 2.0142525628908743 .. raw:: html @@ -126,4 +126,4 @@ which should be 2 bits. assert np.isclose(dice_entropy, np.log2(6), rtol=1e-3) assert np.isclose(dices_entropy, np.log2(36), rtol=1e-3) assert np.isclose(diff, 1, rtol=1e-3, atol=1e-4) - assert np.isclose(diff_2d, 2, rtol=1e-3, atol=1e-4) + assert np.isclose(diff_2d, 2, rtol=1e-2, atol=1e-2) diff --git a/docs/source/info_gain.rst b/docs/source/info_gain.rst index 44b1d2f2..5af73631 100644 --- a/docs/source/info_gain.rst +++ b/docs/source/info_gain.rst @@ -3,16 +3,16 @@ Information gain analysis For the theoretical aspects of this work, see https://doi.org/10.48550/arXiv.2504.12990. -This recipe explains how to compute the information gain through clustering +This recipe explains how to compute the information gain through clustering analysis. We use a synthetic dataset containing a signal that oscillates between 0 and 1, with Gaussian noise. Onion clustering is run on a broad range of time resolutions ∆t. The information gain and the Shannon entropy of -the environments is computed for each value of ∆t. The analysis is implemented +the environments is computed for each value of ∆t. The analysis is implemented using onion 2.0.0 ("onion smooth"). .. warning:: - This code works when run from the ``/docs`` directory of the ``dynsight`` + This code works when run from the ``/docs`` directory of the ``dynsight`` repo. To use it elsewhere, you have to change the ``Path`` variables accordingly. @@ -54,7 +54,7 @@ Let's start by creating a the synthetic dataset: The following function takes as input the dataset, and a list of values of time resolutions ∆t, and for each of these it performs Onion clustering, and -computes the information gain achieved through clustering with that ∆t. +computes the information gain achieved through clustering with that ∆t. .. warning:: @@ -222,7 +222,7 @@ gain goes to 0. .. raw:: html - ⬇️ Download Python Script + ⬇️ Download Python Script .. testcode:: recipe3-test :hide: diff --git a/docs/source/soap_dim_red.rst b/docs/source/soap_dim_red.rst index 59740a65..06521f45 100644 --- a/docs/source/soap_dim_red.rst +++ b/docs/source/soap_dim_red.rst @@ -1,4 +1,4 @@ -Dimensionality reduction methods +Dimensionality reduction methods ================================ This recipe explains how to compute descriptors via dimensionality reduction @@ -15,12 +15,12 @@ from dynsight.utilities. .. warning:: Please consider that the SOAP dataset can be very large, due to the high - dimensionality, thus calculations can be expensive, and saving to/loading + dimensionality, thus calculations can be expensive, and saving to/loading from file quite slow. .. warning:: - This code works when run from the ``/docs`` directory of the ``dynsight`` + This code works when run from the ``/docs`` directory of the ``dynsight`` repo. 
To use it elsewhere, you have to change the ``Path`` variables accordingly. @@ -189,7 +189,7 @@ parameters, and performs the TICA of the corresponding SOAP dataset. ) The output :class:`.trajectory.Insight` stores the SOAP information in its -"meta" attribute, together with the ``lag_time`` parameter and ``rel_times``, +"meta" attribute, together with the ``lag_time`` parameter and ``rel_times``, the relaxation times of the computed TICs. @@ -257,7 +257,7 @@ The output :class:`.trajectory.Insight` stores the SOAP information in its "meta" attribute, together with the ``delay`` parameter. Notice that, differently from SOAP - which is computed for every frame, tSOAP -is computed for every pair of frames. Thus, the tSOAP dataset has shape +is computed for every pair of frames. Thus, the tSOAP dataset has shape ``(n_particles, n_frames - 1)``. Consequently, if you need to match the tSOAP values with the particles along the trajectory, you will need to use a sliced trajectory (removing the last frame). The easiest way to do this is: @@ -269,7 +269,7 @@ trajectory (removing the last frame). The easiest way to do this is: .. raw:: html - ⬇️ Download Python Script + ⬇️ Download Python Script .. testcode:: recipe2-test :hide: From 8115f74e4e775a2204026e0b0c7a3afead738a58 Mon Sep 17 00:00:00 2001 From: matteobecchi Date: Wed, 1 Oct 2025 19:50:18 +0200 Subject: [PATCH 11/16] Wrong units arg raises ValueError. --- src/dynsight/_internal/analysis/entropy.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 4dd7fcff..4cc141cb 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -66,6 +66,9 @@ def compute_shannon( if data.size == 0: msg = "data is empty" raise ValueError(msg) + if units not in ["bit", "nat", "frac"]: + msg = "units must be bit, nat or frac." + raise ValueError(msg) counts, _ = np.histogram( data, bins=n_bins, @@ -124,6 +127,9 @@ def compute_kl_entropy( assert np.isclose(data_entropy, -3.650626496174274) """ + if units not in ["bit", "nat"]: + msg = "units must be bit or nat." + raise ValueError(msg) data = np.sort(data.flatten()) n_data = len(data) eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances @@ -180,6 +186,9 @@ def compute_negentropy( assert np.isclose(negentropy, 0.2609932580146541) """ + if units not in ["bit", "nat"]: + msg = "units must be bit or nat." + raise ValueError(msg) data = data.flatten() rng = np.random.default_rng(seed=1234) data_norm = (data - np.mean(data)) / np.std(data, ddof=1) @@ -252,6 +261,9 @@ def compute_shannon_multi( if n_dims != len(data_ranges) or n_dims != len(n_bins): msg = "Mismatch between data dimensions, data_ranges, and n_bins" raise ValueError(msg) + if units not in ["bit", "nat", "frac"]: + msg = "units must be bit, nat or frac." + raise ValueError(msg) counts, _ = np.histogramdd(data, bins=n_bins, range=data_ranges) probs = counts / np.sum(counts) # Probability distribution @@ -308,6 +320,9 @@ def compute_kl_entropy_multi( assert np.isclose(data_entropy, -4.319358938644518) """ + if units not in ["bit", "nat"]: + msg = "units must be bit or nat." + raise ValueError(msg) n_samples, dim = data.shape tree = cKDTree(data) eps, _ = tree.query(data, k=n_neigh + 1, p=2) From 6578a08e33e2e5f10f8297d3b31131ef8c75d14d Mon Sep 17 00:00:00 2001 From: Matteo Becchi Date: Thu, 2 Oct 2025 09:52:46 +0200 Subject: [PATCH 12/16] Fixing mypy complaint. 
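mypy flags the tuple built from np.min()/np.max() in the info_gain
recipe, presumably because the numpy stubs type those reductions as
numpy scalar types rather than builtin float, so the value no longer
matches a tuple[float, float] annotation downstream (compute_shannon's
data_range, for instance, is typed that way). Casting through float()
satisfies the annotation without a type: ignore. Schematically (the
helper name below is only for illustration):

    import numpy as np
    from numpy.typing import NDArray

    def data_range_of(data: NDArray[np.float64]) -> tuple[float, float]:
        # np.min/np.max return numpy scalars; wrap them in float() so
        # the returned tuple really is tuple[float, float] for mypy.
        return (float(np.min(data)), float(np.max(data)))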
--- docs/source/_static/recipes/info_gain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_static/recipes/info_gain.py b/docs/source/_static/recipes/info_gain.py index 873839a0..97eccbe1 100644 --- a/docs/source/_static/recipes/info_gain.py +++ b/docs/source/_static/recipes/info_gain.py @@ -26,7 +26,7 @@ def info_gain_with_onion( float, ]: """Performs full information gain analysis with Onion clustering.""" - data_range = (np.min(data), np.max(data)) + data_range = (float(np.min(data)), float(np.max(data))) n_clusters = np.zeros(len(delta_t_list), dtype=int) clusters_frac = [] From ea6dd61fc29867fc12afbc1b1294e2a4baad1a60 Mon Sep 17 00:00:00 2001 From: Matteo Becchi Date: Thu, 2 Oct 2025 10:11:38 +0200 Subject: [PATCH 13/16] Using tuple in value check; explaining units of measure. --- src/dynsight/_internal/analysis/entropy.py | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 4cc141cb..69c507a7 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -35,7 +35,8 @@ def compute_shannon( units: The units of measure of the output entropy. If "frac", entropy is - normalized between 0 and 1 by dividing by log(n_bins). + normalized between 0 and 1 by dividing by log(n_bins). If "bit", + it is computed with log base 2, if "nat" with natural log. Returns: The value of the normalized Shannon entropy of the dataset. @@ -66,7 +67,7 @@ def compute_shannon( if data.size == 0: msg = "data is empty" raise ValueError(msg) - if units not in ["bit", "nat", "frac"]: + if units not in ("bit", "nat", "frac"): msg = "units must be bit, nat or frac." raise ValueError(msg) counts, _ = np.histogram( @@ -104,7 +105,8 @@ def compute_kl_entropy( The number of neighbors considered in the KL estimator. units: - The units of measure of the output entropy. + The units of measure of the output entropy. If "bit", it is + computed with log base 2, if "nat" with natural log. Returns: The Shannon differential entropy of the dataset, in bits. @@ -127,7 +129,7 @@ def compute_kl_entropy( assert np.isclose(data_entropy, -3.650626496174274) """ - if units not in ["bit", "nat"]: + if units not in ("bit", "nat"): msg = "units must be bit or nat." raise ValueError(msg) data = np.sort(data.flatten()) @@ -162,7 +164,8 @@ def compute_negentropy( The dataset for which the entropy is to be computed. units: - The units of measure of the output negentropy. + The units of measure of the output negentropy. If "bit", it is + computed with log base 2, if "nat" with natural log. Returns: The negentropy of the dataset. @@ -186,7 +189,7 @@ def compute_negentropy( assert np.isclose(negentropy, 0.2609932580146541) """ - if units not in ["bit", "nat"]: + if units not in ("bit", "nat"): msg = "units must be bit or nat." raise ValueError(msg) data = data.flatten() @@ -225,7 +228,8 @@ def compute_shannon_multi( units: The units of measure of the output entropy. If "frac", entropy is - normalized between 0 and 1 by dividing by log(n_bins). + normalized between 0 and 1 by dividing by log(n_bins). If "bit", + it is computed with log base 2, if "nat" with natural log. Returns: The value of the normalized Shannon entropy of the dataset. 
@@ -261,7 +265,7 @@ def compute_shannon_multi( if n_dims != len(data_ranges) or n_dims != len(n_bins): msg = "Mismatch between data dimensions, data_ranges, and n_bins" raise ValueError(msg) - if units not in ["bit", "nat", "frac"]: + if units not in ("bit", "nat", "frac"): msg = "units must be bit, nat or frac." raise ValueError(msg) @@ -297,7 +301,8 @@ def compute_kl_entropy_multi( The number of neighbors considered in the KL estimator. units: - The units of measure of the output entropy. + The units of measure of the output entropy. If "bit", it is + computed with log base 2, if "nat" with natural log. Returns: The Shannon differential entropy of the dataset, in bits. @@ -320,7 +325,7 @@ def compute_kl_entropy_multi( assert np.isclose(data_entropy, -4.319358938644518) """ - if units not in ["bit", "nat"]: + if units not in ("bit", "nat"): msg = "units must be bit or nat." raise ValueError(msg) n_samples, dim = data.shape From f0a4d763f01b519e989a96e0b1cdb00a557bbdd8 Mon Sep 17 00:00:00 2001 From: Matteo Becchi Date: Thu, 2 Oct 2025 10:35:13 +0200 Subject: [PATCH 14/16] Info gain with KL estimator. --- src/dynsight/_internal/analysis/entropy.py | 49 +++++++++++++++------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 69c507a7..d4a1875f 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -349,6 +349,7 @@ def compute_kl_entropy_multi( def compute_entropy_gain( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], + method: Literal["histo", "kl"] = "histo", n_bins: int = 20, ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. @@ -364,6 +365,11 @@ def compute_entropy_gain( The number of bins with which the data histogram must be computed. Default is 20. + method: + How the Shannon entropy is computed. You shoud use "histo" for + discrete variables, and "kl" for continuous variables. If "kl" is + chosen, the "n_bins" arg is irrelevant. + Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` @@ -399,28 +405,41 @@ def compute_entropy_gain( "must have same shape[0]" ) raise RuntimeError(msg) + if method not in ("histo", "kl"): + msg = "method must be histo or kl." 
+ raise ValueError(msg) - data_range = (float(np.min(data)), float(np.max(data))) - - # Compute the entropy of the raw data - total_entropy = compute_shannon( - data, - data_range, - n_bins, - ) - - # Compute the fraction and the entropy of the single clusters n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) - for i, label in enumerate(np.unique(labels)): - mask = labels == label - frac[i] = np.sum(mask) / labels.size - entr[i] = compute_shannon( - data[mask], + + if method == "histo": + data_range = (float(np.min(data)), float(np.max(data))) + # Compute the total entropy of the data + total_entropy = compute_shannon( + data, data_range, n_bins, ) + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_shannon( + data[mask], + data_range, + n_bins, + ) + else: # method == "kl" + # Compute the total entropy of the data + total_entropy = compute_kl_entropy(data) + + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_kl_entropy(data[mask]) + # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy From 74d3ab1aad9a0a78831701c1be6e1ab19e60cb01 Mon Sep 17 00:00:00 2001 From: Matteo Becchi Date: Thu, 2 Oct 2025 11:34:40 +0200 Subject: [PATCH 15/16] Debugging. Differences were OK but absolute values were wrong. --- src/dynsight/_internal/analysis/entropy.py | 92 +++++++++++++++------- tests/analysis/test_shannon.py | 21 +++++ 2 files changed, 83 insertions(+), 30 deletions(-) diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index d4a1875f..3e3d15ba 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -126,7 +126,7 @@ def compute_kl_entropy( .. testcode:: kl-entropy-test :hide: - assert np.isclose(data_entropy, -3.650626496174274) + assert np.isclose(data_entropy, 0.9891067080934253) """ if units not in ("bit", "nat"): @@ -137,11 +137,11 @@ def compute_kl_entropy( eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances eps = np.clip(eps, 1e-10, None) # avoid log(0) const = digamma(n_data) - digamma(n_neigh) + np.log(2) # 1D volume - h_bits = const + np.mean(np.log2(eps)) - if units == "bit": - return h_bits - # nat - return h_bits * np.log(2) + if units == "nat": + const = digamma(n_data) - digamma(n_neigh) + np.log(2) + return const + np.mean(np.log(eps)) + const = (digamma(n_data) - digamma(n_neigh)) / np.log(2) + 1.0 + return const + np.mean(np.log2(eps)) def compute_negentropy( @@ -322,7 +322,7 @@ def compute_kl_entropy_multi( .. 
testcode:: klm-entropy-test :hide: - assert np.isclose(data_entropy, -4.319358938644518) + assert np.isclose(data_entropy, 0.013521446183128614) """ if units not in ("bit", "nat"): @@ -334,16 +334,18 @@ def compute_kl_entropy_multi( eps = eps[:, -1] # distance to the n_neigh-th neighbor eps = np.clip(eps, 1e-10, None) # avoid log(0) unit_ball_volume = (np.pi ** (dim / 2)) / gamma(dim / 2 + 1) - entropy = ( + # --- Compute in nats --- + entropy_nats = ( digamma(n_samples) - digamma(n_neigh) - + np.log2(unit_ball_volume) - + (dim / n_samples) * np.sum(np.log2(eps)) + + np.log(unit_ball_volume) + + (dim / n_samples) * np.sum(np.log(eps)) ) - if units == "bit": - return entropy - return entropy * np.log(2) + if units == "nat": + return entropy_nats + # bits + return entropy_nats / np.log(2) def compute_entropy_gain( @@ -366,9 +368,11 @@ def compute_entropy_gain( Default is 20. method: - How the Shannon entropy is computed. You shoud use "histo" for + How the Shannon entropy is computed. You should use "histo" for discrete variables, and "kl" for continuous variables. If "kl" is - chosen, the "n_bins" arg is irrelevant. + chosen, the "n_bins" arg is irrelevant. See the documentation of + ``compute_shannon()`` and ``compute_kl_entropy()`` for more + details. Returns: * The absolute information gain :math:`H_0 - H_{clust}` @@ -376,6 +380,10 @@ def compute_entropy_gain( * The Shannon entropy of the initial data :math:`H_0` * The shannon entropy of the clustered data :math:`H_{clust}` + Note: + The output are expressed as fractions if method is "histo", in bit if + method is "kl". + Example: .. testcode:: shannon2-test @@ -456,6 +464,7 @@ def compute_entropy_gain_multi( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], n_bins: list[int], + method: Literal["histo", "kl"] = "histo", ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. @@ -472,6 +481,13 @@ def compute_entropy_gain_multi( The number of bins with which the data histogram must be computed, one for each dimension. + method: + How the Shannon entropy is computed. You should use "histo" for + discrete variables, and "kl" for continuous variables. If "kl" is + chosen, the "n_bins" arg is irrelevant. See the documentation of + ``compute_shannon_multi()`` and ``compute_kl_entropy_multi()`` for + more details. + Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` @@ -508,28 +524,44 @@ def compute_entropy_gain_multi( "must have same shape[0]" ) raise RuntimeError(msg) + if method not in ("histo", "kl"): + msg = "method must be histo or kl." 
+ raise ValueError(msg) - data_range = [(float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T] - - # Compute the entropy of the raw data - total_entropy = compute_shannon_multi( - data, - data_range, - n_bins, - ) - - # Compute the fraction and the entropy of the single clusters n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) - for i, label in enumerate(np.unique(labels)): - mask = labels == label - frac[i] = np.sum(mask) / labels.size - entr[i] = compute_shannon_multi( - data[mask], + + if method == "histo": + data_range = [ + (float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T + ] + + # Compute the total entropy of the data + total_entropy = compute_shannon_multi( + data, data_range, n_bins, ) + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_shannon_multi( + data[mask], + data_range, + n_bins, + ) + else: # method == "kl" + # Compute the total entropy of the data + total_entropy = compute_kl_entropy_multi(data) + + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_kl_entropy_multi(data[mask]) + # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy diff --git a/tests/analysis/test_shannon.py b/tests/analysis/test_shannon.py index 2f90d41f..185c10de 100644 --- a/tests/analysis/test_shannon.py +++ b/tests/analysis/test_shannon.py @@ -27,6 +27,14 @@ def data_2d(rng: np.random.Generator) -> NDArray[np.float64]: return rng.random((100, 2)) +@pytest.fixture +def data_gauss(rng: np.random.Generator) -> NDArray[np.float64]: + """Random 2-Gaussians array.""" + data_1 = rng.normal(0.0, 0.1, 10000) + data_2 = rng.normal(1.0, 0.1, 10000) + return np.concatenate((data_1, data_2)) + + @pytest.fixture def labels(rng: np.random.Generator) -> NDArray[np.int64]: """Valid integer labels for 100 samples.""" @@ -89,6 +97,19 @@ def test_gain(data: NDArray[np.float64], labels: NDArray[np.int64]) -> None: assert np.isclose(gain, ref) +def test_kl_gain(data_gauss: NDArray[np.float64]) -> None: + """Check entropy gain value using KL estimator.""" + labels = np.concatenate( + (np.zeros(10000, dtype=int), np.ones(10000, dtype=int)) + ) + gain, *_ = dynsight.analysis.compute_entropy_gain( + data_gauss, + labels, + method="kl", + ) + assert np.isclose(gain, 1.0, rtol=1e-3, atol=1e-3) + + def test_gain_multi( data_2d: NDArray[np.float64], labels: NDArray[np.int64], From 3ba3d3c76aad65921b07f48a0101267de06fd34d Mon Sep 17 00:00:00 2001 From: Matteo Becchi Date: Thu, 2 Oct 2025 11:39:19 +0200 Subject: [PATCH 16/16] Fixing docs. --- docs/source/analysis.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index 7f5fd089..7b7a4589 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -18,6 +18,7 @@ information-based calculations. 
compute_negentropy <_autosummary/dynsight.analysis.compute_negentropy> compute_entropy_gain <_autosummary/dynsight.analysis.compute_entropy_gain> compute_shannon_multi <_autosummary/dynsight.analysis.compute_shannon_multi> + compute_kl_entropy_multi <_autosummary/dynsight.analysis.compute_kl_entropy_multi> compute_entropy_gain_multi <_autosummary/dynsight.analysis.compute_entropy_gain_multi> sample_entropy <_autosummary/dynsight.analysis.sample_entropy>