diff --git a/docs/source/_static/recipes/entropy.py b/docs/source/_static/recipes/entropy.py
new file mode 100644
index 00000000..bc12e793
--- /dev/null
+++ b/docs/source/_static/recipes/entropy.py
@@ -0,0 +1,69 @@
+"""Copiable code from Recipe #4."""
+
+import numpy as np
+
+import dynsight
+
+rng = np.random.default_rng(42) # set the random seed
+
+# Entropy of a discrete variable
+n_sample = 10000
+rolls = rng.integers(1, 7, size=n_sample)
+dice_entropy = dynsight.analysis.compute_shannon(
+ data=rolls.astype(float),
+ data_range=(1, 6),
+ n_bins=6,
+ units="bit",
+)
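+# dice_entropy should be close to log2(6) ~ 2.585 bits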
+
+# Entropy of a discrete multivariate variable
+n_sample = 10000
+rolls = rng.integers(1, 7, size=(n_sample, 2))
+dices_entropy = dynsight.analysis.compute_shannon_multi(
+ data=rolls.astype(float),
+ data_ranges=[(1, 6), (1, 6)],
+ n_bins=[6, 6],
+ units="bit",
+)
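+# dices_entropy should be close to log2(36) ~ 5.17 bits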
+
+
+# Entropy of a continuous variable
+n_sample = 10000000
+data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample)
+data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample)
+
+gauss_entropy_1 = dynsight.analysis.compute_kl_entropy(
+ data=data_1,
+ units="bit",
+)
+gauss_entropy_2 = dynsight.analysis.compute_kl_entropy(
+ data=data_2,
+ units="bit",
+)
+diff = gauss_entropy_2 - gauss_entropy_1
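+# diff should be close to 1 bit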
+
+
+# Entropy of a continuous multivariate variable
+n_sample = 100000
+mean = [1, 1]
+cov = np.array([[1, 0], [0, 1]])
+data_1 = rng.multivariate_normal(
+ mean=mean,
+ cov=cov,
+ size=n_sample,
+)
+data_2 = rng.multivariate_normal(
+ mean=mean,
+ cov=cov * 4.0,
+ size=n_sample,
+)
+
+gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi(
+ data=data_1,
+ units="bit",
+)
+gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi(
+ data=data_2,
+ units="bit",
+)
+diff_2d = gauss_entropy_2 - gauss_entropy_1
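+# diff_2d should be close to 2 bits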
diff --git a/docs/source/_static/recipes/info_gain.py b/docs/source/_static/recipes/info_gain.py
index 873839a0..97eccbe1 100644
--- a/docs/source/_static/recipes/info_gain.py
+++ b/docs/source/_static/recipes/info_gain.py
@@ -26,7 +26,7 @@ def info_gain_with_onion(
float,
]:
"""Performs full information gain analysis with Onion clustering."""
- data_range = (np.min(data), np.max(data))
+ data_range = (float(np.min(data)), float(np.max(data)))
n_clusters = np.zeros(len(delta_t_list), dtype=int)
clusters_frac = []
diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst
index 7f5fd089..7b7a4589 100644
--- a/docs/source/analysis.rst
+++ b/docs/source/analysis.rst
@@ -18,6 +18,7 @@ information-based calculations.
compute_negentropy <_autosummary/dynsight.analysis.compute_negentropy>
compute_entropy_gain <_autosummary/dynsight.analysis.compute_entropy_gain>
compute_shannon_multi <_autosummary/dynsight.analysis.compute_shannon_multi>
+ compute_kl_entropy_multi <_autosummary/dynsight.analysis.compute_kl_entropy_multi>
compute_entropy_gain_multi <_autosummary/dynsight.analysis.compute_entropy_gain_multi>
sample_entropy <_autosummary/dynsight.analysis.sample_entropy>
diff --git a/docs/source/descr_from_trj.rst b/docs/source/descr_from_trj.rst
index b7d21716..213056dc 100644
--- a/docs/source/descr_from_trj.rst
+++ b/docs/source/descr_from_trj.rst
@@ -1,12 +1,12 @@
-Descriptors from a :class:`.trajectory.Trj`
+Descriptors from a :class:`.trajectory.Trj`
===========================================
-This recipe explains how to compute descriptors directly from a
-:class:`.trajectory.Trj` object.
+This recipe explains how to compute descriptors directly from a
+:class:`.trajectory.Trj` object.
.. warning::
- This code works when run from the ``/docs`` directory of the ``dynsight``
+ This code works when run from the ``/docs`` directory of the ``dynsight``
repo. To use it elsewhere, you have to change the ``Path`` variables
accordingly.
@@ -26,7 +26,7 @@ it's directly calculated by the :class:`.trajectory.Trj.get_soap()` method.
.. warning::
Please consider that the SOAP dataset can be very large, due to the high
- dimensionality, thus calculations can be expensive, and saving to/loading
+ dimensionality, thus calculations can be expensive, and saving to/loading
from file quite slow.
.. testcode:: recipe1-test
@@ -83,7 +83,7 @@ calculation can be sped up significantly.
)
Notice that, differently from SOAP - which is computed for every frame, LENS
-is computed for every pair of frames. Thus, the LENS dataset has shape
+is computed for every pair of frames. Thus, the LENS dataset has shape
``(n_particles, n_frames - 1)``. Consequently, if you need to match the LENS
values with the particles along the trajectory, you will need to use a sliced
trajectory (removing the last frame). The easiest way to do this is:
@@ -95,7 +95,7 @@ trajectory (removing the last frame). The easiest way to do this is:
.. raw:: html
- ⬇️ Download Python Script
+ ⬇️ Download Python Script
.. testcode:: recipe1-test
:hide:
diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst
new file mode 100644
index 00000000..115afadb
--- /dev/null
+++ b/docs/source/entropy.rst
@@ -0,0 +1,129 @@
+Entropy calculations
+====================
+
+This recipe explains how to compute Shannon entropy for different types of
+datasets using the functions in the ``dynsight.analysis`` module.
+
+First of all, we import all the packages and objects we'll need:
+
+.. testcode:: recipe4-test
+
+ import numpy as np
+ import dynsight
+
+ rng = np.random.default_rng(42) # set the random seed
+
+
+Entropy of a discrete variable
+------------------------------
+
+Let's compute the Shannon entropy of rolling a die ``n_sample`` times, which
+should equal log2(6) bits, since a fair die is uniform over six outcomes.
+
+.. testcode:: recipe4-test
+
+ n_sample = 10000
+ rolls = rng.integers(1, 7, size=n_sample)
+
+ dice_entropy = dynsight.analysis.compute_shannon(
+ data=rolls.astype(float),
+ data_range=(1,6),
+ n_bins=6,
+ units="bit",
+ )
+ # dice_entropy = 2.584832195231254 ~ log2(6)
+
+
+Entropy of a discrete multivariate variable
+-------------------------------------------
+
+Let's compute the Shannon entropy of rolling `two` dice ``n_sample`` times,
+which should equal log2(36) bits (two independent uniform outcomes multiply
+the number of states: 6 x 6 = 36).
+
+.. testcode:: recipe4-test
+
+ n_sample = 10000
+ rolls = rng.integers(1, 7, size=(n_sample, 2))
+
+ dices_entropy = dynsight.analysis.compute_shannon_multi(
+ data=rolls.astype(float),
+ data_ranges=[(1,6), (1,6)],
+ n_bins=[6, 6],
+ units="bit",
+ )
+ # dices_entropy = 5.168428344754391 ~ log2(36)
+
+
+Entropy of a continuous variable
+---------------------------------
+
+Shannon entropy is not uniquely defined for continuous variables, but the
+difference between the entropies of different distributions is. Let's compute
+the difference between the Shannon entropies of two Gaussian distributions
+with standard deviations of 1 and 2, which should be 1 bit.
+
+.. testcode:: recipe4-test
+
+ n_sample = 10000000
+ data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample)
+ data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample)
+
+ gauss_entropy_1 = dynsight.analysis.compute_kl_entropy(
+ data=data_1,
+ units="bit",
+ )
+ gauss_entropy_2 = dynsight.analysis.compute_kl_entropy(
+ data=data_2,
+ units="bit",
+ )
+ diff = gauss_entropy_2 - gauss_entropy_1
+ # diff = 1.0010395631476854
+
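+This value follows from the analytic entropy of a Gaussian,
+:math:`H = \tfrac{1}{2}\log_2(2\pi e \sigma^2)` bits: doubling the standard
+deviation adds exactly :math:`\log_2 2 = 1` bit.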
+
+Entropy of a continuous multivariate variable
+---------------------------------------------
+
+And the same holds for multivariate distributions. Let's compute the
+difference between the Shannon entropies of two bivariate Gaussian
+distributions with standard deviations of 1 and 2, which should be 2 bits.
+
+.. testcode:: recipe4-test
+
+ n_sample = 100000
+ mean = [1, 1]
+ cov = np.array([[1, 0], [0, 1]])
+ data_1 = rng.multivariate_normal(
+ mean=mean,
+ cov=cov,
+ size=n_sample,
+ )
+ data_2 = rng.multivariate_normal(
+ mean=mean,
+ cov=cov * 4.0,
+ size=n_sample,
+ )
+
+ gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi(
+ data=data_1,
+ units="bit",
+ )
+ gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi(
+ data=data_2,
+ units="bit",
+ )
+ diff_2d = gauss_entropy_2 - gauss_entropy_1
+ # diff_2d = 2.0142525628908743
+
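+Here the expected value follows from the entropy of a d-dimensional Gaussian,
+:math:`H = \tfrac{1}{2}\log_2\bigl((2\pi e)^d \, |\Sigma|\bigr)` bits: scaling
+a 2x2 covariance by 4 multiplies :math:`|\Sigma|` by 16, adding
+:math:`\tfrac{1}{2}\log_2 16 = 2` bits.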
+
+.. raw:: html
+
+ ⬇️ Download Python Script
+
+.. testcode:: recipe4-test
+ :hide:
+
+ assert np.isclose(dice_entropy, np.log2(6), rtol=1e-3)
+ assert np.isclose(dices_entropy, np.log2(36), rtol=1e-3)
+ assert np.isclose(diff, 1, rtol=1e-3, atol=1e-4)
+ assert np.isclose(diff_2d, 2, rtol=1e-2, atol=1e-2)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index a52ae8ac..3ae663ba 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -34,6 +34,7 @@
Descriptors from a Trj
Dimensionality reduction methods
Information gain analysis
+ Entropy calculations
.. toctree::
:hidden:
@@ -81,7 +82,7 @@ How to get started
------------------
We suggest you give a read to the ``dynsight.trajectory`` module documentation,
-which offers a compact and easy way of using most of the ``dynsight`` tools.
+which offers a compact and easy way of using most of the ``dynsight`` tools.
Also, the documentation offers some copiable Recipes and Examples for the most
common analyses.
diff --git a/docs/source/info_gain.rst b/docs/source/info_gain.rst
index 44b1d2f2..5af73631 100644
--- a/docs/source/info_gain.rst
+++ b/docs/source/info_gain.rst
@@ -3,16 +3,16 @@ Information gain analysis
For the theoretical aspects of this work, see https://doi.org/10.48550/arXiv.2504.12990.
-This recipe explains how to compute the information gain through clustering
+This recipe explains how to compute the information gain through clustering
analysis. We use a synthetic dataset containing a signal that oscillates
between 0 and 1, with Gaussian noise. Onion clustering is run on a broad
range of time resolutions ∆t. The information gain and the Shannon entropy of
-the environments is computed for each value of ∆t. The analysis is implemented
+the environments is computed for each value of ∆t. The analysis is implemented
using onion 2.0.0 ("onion smooth").
.. warning::
- This code works when run from the ``/docs`` directory of the ``dynsight``
+ This code works when run from the ``/docs`` directory of the ``dynsight``
repo. To use it elsewhere, you have to change the ``Path`` variables
accordingly.
@@ -54,7 +54,7 @@ Let's start by creating a the synthetic dataset:
The following function takes as input the dataset, and a list of values
of time resolutions ∆t, and for each of these it performs Onion clustering, and
-computes the information gain achieved through clustering with that ∆t.
+computes the information gain achieved through clustering with that ∆t.
.. warning::
@@ -222,7 +222,7 @@ gain goes to 0.
.. raw:: html
- ⬇️ Download Python Script
+ ⬇️ Download Python Script
.. testcode:: recipe3-test
:hide:
diff --git a/docs/source/soap_dim_red.rst b/docs/source/soap_dim_red.rst
index 59740a65..06521f45 100644
--- a/docs/source/soap_dim_red.rst
+++ b/docs/source/soap_dim_red.rst
@@ -1,4 +1,4 @@
-Dimensionality reduction methods
+Dimensionality reduction methods
================================
This recipe explains how to compute descriptors via dimensionality reduction
@@ -15,12 +15,12 @@ from dynsight.utilities.
.. warning::
Please consider that the SOAP dataset can be very large, due to the high
- dimensionality, thus calculations can be expensive, and saving to/loading
+ dimensionality, thus calculations can be expensive, and saving to/loading
from file quite slow.
.. warning::
- This code works when run from the ``/docs`` directory of the ``dynsight``
+ This code works when run from the ``/docs`` directory of the ``dynsight``
repo. To use it elsewhere, you have to change the ``Path`` variables
accordingly.
@@ -189,7 +189,7 @@ parameters, and performs the TICA of the corresponding SOAP dataset.
)
The output :class:`.trajectory.Insight` stores the SOAP information in its
-"meta" attribute, together with the ``lag_time`` parameter and ``rel_times``,
+"meta" attribute, together with the ``lag_time`` parameter and ``rel_times``,
the relaxation times of the computed TICs.
@@ -257,7 +257,7 @@ The output :class:`.trajectory.Insight` stores the SOAP information in its
"meta" attribute, together with the ``delay`` parameter.
Notice that, differently from SOAP - which is computed for every frame, tSOAP
-is computed for every pair of frames. Thus, the tSOAP dataset has shape
+is computed for every pair of frames. Thus, the tSOAP dataset has shape
``(n_particles, n_frames - 1)``. Consequently, if you need to match the tSOAP
values with the particles along the trajectory, you will need to use a sliced
trajectory (removing the last frame). The easiest way to do this is:
@@ -269,7 +269,7 @@ trajectory (removing the last frame). The easiest way to do this is:
.. raw:: html
- ⬇️ Download Python Script
+ ⬇️ Download Python Script
.. testcode:: recipe2-test
:hide:
diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py
index 30790072..3e3d15ba 100644
--- a/src/dynsight/_internal/analysis/entropy.py
+++ b/src/dynsight/_internal/analysis/entropy.py
@@ -1,20 +1,22 @@
from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
if TYPE_CHECKING:
from numpy.typing import NDArray
import numpy as np
import numpy.typing as npt
+from scipy.spatial import cKDTree
from scipy.spatial.distance import cdist
-from scipy.special import digamma
+from scipy.special import digamma, gamma
def compute_shannon(
data: NDArray[np.float64],
data_range: tuple[float, float],
n_bins: int,
+ units: Literal["bit", "nat", "frac"] = "frac",
) -> float:
"""Compute the Shannon entropy of a univariate data distribution.
@@ -31,6 +33,11 @@ def compute_shannon(
n_bins:
The number of bins with which the data histogram must be computed.
+ units:
+ The units of measure of the output entropy. If "frac", entropy is
+ normalized between 0 and 1 by dividing by log(n_bins). If "bit",
+ it is computed with log base 2, if "nat" with natural log.
+
Returns:
The value of the normalized Shannon entropy of the dataset.
@@ -60,6 +67,9 @@ def compute_shannon(
if data.size == 0:
msg = "data is empty"
raise ValueError(msg)
+ if units not in ("bit", "nat", "frac"):
+ msg = "units must be bit, nat or frac."
+ raise ValueError(msg)
counts, _ = np.histogram(
data,
bins=n_bins,
@@ -67,11 +77,19 @@ def compute_shannon(
)
probs = counts / np.sum(counts) # Data probabilities are needed
entropy = -np.sum([p * np.log2(p) for p in probs if p > 0.0])
- entropy /= np.log2(n_bins)
- return entropy
+
+ if units == "bit":
+ return entropy
+ if units == "nat":
+ return entropy * np.log(2)
+ return entropy / np.log2(n_bins)
-def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float:
+def compute_kl_entropy(
+ data: NDArray[np.float64],
+ n_neigh: int = 1,
+ units: Literal["bit", "nat"] = "bit",
+) -> float:
"""Estimate Shannon differential entropy using Kozachenko-Leonenko.
The Kozachenko-Leonenko k-nearest neighbors method approximates
@@ -86,6 +104,10 @@ def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float:
n_neigh:
The number of neighbors considered in the KL estimator.
+ units:
+ The units of measure of the output entropy. If "bit", it is
+ computed with log base 2, if "nat" with natural log.
+
Returns:
The Shannon differential entropy of the dataset, in bits.
@@ -104,18 +126,28 @@ def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float:
.. testcode:: kl-entropy-test
:hide:
- assert np.isclose(data_entropy, -3.3437736767342194)
+ assert np.isclose(data_entropy, 0.9891067080934253)
"""
+ if units not in ("bit", "nat"):
+ msg = "units must be bit or nat."
+ raise ValueError(msg)
data = np.sort(data.flatten())
n_data = len(data)
eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances
eps = np.clip(eps, 1e-10, None) # avoid log(0)
- const = digamma(n_data) - digamma(n_neigh) + 1
+    if units == "nat":
+        # log(2) is the volume of the 1D unit ball
+        const = digamma(n_data) - digamma(n_neigh) + np.log(2)
+        return const + np.mean(np.log(eps))
+    # the same estimator expressed in base 2: log2(2) = 1.0
+    const = (digamma(n_data) - digamma(n_neigh)) / np.log(2) + 1.0
return const + np.mean(np.log2(eps))
-def compute_negentropy(data: NDArray[np.float64]) -> float:
+def compute_negentropy(
+ data: NDArray[np.float64],
+ units: Literal["bit", "nat"] = "bit",
+) -> float:
"""Estimate negentropy of a dataset.
Negentropy is a measure of non-Gaussianity representing the distance
@@ -131,8 +163,12 @@ def compute_negentropy(data: NDArray[np.float64]) -> float:
data:
The dataset for which the entropy is to be computed.
+ units:
+ The units of measure of the output negentropy. If "bit", it is
+ computed with log base 2, if "nat" with natural log.
+
Returns:
- The negentropy of the dataset, in bits.
+ The negentropy of the dataset.
Example:
@@ -153,13 +189,16 @@ def compute_negentropy(data: NDArray[np.float64]) -> float:
assert np.isclose(negentropy, 0.2609932580146541)
"""
+ if units not in ("bit", "nat"):
+ msg = "units must be bit or nat."
+ raise ValueError(msg)
data = data.flatten()
rng = np.random.default_rng(seed=1234)
data_norm = (data - np.mean(data)) / np.std(data, ddof=1)
sigma = np.std(data_norm, ddof=1)
data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size)
- h_gauss = compute_kl_entropy(data_gauss)
- h_data = compute_kl_entropy(data_norm)
+ h_gauss = compute_kl_entropy(data_gauss, units=units)
+ h_data = compute_kl_entropy(data_norm, units=units)
return h_gauss - h_data
@@ -167,6 +206,7 @@ def compute_shannon_multi(
data: NDArray[np.float64],
data_ranges: list[tuple[float, float]],
n_bins: list[int],
+ units: Literal["bit", "nat", "frac"] = "frac",
) -> float:
"""Compute the Shannon entropy of a multivariate data distribution.
@@ -186,6 +226,11 @@ def compute_shannon_multi(
A list of integers specifying the number of bins for each
dimension.
+ units:
+ The units of measure of the output entropy. If "frac", entropy is
+ normalized between 0 and 1 by dividing by log(n_bins). If "bit",
+ it is computed with log base 2, if "nat" with natural log.
+
Returns:
The value of the normalized Shannon entropy of the dataset.
@@ -220,18 +265,93 @@ def compute_shannon_multi(
if n_dims != len(data_ranges) or n_dims != len(n_bins):
msg = "Mismatch between data dimensions, data_ranges, and n_bins"
raise ValueError(msg)
+ if units not in ("bit", "nat", "frac"):
+ msg = "units must be bit, nat or frac."
+ raise ValueError(msg)
counts, _ = np.histogramdd(data, bins=n_bins, range=data_ranges)
probs = counts / np.sum(counts) # Probability distribution
entropy = -np.sum(probs[probs > 0] * np.log2(probs[probs > 0]))
- entropy /= np.log2(np.prod(n_bins)) # Normalization
- return entropy
+ if units == "bit":
+ return entropy
+ if units == "nat":
+ return entropy * np.log(2)
+ return entropy / np.log2(np.prod(n_bins)) # Normalization
+
+
+def compute_kl_entropy_multi(
+ data: NDArray[np.float64],
+ n_neigh: int = 1,
+ units: Literal["bit", "nat"] = "bit",
+) -> float:
+ """Estimate Shannon differential entropy using Kozachenko-Leonenko.
+
+    This function works for multivariate distributions.
+    The Kozachenko-Leonenko k-nearest neighbors method approximates
+    differential entropy based on distances to nearest neighbors
+    in the sample space. Its main advantage is being parameter-free.
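+
+    In d dimensions the estimate (in nats) reads
+    :math:`H = \psi(N) - \psi(k) + \log V_d + \frac{d}{N} \sum_i \log \epsilon_i`,
+    where :math:`V_d` is the volume of the d-dimensional unit ball and
+    :math:`\epsilon_i` is the distance from sample i to its k-th neighbor.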
+
+ Parameters:
+ data:
+ The dataset for which the entropy is to be computed.
+ Shape (n_data, n_dims)
+
+ n_neigh:
+ The number of neighbors considered in the KL estimator.
+
+ units:
+ The units of measure of the output entropy. If "bit", it is
+ computed with log base 2, if "nat" with natural log.
+
+ Returns:
+        The Shannon differential entropy of the dataset, in the selected
+        units.
+
+ Example:
+
+ .. testcode:: klm-entropy-test
+
+ import numpy as np
+ from dynsight.analysis import compute_kl_entropy_multi
+
+ np.random.seed(1234)
+ data = np.random.rand(10000, 2)
+
+ data_entropy = compute_kl_entropy_multi(data)
+
+ .. testcode:: klm-entropy-test
+ :hide:
+
+ assert np.isclose(data_entropy, 0.013521446183128614)
+
+ """
+ if units not in ("bit", "nat"):
+ msg = "units must be bit or nat."
+ raise ValueError(msg)
+ n_samples, dim = data.shape
+ tree = cKDTree(data)
+ eps, _ = tree.query(data, k=n_neigh + 1, p=2)
+ eps = eps[:, -1] # distance to the n_neigh-th neighbor
+ eps = np.clip(eps, 1e-10, None) # avoid log(0)
+ unit_ball_volume = (np.pi ** (dim / 2)) / gamma(dim / 2 + 1)
+ # --- Compute in nats ---
+ entropy_nats = (
+ digamma(n_samples)
+ - digamma(n_neigh)
+ + np.log(unit_ball_volume)
+ + (dim / n_samples) * np.sum(np.log(eps))
+ )
+
+ if units == "nat":
+ return entropy_nats
+ # bits
+ return entropy_nats / np.log(2)
def compute_entropy_gain(
data: npt.NDArray[np.float64],
labels: npt.NDArray[np.int64],
+ method: Literal["histo", "kl"] = "histo",
n_bins: int = 20,
) -> tuple[float, float, float, float]:
"""Compute the relative information gained by the clustering.
@@ -247,12 +367,23 @@ def compute_entropy_gain(
The number of bins with which the data histogram must be computed.
Default is 20.
+ method:
+ How the Shannon entropy is computed. You should use "histo" for
+ discrete variables, and "kl" for continuous variables. If "kl" is
+ chosen, the "n_bins" arg is irrelevant. See the documentation of
+ ``compute_shannon()`` and ``compute_kl_entropy()`` for more
+ details.
+
Returns:
* The absolute information gain :math:`H_0 - H_{clust}`
* The relative information gain :math:`(H_0 - H_{clust}) / H_0`
* The Shannon entropy of the initial data :math:`H_0`
* The shannon entropy of the clustered data :math:`H_{clust}`
+ Note:
+        The outputs are expressed as fractions if method is "histo", and in
+        bits if method is "kl".
+
Example:
.. testcode:: shannon2-test
@@ -282,28 +413,41 @@ def compute_entropy_gain(
"must have same shape[0]"
)
raise RuntimeError(msg)
+ if method not in ("histo", "kl"):
+ msg = "method must be histo or kl."
+ raise ValueError(msg)
- data_range = (float(np.min(data)), float(np.max(data)))
-
- # Compute the entropy of the raw data
- total_entropy = compute_shannon(
- data,
- data_range,
- n_bins,
- )
-
- # Compute the fraction and the entropy of the single clusters
n_clusters = np.unique(labels).size
frac, entr = np.zeros(n_clusters), np.zeros(n_clusters)
- for i, label in enumerate(np.unique(labels)):
- mask = labels == label
- frac[i] = np.sum(mask) / labels.size
- entr[i] = compute_shannon(
- data[mask],
+
+ if method == "histo":
+ data_range = (float(np.min(data)), float(np.max(data)))
+ # Compute the total entropy of the data
+ total_entropy = compute_shannon(
+ data,
data_range,
n_bins,
)
+ # Compute the fraction and the entropy of the single clusters
+ for i, label in enumerate(np.unique(labels)):
+ mask = labels == label
+ frac[i] = np.sum(mask) / labels.size
+ entr[i] = compute_shannon(
+ data[mask],
+ data_range,
+ n_bins,
+ )
+ else: # method == "kl"
+ # Compute the total entropy of the data
+ total_entropy = compute_kl_entropy(data)
+
+ # Compute the fraction and the entropy of the single clusters
+ for i, label in enumerate(np.unique(labels)):
+ mask = labels == label
+ frac[i] = np.sum(mask) / labels.size
+ entr[i] = compute_kl_entropy(data[mask])
+
# Compute the entropy of the clustered data
clustered_entropy = np.dot(frac, entr)
info_gain = total_entropy - clustered_entropy
@@ -320,6 +464,7 @@ def compute_entropy_gain_multi(
data: npt.NDArray[np.float64],
labels: npt.NDArray[np.int64],
n_bins: list[int],
+ method: Literal["histo", "kl"] = "histo",
) -> tuple[float, float, float, float]:
"""Compute the relative information gained by the clustering.
@@ -336,6 +481,13 @@ def compute_entropy_gain_multi(
The number of bins with which the data histogram must be computed,
one for each dimension.
+ method:
+ How the Shannon entropy is computed. You should use "histo" for
+ discrete variables, and "kl" for continuous variables. If "kl" is
+ chosen, the "n_bins" arg is irrelevant. See the documentation of
+ ``compute_shannon_multi()`` and ``compute_kl_entropy_multi()`` for
+ more details.
+
Returns:
* The absolute information gain :math:`H_0 - H_{clust}`
* The relative information gain :math:`(H_0 - H_{clust}) / H_0`
@@ -372,28 +524,44 @@ def compute_entropy_gain_multi(
"must have same shape[0]"
)
raise RuntimeError(msg)
+ if method not in ("histo", "kl"):
+ msg = "method must be histo or kl."
+ raise ValueError(msg)
- data_range = [(float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T]
-
- # Compute the entropy of the raw data
- total_entropy = compute_shannon_multi(
- data,
- data_range,
- n_bins,
- )
-
- # Compute the fraction and the entropy of the single clusters
n_clusters = np.unique(labels).size
frac, entr = np.zeros(n_clusters), np.zeros(n_clusters)
- for i, label in enumerate(np.unique(labels)):
- mask = labels == label
- frac[i] = np.sum(mask) / labels.size
- entr[i] = compute_shannon_multi(
- data[mask],
+
+ if method == "histo":
+ data_range = [
+ (float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T
+ ]
+
+ # Compute the total entropy of the data
+ total_entropy = compute_shannon_multi(
+ data,
data_range,
n_bins,
)
+ # Compute the fraction and the entropy of the single clusters
+ for i, label in enumerate(np.unique(labels)):
+ mask = labels == label
+ frac[i] = np.sum(mask) / labels.size
+ entr[i] = compute_shannon_multi(
+ data[mask],
+ data_range,
+ n_bins,
+ )
+ else: # method == "kl"
+ # Compute the total entropy of the data
+ total_entropy = compute_kl_entropy_multi(data)
+
+ # Compute the fraction and the entropy of the single clusters
+ for i, label in enumerate(np.unique(labels)):
+ mask = labels == label
+ frac[i] = np.sum(mask) / labels.size
+ entr[i] = compute_kl_entropy_multi(data[mask])
+
# Compute the entropy of the clustered data
clustered_entropy = np.dot(frac, entr)
info_gain = total_entropy - clustered_entropy
diff --git a/src/dynsight/analysis.py b/src/dynsight/analysis.py
index 3b59bcfa..4f08dc8a 100644
--- a/src/dynsight/analysis.py
+++ b/src/dynsight/analysis.py
@@ -4,6 +4,7 @@
compute_entropy_gain,
compute_entropy_gain_multi,
compute_kl_entropy,
+ compute_kl_entropy_multi,
compute_negentropy,
compute_shannon,
compute_shannon_multi,
@@ -22,6 +23,7 @@
"compute_entropy_gain",
"compute_entropy_gain_multi",
"compute_kl_entropy",
+ "compute_kl_entropy_multi",
"compute_negentropy",
"compute_rdf",
"compute_shannon",
diff --git a/tests/analysis/test_shannon.py b/tests/analysis/test_shannon.py
index 2f90d41f..185c10de 100644
--- a/tests/analysis/test_shannon.py
+++ b/tests/analysis/test_shannon.py
@@ -27,6 +27,14 @@ def data_2d(rng: np.random.Generator) -> NDArray[np.float64]:
return rng.random((100, 2))
+@pytest.fixture
+def data_gauss(rng: np.random.Generator) -> NDArray[np.float64]:
+ """Random 2-Gaussians array."""
+ data_1 = rng.normal(0.0, 0.1, 10000)
+ data_2 = rng.normal(1.0, 0.1, 10000)
+ return np.concatenate((data_1, data_2))
+
+
@pytest.fixture
def labels(rng: np.random.Generator) -> NDArray[np.int64]:
"""Valid integer labels for 100 samples."""
@@ -89,6 +97,19 @@ def test_gain(data: NDArray[np.float64], labels: NDArray[np.int64]) -> None:
assert np.isclose(gain, ref)
+def test_kl_gain(data_gauss: NDArray[np.float64]) -> None:
+ """Check entropy gain value using KL estimator."""
+ labels = np.concatenate(
+ (np.zeros(10000, dtype=int), np.ones(10000, dtype=int))
+ )
+ gain, *_ = dynsight.analysis.compute_entropy_gain(
+ data_gauss,
+ labels,
+ method="kl",
+ )
+ assert np.isclose(gain, 1.0, rtol=1e-3, atol=1e-3)
+
+
def test_gain_multi(
data_2d: NDArray[np.float64],
labels: NDArray[np.int64],