diff --git a/docs/source/_static/recipes/entropy.py b/docs/source/_static/recipes/entropy.py new file mode 100644 index 00000000..bc12e793 --- /dev/null +++ b/docs/source/_static/recipes/entropy.py @@ -0,0 +1,69 @@ +"""Copiable code from Recipe #4.""" + +import numpy as np + +import dynsight + +rng = np.random.default_rng(42) # set the random seed + +# Entropy of a discrete variable +n_sample = 10000 +rolls = rng.integers(1, 7, size=n_sample) +dice_entropy = dynsight.analysis.compute_shannon( + data=rolls.astype(float), + data_range=(1, 6), + n_bins=6, + units="bit", +) + +# Entropy of a discrete multivariate variable +n_sample = 10000 +rolls = rng.integers(1, 7, size=(n_sample, 2)) +dices_entropy = dynsight.analysis.compute_shannon_multi( + data=rolls.astype(float), + data_ranges=[(1, 6), (1, 6)], + n_bins=[6, 6], + units="bit", +) + + +# Entropy of a continuous variable +n_sample = 10000000 +data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample) +data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample) + +gauss_entropy_1 = dynsight.analysis.compute_kl_entropy( + data=data_1, + units="bit", +) +gauss_entropy_2 = dynsight.analysis.compute_kl_entropy( + data=data_2, + units="bit", +) +diff = gauss_entropy_2 - gauss_entropy_1 + + +# Entropy of a continuous multivariate variable +n_sample = 100000 +mean = [1, 1] +cov = np.array([[1, 0], [0, 1]]) +data_1 = rng.multivariate_normal( + mean=mean, + cov=cov, + size=n_sample, +) +data_2 = rng.multivariate_normal( + mean=mean, + cov=cov * 4.0, + size=n_sample, +) + +gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi( + data=data_1, + units="bit", +) +gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi( + data=data_2, + units="bit", +) +diff_2d = gauss_entropy_2 - gauss_entropy_1 diff --git a/docs/source/_static/recipes/info_gain.py b/docs/source/_static/recipes/info_gain.py index 873839a0..97eccbe1 100644 --- a/docs/source/_static/recipes/info_gain.py +++ b/docs/source/_static/recipes/info_gain.py @@ -26,7 +26,7 @@ def info_gain_with_onion( float, ]: """Performs full information gain analysis with Onion clustering.""" - data_range = (np.min(data), np.max(data)) + data_range = (float(np.min(data)), float(np.max(data))) n_clusters = np.zeros(len(delta_t_list), dtype=int) clusters_frac = [] diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index 7f5fd089..7b7a4589 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -18,6 +18,7 @@ information-based calculations. compute_negentropy <_autosummary/dynsight.analysis.compute_negentropy> compute_entropy_gain <_autosummary/dynsight.analysis.compute_entropy_gain> compute_shannon_multi <_autosummary/dynsight.analysis.compute_shannon_multi> + compute_kl_entropy_multi <_autosummary/dynsight.analysis.compute_kl_entropy_multi> compute_entropy_gain_multi <_autosummary/dynsight.analysis.compute_entropy_gain_multi> sample_entropy <_autosummary/dynsight.analysis.sample_entropy> diff --git a/docs/source/descr_from_trj.rst b/docs/source/descr_from_trj.rst index b7d21716..213056dc 100644 --- a/docs/source/descr_from_trj.rst +++ b/docs/source/descr_from_trj.rst @@ -1,12 +1,12 @@ -Descriptors from a :class:`.trajectory.Trj` +Descriptors from a :class:`.trajectory.Trj` =========================================== -This recipe explains how to compute descriptors directly from a -:class:`.trajectory.Trj` object. +This recipe explains how to compute descriptors directly from a +:class:`.trajectory.Trj` object. .. warning:: - This code works when run from the ``/docs`` directory of the ``dynsight`` + This code works when run from the ``/docs`` directory of the ``dynsight`` repo. To use it elsewhere, you have to change the ``Path`` variables accordingly. @@ -26,7 +26,7 @@ it's directly calculated by the :class:`.trajectory.Trj.get_soap()` method. .. warning:: Please consider that the SOAP dataset can be very large, due to the high - dimensionality, thus calculations can be expensive, and saving to/loading + dimensionality, thus calculations can be expensive, and saving to/loading from file quite slow. .. testcode:: recipe1-test @@ -83,7 +83,7 @@ calculation can be sped up significantly. ) Notice that, differently from SOAP - which is computed for every frame, LENS -is computed for every pair of frames. Thus, the LENS dataset has shape +is computed for every pair of frames. Thus, the LENS dataset has shape ``(n_particles, n_frames - 1)``. Consequently, if you need to match the LENS values with the particles along the trajectory, you will need to use a sliced trajectory (removing the last frame). The easiest way to do this is: @@ -95,7 +95,7 @@ trajectory (removing the last frame). The easiest way to do this is: .. raw:: html - ⬇️ Download Python Script + ⬇️ Download Python Script .. testcode:: recipe1-test :hide: diff --git a/docs/source/entropy.rst b/docs/source/entropy.rst new file mode 100644 index 00000000..115afadb --- /dev/null +++ b/docs/source/entropy.rst @@ -0,0 +1,129 @@ +Entropy calculations +==================== + +This recipe explains how to compute Shannon entropy for different types of +datasets using the functions in the `dynsight.analysis` module. + +First of all, we import all the packages and objects we'll need: + +.. testcode:: recipe4-test + + import numpy as np + import dynsight + + rng = np.random.default_rng(42) # set the random seed + + +Entropy of a discrete variable +------------------------------ + +Let's compute the Shanon entropy of rolling a dice ``n_sample`` times, which +should be equal to log2(6) bits. + +.. testcode:: recipe4-test + + n_sample = 10000 + rolls = rng.integers(1, 7, size=n_sample) + + dice_entropy = dynsight.analysis.compute_shannon( + data=rolls.astype(float), + data_range=(1,6), + n_bins=6, + units="bit", + ) + # dice_entropy = 2.584832195231254 ~ log2(6) + + +Entropy of a discrete multivariate variable +------------------------------------------- + +Let's compute the Shanon entropy of rolling `two` dices ``n_sample`` times, +which should be equal to log2(36) bits. + +.. testcode:: recipe4-test + + n_sample = 10000 + rolls = rng.integers(1, 7, size=(n_sample, 2)) + + dices_entropy = dynsight.analysis.compute_shannon_multi( + data=rolls.astype(float), + data_ranges=[(1,6), (1,6)], + n_bins=[6, 6], + units="bit", + ) + # dices_entropy = 5.168428344754391 ~ log2(36) + + +Entropy of a continuous variable +--------------------------------- + +Shannon entropy is not univocally defined for continuous variables, but the +difference between the entropy of different distribution is. Let's compute the +difference between the Shannon entropy of two Gaussian distributions, with +standard deviations respectively equal to 1 and 2, which should be 1 bit. + +.. testcode:: recipe4-test + + n_sample = 10000000 + data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample) + data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample) + + gauss_entropy_1 = dynsight.analysis.compute_kl_entropy( + data=data_1, + units="bit", + ) + gauss_entropy_2 = dynsight.analysis.compute_kl_entropy( + data=data_2, + units="bit", + ) + diff = gauss_entropy_2 - gauss_entropy_1 + # diff = 1.0010395631476854 + + +Entropy of a continuous multivariate variable +--------------------------------------------- + +And the same is true for multivariate distributions. Let's compute the +difference between the Shannon entropy of two bivariate Gaussian +distributions, with standard deviations respectively equal to 1 and 2, +which should be 2 bits. + +.. testcode:: recipe4-test + + n_sample = 100000 + mean = [1, 1] + cov = np.array([[1, 0], [0, 1]]) + data_1 = rng.multivariate_normal( + mean=mean, + cov=cov, + size=n_sample, + ) + data_2 = rng.multivariate_normal( + mean=mean, + cov=cov * 4.0, + size=n_sample, + ) + + gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi( + data=data_1, + units="bit", + ) + gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi( + data=data_2, + units="bit", + ) + diff_2d = gauss_entropy_2 - gauss_entropy_1 + # diff_2d = 2.0142525628908743 + + +.. raw:: html + + ⬇️ Download Python Script + +.. testcode:: recipe4-test + :hide: + + assert np.isclose(dice_entropy, np.log2(6), rtol=1e-3) + assert np.isclose(dices_entropy, np.log2(36), rtol=1e-3) + assert np.isclose(diff, 1, rtol=1e-3, atol=1e-4) + assert np.isclose(diff_2d, 2, rtol=1e-2, atol=1e-2) diff --git a/docs/source/index.rst b/docs/source/index.rst index a52ae8ac..3ae663ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -34,6 +34,7 @@ Descriptors from a Trj Dimensionality reduction methods Information gain analysis + Entropy calculations .. toctree:: :hidden: @@ -81,7 +82,7 @@ How to get started ------------------ We suggest you give a read to the ``dynsight.trajectory`` module documentation, -which offers a compact and easy way of using most of the ``dynsight`` tools. +which offers a compact and easy way of using most of the ``dynsight`` tools. Also, the documentation offers some copiable Recipes and Examples for the most common analyses. diff --git a/docs/source/info_gain.rst b/docs/source/info_gain.rst index 44b1d2f2..5af73631 100644 --- a/docs/source/info_gain.rst +++ b/docs/source/info_gain.rst @@ -3,16 +3,16 @@ Information gain analysis For the theoretical aspects of this work, see https://doi.org/10.48550/arXiv.2504.12990. -This recipe explains how to compute the information gain through clustering +This recipe explains how to compute the information gain through clustering analysis. We use a synthetic dataset containing a signal that oscillates between 0 and 1, with Gaussian noise. Onion clustering is run on a broad range of time resolutions ∆t. The information gain and the Shannon entropy of -the environments is computed for each value of ∆t. The analysis is implemented +the environments is computed for each value of ∆t. The analysis is implemented using onion 2.0.0 ("onion smooth"). .. warning:: - This code works when run from the ``/docs`` directory of the ``dynsight`` + This code works when run from the ``/docs`` directory of the ``dynsight`` repo. To use it elsewhere, you have to change the ``Path`` variables accordingly. @@ -54,7 +54,7 @@ Let's start by creating a the synthetic dataset: The following function takes as input the dataset, and a list of values of time resolutions ∆t, and for each of these it performs Onion clustering, and -computes the information gain achieved through clustering with that ∆t. +computes the information gain achieved through clustering with that ∆t. .. warning:: @@ -222,7 +222,7 @@ gain goes to 0. .. raw:: html - ⬇️ Download Python Script + ⬇️ Download Python Script .. testcode:: recipe3-test :hide: diff --git a/docs/source/soap_dim_red.rst b/docs/source/soap_dim_red.rst index 59740a65..06521f45 100644 --- a/docs/source/soap_dim_red.rst +++ b/docs/source/soap_dim_red.rst @@ -1,4 +1,4 @@ -Dimensionality reduction methods +Dimensionality reduction methods ================================ This recipe explains how to compute descriptors via dimensionality reduction @@ -15,12 +15,12 @@ from dynsight.utilities. .. warning:: Please consider that the SOAP dataset can be very large, due to the high - dimensionality, thus calculations can be expensive, and saving to/loading + dimensionality, thus calculations can be expensive, and saving to/loading from file quite slow. .. warning:: - This code works when run from the ``/docs`` directory of the ``dynsight`` + This code works when run from the ``/docs`` directory of the ``dynsight`` repo. To use it elsewhere, you have to change the ``Path`` variables accordingly. @@ -189,7 +189,7 @@ parameters, and performs the TICA of the corresponding SOAP dataset. ) The output :class:`.trajectory.Insight` stores the SOAP information in its -"meta" attribute, together with the ``lag_time`` parameter and ``rel_times``, +"meta" attribute, together with the ``lag_time`` parameter and ``rel_times``, the relaxation times of the computed TICs. @@ -257,7 +257,7 @@ The output :class:`.trajectory.Insight` stores the SOAP information in its "meta" attribute, together with the ``delay`` parameter. Notice that, differently from SOAP - which is computed for every frame, tSOAP -is computed for every pair of frames. Thus, the tSOAP dataset has shape +is computed for every pair of frames. Thus, the tSOAP dataset has shape ``(n_particles, n_frames - 1)``. Consequently, if you need to match the tSOAP values with the particles along the trajectory, you will need to use a sliced trajectory (removing the last frame). The easiest way to do this is: @@ -269,7 +269,7 @@ trajectory (removing the last frame). The easiest way to do this is: .. raw:: html - ⬇️ Download Python Script + ⬇️ Download Python Script .. testcode:: recipe2-test :hide: diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 30790072..3e3d15ba 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -1,20 +1,22 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: from numpy.typing import NDArray import numpy as np import numpy.typing as npt +from scipy.spatial import cKDTree from scipy.spatial.distance import cdist -from scipy.special import digamma +from scipy.special import digamma, gamma def compute_shannon( data: NDArray[np.float64], data_range: tuple[float, float], n_bins: int, + units: Literal["bit", "nat", "frac"] = "frac", ) -> float: """Compute the Shannon entropy of a univariate data distribution. @@ -31,6 +33,11 @@ def compute_shannon( n_bins: The number of bins with which the data histogram must be computed. + units: + The units of measure of the output entropy. If "frac", entropy is + normalized between 0 and 1 by dividing by log(n_bins). If "bit", + it is computed with log base 2, if "nat" with natural log. + Returns: The value of the normalized Shannon entropy of the dataset. @@ -60,6 +67,9 @@ def compute_shannon( if data.size == 0: msg = "data is empty" raise ValueError(msg) + if units not in ("bit", "nat", "frac"): + msg = "units must be bit, nat or frac." + raise ValueError(msg) counts, _ = np.histogram( data, bins=n_bins, @@ -67,11 +77,19 @@ def compute_shannon( ) probs = counts / np.sum(counts) # Data probabilities are needed entropy = -np.sum([p * np.log2(p) for p in probs if p > 0.0]) - entropy /= np.log2(n_bins) - return entropy + + if units == "bit": + return entropy + if units == "nat": + return entropy * np.log(2) + return entropy / np.log2(n_bins) -def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float: +def compute_kl_entropy( + data: NDArray[np.float64], + n_neigh: int = 1, + units: Literal["bit", "nat"] = "bit", +) -> float: """Estimate Shannon differential entropy using Kozachenko-Leonenko. The Kozachenko-Leonenko k-nearest neighbors method approximates @@ -86,6 +104,10 @@ def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float: n_neigh: The number of neighbors considered in the KL estimator. + units: + The units of measure of the output entropy. If "bit", it is + computed with log base 2, if "nat" with natural log. + Returns: The Shannon differential entropy of the dataset, in bits. @@ -104,18 +126,28 @@ def compute_kl_entropy(data: NDArray[np.float64], n_neigh: int = 1) -> float: .. testcode:: kl-entropy-test :hide: - assert np.isclose(data_entropy, -3.3437736767342194) + assert np.isclose(data_entropy, 0.9891067080934253) """ + if units not in ("bit", "nat"): + msg = "units must be bit or nat." + raise ValueError(msg) data = np.sort(data.flatten()) n_data = len(data) eps = data[n_neigh:] - data[:-n_neigh] # n_neigh-th neighbor distances eps = np.clip(eps, 1e-10, None) # avoid log(0) - const = digamma(n_data) - digamma(n_neigh) + 1 + const = digamma(n_data) - digamma(n_neigh) + np.log(2) # 1D volume + if units == "nat": + const = digamma(n_data) - digamma(n_neigh) + np.log(2) + return const + np.mean(np.log(eps)) + const = (digamma(n_data) - digamma(n_neigh)) / np.log(2) + 1.0 return const + np.mean(np.log2(eps)) -def compute_negentropy(data: NDArray[np.float64]) -> float: +def compute_negentropy( + data: NDArray[np.float64], + units: Literal["bit", "nat"] = "bit", +) -> float: """Estimate negentropy of a dataset. Negentropy is a measure of non-Gaussianity representing the distance @@ -131,8 +163,12 @@ def compute_negentropy(data: NDArray[np.float64]) -> float: data: The dataset for which the entropy is to be computed. + units: + The units of measure of the output negentropy. If "bit", it is + computed with log base 2, if "nat" with natural log. + Returns: - The negentropy of the dataset, in bits. + The negentropy of the dataset. Example: @@ -153,13 +189,16 @@ def compute_negentropy(data: NDArray[np.float64]) -> float: assert np.isclose(negentropy, 0.2609932580146541) """ + if units not in ("bit", "nat"): + msg = "units must be bit or nat." + raise ValueError(msg) data = data.flatten() rng = np.random.default_rng(seed=1234) data_norm = (data - np.mean(data)) / np.std(data, ddof=1) sigma = np.std(data_norm, ddof=1) data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size) - h_gauss = compute_kl_entropy(data_gauss) - h_data = compute_kl_entropy(data_norm) + h_gauss = compute_kl_entropy(data_gauss, units=units) + h_data = compute_kl_entropy(data_norm, units=units) return h_gauss - h_data @@ -167,6 +206,7 @@ def compute_shannon_multi( data: NDArray[np.float64], data_ranges: list[tuple[float, float]], n_bins: list[int], + units: Literal["bit", "nat", "frac"] = "frac", ) -> float: """Compute the Shannon entropy of a multivariate data distribution. @@ -186,6 +226,11 @@ def compute_shannon_multi( A list of integers specifying the number of bins for each dimension. + units: + The units of measure of the output entropy. If "frac", entropy is + normalized between 0 and 1 by dividing by log(n_bins). If "bit", + it is computed with log base 2, if "nat" with natural log. + Returns: The value of the normalized Shannon entropy of the dataset. @@ -220,18 +265,93 @@ def compute_shannon_multi( if n_dims != len(data_ranges) or n_dims != len(n_bins): msg = "Mismatch between data dimensions, data_ranges, and n_bins" raise ValueError(msg) + if units not in ("bit", "nat", "frac"): + msg = "units must be bit, nat or frac." + raise ValueError(msg) counts, _ = np.histogramdd(data, bins=n_bins, range=data_ranges) probs = counts / np.sum(counts) # Probability distribution entropy = -np.sum(probs[probs > 0] * np.log2(probs[probs > 0])) - entropy /= np.log2(np.prod(n_bins)) # Normalization - return entropy + if units == "bit": + return entropy + if units == "nat": + return entropy * np.log(2) + return entropy / np.log2(np.prod(n_bins)) # Normalization + + +def compute_kl_entropy_multi( + data: NDArray[np.float64], + n_neigh: int = 1, + units: Literal["bit", "nat"] = "bit", +) -> float: + """Estimate Shannon differential entropy using Kozachenko-Leonenko. + + This function works for multivariate distribution. + The Kozachenko-Leonenko k-nearest neighbors method approximates + differential entropy based on distances to nearest neighbors + in the sample space. It's main advantage is being parameter-free. + + Parameters: + data: + The dataset for which the entropy is to be computed. + Shape (n_data, n_dims) + + n_neigh: + The number of neighbors considered in the KL estimator. + + units: + The units of measure of the output entropy. If "bit", it is + computed with log base 2, if "nat" with natural log. + + Returns: + The Shannon differential entropy of the dataset, in bits. + + Example: + + .. testcode:: klm-entropy-test + + import numpy as np + from dynsight.analysis import compute_kl_entropy_multi + + np.random.seed(1234) + data = np.random.rand(10000, 2) + + data_entropy = compute_kl_entropy_multi(data) + + .. testcode:: klm-entropy-test + :hide: + + assert np.isclose(data_entropy, 0.013521446183128614) + + """ + if units not in ("bit", "nat"): + msg = "units must be bit or nat." + raise ValueError(msg) + n_samples, dim = data.shape + tree = cKDTree(data) + eps, _ = tree.query(data, k=n_neigh + 1, p=2) + eps = eps[:, -1] # distance to the n_neigh-th neighbor + eps = np.clip(eps, 1e-10, None) # avoid log(0) + unit_ball_volume = (np.pi ** (dim / 2)) / gamma(dim / 2 + 1) + # --- Compute in nats --- + entropy_nats = ( + digamma(n_samples) + - digamma(n_neigh) + + np.log(unit_ball_volume) + + (dim / n_samples) * np.sum(np.log(eps)) + ) + + if units == "nat": + return entropy_nats + # bits + return entropy_nats / np.log(2) def compute_entropy_gain( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], + method: Literal["histo", "kl"] = "histo", n_bins: int = 20, ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. @@ -247,12 +367,23 @@ def compute_entropy_gain( The number of bins with which the data histogram must be computed. Default is 20. + method: + How the Shannon entropy is computed. You should use "histo" for + discrete variables, and "kl" for continuous variables. If "kl" is + chosen, the "n_bins" arg is irrelevant. See the documentation of + ``compute_shannon()`` and ``compute_kl_entropy()`` for more + details. + Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` * The Shannon entropy of the initial data :math:`H_0` * The shannon entropy of the clustered data :math:`H_{clust}` + Note: + The output are expressed as fractions if method is "histo", in bit if + method is "kl". + Example: .. testcode:: shannon2-test @@ -282,28 +413,41 @@ def compute_entropy_gain( "must have same shape[0]" ) raise RuntimeError(msg) + if method not in ("histo", "kl"): + msg = "method must be histo or kl." + raise ValueError(msg) - data_range = (float(np.min(data)), float(np.max(data))) - - # Compute the entropy of the raw data - total_entropy = compute_shannon( - data, - data_range, - n_bins, - ) - - # Compute the fraction and the entropy of the single clusters n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) - for i, label in enumerate(np.unique(labels)): - mask = labels == label - frac[i] = np.sum(mask) / labels.size - entr[i] = compute_shannon( - data[mask], + + if method == "histo": + data_range = (float(np.min(data)), float(np.max(data))) + # Compute the total entropy of the data + total_entropy = compute_shannon( + data, data_range, n_bins, ) + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_shannon( + data[mask], + data_range, + n_bins, + ) + else: # method == "kl" + # Compute the total entropy of the data + total_entropy = compute_kl_entropy(data) + + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_kl_entropy(data[mask]) + # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy @@ -320,6 +464,7 @@ def compute_entropy_gain_multi( data: npt.NDArray[np.float64], labels: npt.NDArray[np.int64], n_bins: list[int], + method: Literal["histo", "kl"] = "histo", ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. @@ -336,6 +481,13 @@ def compute_entropy_gain_multi( The number of bins with which the data histogram must be computed, one for each dimension. + method: + How the Shannon entropy is computed. You should use "histo" for + discrete variables, and "kl" for continuous variables. If "kl" is + chosen, the "n_bins" arg is irrelevant. See the documentation of + ``compute_shannon_multi()`` and ``compute_kl_entropy_multi()`` for + more details. + Returns: * The absolute information gain :math:`H_0 - H_{clust}` * The relative information gain :math:`(H_0 - H_{clust}) / H_0` @@ -372,28 +524,44 @@ def compute_entropy_gain_multi( "must have same shape[0]" ) raise RuntimeError(msg) + if method not in ("histo", "kl"): + msg = "method must be histo or kl." + raise ValueError(msg) - data_range = [(float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T] - - # Compute the entropy of the raw data - total_entropy = compute_shannon_multi( - data, - data_range, - n_bins, - ) - - # Compute the fraction and the entropy of the single clusters n_clusters = np.unique(labels).size frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) - for i, label in enumerate(np.unique(labels)): - mask = labels == label - frac[i] = np.sum(mask) / labels.size - entr[i] = compute_shannon_multi( - data[mask], + + if method == "histo": + data_range = [ + (float(np.min(tmp)), float(np.max(tmp))) for tmp in data.T + ] + + # Compute the total entropy of the data + total_entropy = compute_shannon_multi( + data, data_range, n_bins, ) + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_shannon_multi( + data[mask], + data_range, + n_bins, + ) + else: # method == "kl" + # Compute the total entropy of the data + total_entropy = compute_kl_entropy_multi(data) + + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = compute_kl_entropy_multi(data[mask]) + # Compute the entropy of the clustered data clustered_entropy = np.dot(frac, entr) info_gain = total_entropy - clustered_entropy diff --git a/src/dynsight/analysis.py b/src/dynsight/analysis.py index 3b59bcfa..4f08dc8a 100644 --- a/src/dynsight/analysis.py +++ b/src/dynsight/analysis.py @@ -4,6 +4,7 @@ compute_entropy_gain, compute_entropy_gain_multi, compute_kl_entropy, + compute_kl_entropy_multi, compute_negentropy, compute_shannon, compute_shannon_multi, @@ -22,6 +23,7 @@ "compute_entropy_gain", "compute_entropy_gain_multi", "compute_kl_entropy", + "compute_kl_entropy_multi", "compute_negentropy", "compute_rdf", "compute_shannon", diff --git a/tests/analysis/test_shannon.py b/tests/analysis/test_shannon.py index 2f90d41f..185c10de 100644 --- a/tests/analysis/test_shannon.py +++ b/tests/analysis/test_shannon.py @@ -27,6 +27,14 @@ def data_2d(rng: np.random.Generator) -> NDArray[np.float64]: return rng.random((100, 2)) +@pytest.fixture +def data_gauss(rng: np.random.Generator) -> NDArray[np.float64]: + """Random 2-Gaussians array.""" + data_1 = rng.normal(0.0, 0.1, 10000) + data_2 = rng.normal(1.0, 0.1, 10000) + return np.concatenate((data_1, data_2)) + + @pytest.fixture def labels(rng: np.random.Generator) -> NDArray[np.int64]: """Valid integer labels for 100 samples.""" @@ -89,6 +97,19 @@ def test_gain(data: NDArray[np.float64], labels: NDArray[np.int64]) -> None: assert np.isclose(gain, ref) +def test_kl_gain(data_gauss: NDArray[np.float64]) -> None: + """Check entropy gain value using KL estimator.""" + labels = np.concatenate( + (np.zeros(10000, dtype=int), np.ones(10000, dtype=int)) + ) + gain, *_ = dynsight.analysis.compute_entropy_gain( + data_gauss, + labels, + method="kl", + ) + assert np.isclose(gain, 1.0, rtol=1e-3, atol=1e-3) + + def test_gain_multi( data_2d: NDArray[np.float64], labels: NDArray[np.int64],