diff --git a/README.rst b/README.rst index d909b678..17961984 100644 --- a/README.rst +++ b/README.rst @@ -86,9 +86,7 @@ If you use ``dynsight`` please cite https://github.com/GMPavanLab/dynsight -and - - TBD + https://doi.org/10.48550/arXiv.2510.23493 * Most modules also use MDAnalysis, https://www.mdanalysis.org/pages/citations/ * If you use SOAP, please cite https://doi.org/10.1103/PhysRevB.87.184115 and DScribe https://singroup.github.io/dscribe/latest/citing.html @@ -98,6 +96,7 @@ and * If you use tICA, please cite ``deeptime`` https://deeptime-ml.github.io/latest/index.html * If you use ``dynsight.vision``, please cite Ultralytics YOLO https://docs.ultralytics.com/it/models/yolo11/#usage-examples * If you use ``dynsight.track``, please cite Trackpy https://soft-matter.github.io/trackpy/dev/introduction.html +* Entropy calculations are based on ``infomeasure`` https://doi.org/10.1038/s41598-025-14053-5 Acknowledgements diff --git a/docs/source/_static/Example_0_1D.png b/docs/source/_static/Example_0_1D.png new file mode 100644 index 00000000..781fde93 Binary files /dev/null and b/docs/source/_static/Example_0_1D.png differ diff --git a/docs/source/_static/Example_0_2D.png b/docs/source/_static/Example_0_2D.png new file mode 100644 index 00000000..06e20695 Binary files /dev/null and b/docs/source/_static/Example_0_2D.png differ diff --git a/docs/source/_static/Example_1_1D.png b/docs/source/_static/Example_1_1D.png new file mode 100644 index 00000000..82704f91 Binary files /dev/null and b/docs/source/_static/Example_1_1D.png differ diff --git a/docs/source/_static/Example_1_2D.png b/docs/source/_static/Example_1_2D.png new file mode 100644 index 00000000..94c00e4c Binary files /dev/null and b/docs/source/_static/Example_1_2D.png differ diff --git a/docs/source/_static/Information_gains.png b/docs/source/_static/Information_gains.png index 42bea1e4..d8c08162 100644 Binary files a/docs/source/_static/Information_gains.png and b/docs/source/_static/Information_gains.png differ diff --git a/docs/source/_static/info_gain/trj_2.npy b/docs/source/_static/info_gain/trj_2.npy new file mode 100644 index 00000000..7952b4bb Binary files /dev/null and b/docs/source/_static/info_gain/trj_2.npy differ diff --git a/docs/source/_static/info_gain/trj_4.npy b/docs/source/_static/info_gain/trj_4.npy new file mode 100644 index 00000000..13a9d4dc Binary files /dev/null and b/docs/source/_static/info_gain/trj_4.npy differ diff --git a/docs/source/_static/info_gain_clusters_1d.png b/docs/source/_static/info_gain_clusters_1d.png deleted file mode 100644 index 52ed66b8..00000000 Binary files a/docs/source/_static/info_gain_clusters_1d.png and /dev/null differ diff --git a/docs/source/_static/info_gain_clusters_1d_y.png b/docs/source/_static/info_gain_clusters_1d_y.png deleted file mode 100644 index 47508f0c..00000000 Binary files a/docs/source/_static/info_gain_clusters_1d_y.png and /dev/null differ diff --git a/docs/source/_static/info_gain_clusters_2d.png b/docs/source/_static/info_gain_clusters_2d.png deleted file mode 100644 index 3c6e93ad..00000000 Binary files a/docs/source/_static/info_gain_clusters_2d.png and /dev/null differ diff --git a/docs/source/_static/info_gain_clusters_2d_y.png b/docs/source/_static/info_gain_clusters_2d_y.png deleted file mode 100644 index 0a108eaf..00000000 Binary files a/docs/source/_static/info_gain_clusters_2d_y.png and /dev/null differ diff --git a/docs/source/_static/info_plot.png b/docs/source/_static/info_plot.png index d21b3840..5a8c2a4c 100644 Binary files 
a/docs/source/_static/info_plot.png and b/docs/source/_static/info_plot.png differ diff --git a/docs/source/_static/recipes/descr_from_trj.py b/docs/source/_static/recipes/descr_from_trj.py index 01cba368..bdcf8411 100644 --- a/docs/source/_static/recipes/descr_from_trj.py +++ b/docs/source/_static/recipes/descr_from_trj.py @@ -21,7 +21,7 @@ def main() -> None: selection="all", # compute on a selection of particles centers="all", # compute for a selection of centers respect_pbc=False, # consider PBC - n_core=1, # use multiprocessing to speed up calculations + n_jobs=1, # use multiprocessing to speed up calculations ) # Loading an example trajectory diff --git a/docs/source/analysis.rst b/docs/source/analysis.rst index 7b7a4589..0ff1192b 100644 --- a/docs/source/analysis.rst +++ b/docs/source/analysis.rst @@ -10,17 +10,25 @@ Entropy The ``analysis`` module offers a variety of functions for entropy- and information-based calculations. +.. toctree:: + :maxdepth: 1 + + shannon <_autosummary/dynsight.analysis.shannon> + info_gain <_autosummary/dynsight.analysis.info_gain> + compute_negentropy <_autosummary/dynsight.analysis.compute_negentropy> + sample_entropy <_autosummary/dynsight.analysis.sample_entropy> + +The following functions are deprecated, the previous ones should be preferred. + .. toctree:: :maxdepth: 1 compute_shannon <_autosummary/dynsight.analysis.compute_shannon> compute_kl_entropy <_autosummary/dynsight.analysis.compute_kl_entropy> - compute_negentropy <_autosummary/dynsight.analysis.compute_negentropy> compute_entropy_gain <_autosummary/dynsight.analysis.compute_entropy_gain> compute_shannon_multi <_autosummary/dynsight.analysis.compute_shannon_multi> compute_kl_entropy_multi <_autosummary/dynsight.analysis.compute_kl_entropy_multi> compute_entropy_gain_multi <_autosummary/dynsight.analysis.compute_entropy_gain_multi> - sample_entropy <_autosummary/dynsight.analysis.sample_entropy> Other functions --------------- diff --git a/docs/source/example_info_gain.rst b/docs/source/example_info_gain.rst index 884f9c6d..519048be 100644 --- a/docs/source/example_info_gain.rst +++ b/docs/source/example_info_gain.rst @@ -11,11 +11,13 @@ To start, let's import the packages we will need and create a folder in the cwd from pathlib import Path from typing import Callable + from tqdm import tqdm import numpy as np import matplotlib.pyplot as plt + import dynsight cwd = Path.cwd() - folder_name = "info_gain" + folder_name = "source/_static/info_gain" folder_path = cwd / folder_name if not folder_path.exists(): folder_path.mkdir() @@ -95,12 +97,8 @@ This function simulates, for both energy landscapes, the dynamics of 100 particl trajectory[t, i] = particles[i] - plt.figure() - plt.plot(trajectory[:, :, 0], trajectory[:, :, 1]) - plt.show() - dataset = np.transpose(trajectory, (1, 0, 2)) - np.save(filename, dataset) + np.save(file_name, dataset) return dataset @@ -144,59 +142,54 @@ To check if the clustering is working in a meaningful way, we also plot the resu y_positions = dataset[:, :, 1] info_gain_y = np.zeros(delta_t_list.size) - for j, delta_t in enumerate(delta_t_list): - reshaped_data = dynsight.onion.helpers.reshape_from_nt( - y_positions, delta_t + for j, delta_t in enumerate(tqdm(delta_t_list)): + state_list, labels = dynsight.onion.onion_uni_smooth( + y_positions, + delta_t=delta_t, ) - state_list, labels = dynsight.onion.onion_uni(reshaped_data) if j == example_delta_t: - dynsight.onion.plot.plot_output_uni( - f"Example_{i}_1D.png", - reshaped_data, - n_atoms, + 
dynsight.onion.plot_smooth.plot_output_uni( + f"source/_static/Example_{i}_1D.png", + y_positions, state_list, ) # and compute the information gain: - info_gain_y[j], *_ = dynsight.analysis.compute_entropy_gain( - reshaped_data, labels, n_bins=40 - ) - results.append(info_gain_y) + info_gain_y[j], *_ = dynsight.analysis.info_gain( + data=y_positions.ravel(), + labels=labels.ravel(), + method="kl", + ) # Results are in bit + results[i * 2] = info_gain_y # Or we can do clustering using both (x, y) variables: info_gain_xy = np.zeros(delta_t_list.size) - tmp1_dataset = np.transpose(dataset, (2, 0, 1)) - for j, delta_t in enumerate(delta_t_list): - reshaped_data = dynsight.onion.helpers.reshape_from_dnt( - tmp1_dataset, delta_t + for j, delta_t in enumerate(tqdm(delta_t_list)): + state_list, labels = dynsight.onion.onion_multi_smooth( + dataset, + delta_t=delta_t, ) - state_list, labels = dynsight.onion.onion_multi(reshaped_data) if j == example_delta_t: - dynsight.onion.plot.plot_output_multi( - f"Example_{i}_2D.png", - tmp1_dataset, + dynsight.onion.plot_smooth.plot_output_multi( + f"source/_static/Example_{i}_2D.png", + dataset, state_list, labels, - delta_t, ) # and compute the information gain: # We need an array (n_samples, n_dims), and labels (n_samples,) - n_sequences = int(n_frames / delta_t) - long_labels = np.repeat(labels, delta_t) - tmp = dataset[:, : n_sequences * delta_t, :] - ds_reshaped = tmp.reshape((-1, n_dims)) + reshaped_data = dataset.reshape((-1, 2)) + reshaped_labels = labels.reshape((-1,)) - info_gain_xy[j], *_ = dynsight.analysis.compute_entropy_gain_multi( - ds_reshaped, long_labels, n_bins=[40, 40] - ) - # Need to multiply by two because it's 2 dimensional, and the output - # of the info_gain functions is normalized by the log volume of the - # phase space, which is 2D is doubled - info_gain_xy *= 2 - results.append(info_gain_xy) + info_gain_xy[j], *_ = dynsight.analysis.info_gain( + reshaped_data, + reshaped_labels, + method="kl", + ) # Results are in bit + results[i * 2 + 1] = info_gain_xy Here are the plots of the two datasets, with the different clusters identified when clustering the full, bi-dimensional data, using ∆t = 4 frames: @@ -205,8 +198,8 @@ Here are the plots of the two datasets, with the different clusters identified w :widths: auto :align: center - * - .. image:: _static/info_gain_clusters_1d.png - - .. image:: _static/info_gain_clusters_2d.png + * - .. image:: _static/Example_0_2D.png + - .. image:: _static/Example_1_2D.png As can be seen, all the clusters are correctly identified at this time resolution ∆t. When we are using only the y-coordinate instead, as expected in both cases just two clusters can be identified (the two plots look the same but they are actually from the two different systems): @@ -215,8 +208,8 @@ As can be seen, all the clusters are correctly identified at this time resolutio :widths: auto :align: center - * - .. image:: _static/info_gain_clusters_1d_y.png - - .. image:: _static/info_gain_clusters_2d_y.png + * - .. image:: _static/Example_0_1D.png + - .. image:: _static/Example_1_1D.png We can now plot, for every case and for every choice of ∆t, the corresponding information gain. 
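For reference, a minimal, self-contained sketch of the ``dynsight.analysis.info_gain`` call pattern used above, on synthetic Gaussian clusters rather than the recipe's trajectories (the cluster parameters and seed are illustrative only):

    import numpy as np

    import dynsight

    rng = np.random.default_rng(0)

    # Two well-separated 2D Gaussian clusters, stacked as (n_samples, n_dims).
    data = np.concatenate(
        [
            rng.normal(loc=0.0, scale=0.5, size=(1000, 2)),
            rng.normal(loc=5.0, scale=0.5, size=(1000, 2)),
        ]
    )
    labels = np.concatenate([np.zeros(1000, dtype=int), np.ones(1000, dtype=int)])

    # Returns the absolute gain, the relative gain, H_0 and H_clust (in bits here).
    delta_h, rel_gain, h_0, h_clust = dynsight.analysis.info_gain(
        data=data,
        labels=labels,
        method="kl",
    )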
@@ -246,7 +239,7 @@ We can now plot, for every case and for every choice of ∆t, the corresponding ax.set_ylabel(r"Information gain $\Delta H$ [bit]") ax.set_xscale("log") ax.legend() - plt.show() + fig.savefig("source/_static/Information_gains.png", dpi=600) As can be seen in the plot below, clustering both datasets using only the y coordinate gives the same information gain, because only two clusters can be distinguished. diff --git a/docs/source/index.rst b/docs/source/index.rst index 3ae663ba..f3ebab48 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,10 +31,10 @@ :maxdepth: 2 :caption: Recipes: - Descriptors from a Trj - Dimensionality reduction methods - Information gain analysis - Entropy calculations + Descriptors from a Trj + Dimensionality reduction methods + Entropy calculations + Information gain analysis .. toctree:: :hidden: @@ -118,8 +118,8 @@ Examples There are examples throughout the documentation and available in the ``examples/`` directory of this repository. -There are also examples available in the ``cpctools`` repository -`here ` +There are also examples available in the ``cpctools`` repository here: +https://github.com/GMPavanLab/cpctools/tree/main/Examples How To Cite @@ -129,9 +129,7 @@ If you use ``dynsight`` please cite https://github.com/GMPavanLab/dynsight -and - - TBD + https://doi.org/10.48550/arXiv.2510.23493 * Most modules also use MDAnalysis, https://www.mdanalysis.org/pages/citations/ * If you use SOAP, please cite https://doi.org/10.1103/PhysRevB.87.184115 and DScribe https://singroup.github.io/dscribe/latest/citing.html diff --git a/docs/source/descr_from_trj.rst b/docs/source/recipe_descr_from_trj.rst similarity index 98% rename from docs/source/descr_from_trj.rst rename to docs/source/recipe_descr_from_trj.rst index 213056dc..4c8e4529 100644 --- a/docs/source/descr_from_trj.rst +++ b/docs/source/recipe_descr_from_trj.rst @@ -45,7 +45,7 @@ it's directly calculated by the :class:`.trajectory.Trj.get_soap()` method. selection="all", # compute on a selection of particles centers="all", # compute for a selection of centers respect_pbc=False, # consider PBC - n_core=1, # use multiprocessing to speed up calculations + n_jobs=1, # use multiprocessing to speed up calculations ) Number of neighbors and LENS diff --git a/docs/source/entropy.rst b/docs/source/recipe_entropy.rst similarity index 68% rename from docs/source/entropy.rst rename to docs/source/recipe_entropy.rst index 115afadb..b089b0e2 100644 --- a/docs/source/entropy.rst +++ b/docs/source/recipe_entropy.rst @@ -2,7 +2,7 @@ Entropy calculations ==================== This recipe explains how to compute Shannon entropy for different types of -datasets using the functions in the `dynsight.analysis` module. +datasets using the functions in the ``dynsight.analysis`` module. First of all, we import all the packages and objects we'll need: @@ -25,11 +25,10 @@ should be equal to log2(6) bits. n_sample = 10000 rolls = rng.integers(1, 7, size=n_sample) - dice_entropy = dynsight.analysis.compute_shannon( - data=rolls.astype(float), - data_range=(1,6), - n_bins=6, - units="bit", + dice_entropy = dynsight.analysis.shannon( + data=rolls, + method="histo", + base=2, ) # dice_entropy = 2.584832195231254 ~ log2(6) @@ -37,7 +36,7 @@ should be equal to log2(6) bits. 
Entropy of a discrete multivariate variable ------------------------------------------- -Let's compute the Shanon entropy of rolling `two` dices ``n_sample`` times, +Let's compute the Shannon entropy of rolling two dice ``n_sample`` times, which should be equal to log2(36) bits. .. testcode:: recipe4-test n_sample = 10000 rolls = rng.integers(1, 7, size=(n_sample, 2)) - dices_entropy = dynsight.analysis.compute_shannon_multi( - data=rolls.astype(float), - data_ranges=[(1,6), (1,6)], - n_bins=[6, 6], - units="bit", + dices_entropy = dynsight.analysis.shannon( + data=rolls, + method="histo", + base=2, ) # dices_entropy = 5.168428344754391 ~ log2(36) @@ -58,26 +56,30 @@ Entropy of a continuous variable --------------------------------- Shannon entropy is not univocally defined for continuous variables, but the -difference between the entropy of different distribution is. Let's compute the -difference between the Shannon entropy of two Gaussian distributions, with -standard deviations respectively equal to 1 and 2, which should be 1 bit. +difference between the entropy of different distributions is. +For continuous variables, we need to use the Kozachenko-Leonenko (KL) +estimator, passing the argument ``method="kl"``. +Let's compute the difference between the Shannon entropy of two Gaussian distributions, with standard deviations respectively equal to 1 and 2, which +should be 1 bit. .. testcode:: recipe4-test - n_sample = 10000000 + n_sample = 100000 data_1 = rng.normal(loc=0.0, scale=1.0, size=n_sample) data_2 = rng.normal(loc=0.0, scale=2.0, size=n_sample) - gauss_entropy_1 = dynsight.analysis.compute_kl_entropy( + gauss_entropy_1 = dynsight.analysis.shannon( data=data_1, - units="bit", + method="kl", + base=2, ) - gauss_entropy_2 = dynsight.analysis.compute_kl_entropy( + gauss_entropy_2 = dynsight.analysis.shannon( data=data_2, - units="bit", + method="kl", + base=2, ) diff = gauss_entropy_2 - gauss_entropy_1 - # diff = 1.0010395631476854 + # diff = 0.9994806386420283 Entropy of a continuous multivariate variable @@ -104,16 +106,18 @@ which should be 2 bits. size=n_sample, ) - gauss_entropy_1 = dynsight.analysis.compute_kl_entropy_multi( + gauss_entropy_1 = dynsight.analysis.shannon( data=data_1, - units="bit", + method="kl", + base=2, ) - gauss_entropy_2 = dynsight.analysis.compute_kl_entropy_multi( + gauss_entropy_2 = dynsight.analysis.shannon( data=data_2, - units="bit", + method="kl", + base=2, ) diff_2d = gauss_entropy_2 - gauss_entropy_1 - # diff_2d = 2.0142525628908743 + # diff_2d = 2.0101274002195764 .. raw:: html diff --git a/docs/source/info_gain.rst b/docs/source/recipe_info_gain.rst similarity index 75% rename from docs/source/info_gain.rst rename to docs/source/recipe_info_gain.rst index 5af73631..539aa77c 100644 --- a/docs/source/info_gain.rst +++ b/docs/source/recipe_info_gain.rst @@ -5,25 +5,19 @@ For the theoretical aspects of this work, see https://doi.org/10.48550/arXiv.250 This recipe explains how to compute the information gain through clustering analysis. We use a synthetic dataset containing a signal that oscillates -between 0 and 1, with Gaussian noise. Onion clustering is run on a broad +between 0 and 10, with Gaussian noise. Onion clustering is run on a broad range of time resolutions ∆t. The information gain and the Shannon entropy of the environments is computed for each value of ∆t. The analysis is implemented using onion 2.0.0 ("onion smooth"). -..
warning:: - - This code works when run from the ``/docs`` directory of the ``dynsight`` - repo. To use it elsewhere, you have to change the ``Path`` variables - accordingly. - First of all, we import all the packages and objects we'll need: .. testcode:: recipe3-test from pathlib import Path + import numpy as np import dynsight - from dynsight.trajectory import Insight import matplotlib.pyplot as plt @@ -37,13 +31,13 @@ Let's start by creating a the synthetic dataset: n_atoms = 10 num_blocks = 10 block_size = 100 - sigma = 0.1 + sigma = 1.0 # Generate the array tmp_data = [ np.concatenate( [ - rng.normal(loc=(i % 2), scale=sigma, size=block_size) + rng.normal(loc=10 * (i % 2), scale=sigma, size=block_size) for i in range(num_blocks) ] ) @@ -75,14 +69,11 @@ Additionally, the (float) dataset Shannon entropy is returned. def info_gain_with_onion( delta_t_list: np.ndarray | list[int], data: np.array, - n_bins: int = 40, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, float]: """Performs full information gain analysis with Onion clustering.""" - data_range = (np.min(data), np.max(data)) - n_clusters = np.zeros(delta_t_list.size) clusters_frac = [] - info_gain = np.zeros(delta_t_list.size) + delta_h = np.zeros(delta_t_list.size) clusters_entr = [] for i, delta_t in enumerate(delta_t_list): @@ -96,10 +87,12 @@ Additionally, the (float) dataset Shannon entropy is returned. tmp_frac[0] = 1.0 - np.sum(tmp_frac) clusters_frac.append(tmp_frac) - flat_data = data.flatten() - flat_labels = labels.flatten() - info_gain[i], _, h_0, _ = dynsight.analysis.compute_entropy_gain( - flat_data, flat_labels, n_bins=n_bins, + flat_data = data.ravel() + flat_labels = labels.ravel() + delta_h[i], _, h_0, _ = dynsight.analysis.info_gain( + flat_data, + flat_labels, + method="kl", ) tmp_entr = [] @@ -111,11 +104,7 @@ Additionally, the (float) dataset Shannon entropy is returned. mask = labels == lab selected_points = data[mask] tmp_entr.append( - dynsight.analysis.compute_shannon( - selected_points, - data_range, - n_bins=n_bins, - ) + dynsight.analysis.shannon(selected_points, method="kl") ) clusters_entr.append(tmp_entr) @@ -128,13 +117,13 @@ Additionally, the (float) dataset Shannon entropy is returned. cl_frac = np.array(clusters_frac) cl_entr = np.array(clusters_entr) - return n_clusters, cl_frac, info_gain, cl_entr, h_0 + return n_clusters, cl_frac, delta_h, cl_entr, h_0 # Example usage _, n_frames = data.shape delta_t_list = np.unique(np.geomspace(2, n_frames, 10, dtype=int)) - n_cl, cl_frac, info_gain, cl_entr, h_0 = info_gain_with_onion( + n_cl, cl_frac, delta_h, cl_entr, h_0 = info_gain_with_onion( delta_t_list, data, ) @@ -166,31 +155,36 @@ values. 
fig, ax = plt.subplots() - i_0 = (1 - h_0) * np.ones(len(delta_t_list)) - ax.plot(delta_t_list, i_0, ls="--", c="black", marker="") # I_0 + h_0_array = np.ones(len(delta_t_list)) * h_0 + ax.plot(delta_t_list, h_0_array, ls="--", c="black", marker="") # H_0 ax.fill_between( delta_t_list, - 1, - 1 - s_cumul[0], + 0.0, + s_cumul[0], alpha=0.5, ) for i, tmp_s in enumerate(s_cumul[1:]): ax.fill_between( delta_t_list, - 1 - s_cumul[i], - 1 - tmp_s, + s_cumul[i], + tmp_s, alpha=0.5, ) ax.fill_between( - delta_t_list, 1 - s_cumul[-1], 1 - h_0, color="gainsboro", + delta_t_list, + s_cumul[-1], + h_0_array, + color="gainsboro", ) ax.plot( - delta_t_list, 1 - s_cumul[-1], c="black", marker="", + delta_t_list, + s_cumul[-1], + c="black", + marker="", ) # I_clust - ax.set_ylim(0.0, 1.0) ax.set_xlabel(r"Time resolution $\Delta t$") - ax.set_ylabel(r"Information $I$") + ax.set_ylabel(r"Entropy $H$") ax.set_xscale("log") fig.savefig(file_path, dpi=600) @@ -207,13 +201,13 @@ values. The figure obtained (see below) shows, for each value of ∆t: -* The initial information (1 - H) of the entire dataset: dashed line; -* The information after clustering: solid line; +* The initial entropy H of the entire dataset: dashed line; +* The entropy after clustering: solid line; * The information gained through clustering ∆I: gray area; * The Shannon entropy of each of the discovered clusters: colored bands. -In this case, 2 states are correctly identified for ∆t <= 100 (green and orange), -with an information gain of around 0.2. +In this case, 2 states are correctly identified for ∆t < 100 (green and orange), +with an information gain of around 1 bit. For ∆t > 100 all the data points remain unclassified (blue), and the information gain goes to 0. @@ -227,4 +221,4 @@ gain goes to 0. .. 
testcode:: recipe3-test :hide: - assert np.isclose(info_gain[0], 0.19043795503255656) + assert np.isclose(delta_h[0], 1.0141, atol=1e-3) diff --git a/docs/source/soap_dim_red.rst b/docs/source/recipe_soap_dim_red.rst similarity index 100% rename from docs/source/soap_dim_red.rst rename to docs/source/recipe_soap_dim_red.rst diff --git a/examples/analysis_workflow.py b/examples/analysis_workflow.py index 25725d6a..cc8cbc9a 100644 --- a/examples/analysis_workflow.py +++ b/examples/analysis_workflow.py @@ -26,7 +26,7 @@ def main() -> None: # We can do spatial average on the computed LENS trj_lens = trj.with_slice(slice(0, -1, 1)) - lens_aver = lens.spatial_average(trj=trj_lens, r_cut=7.5, num_processes=6) + lens_aver = lens.spatial_average(trj=trj_lens, r_cut=7.5, n_jobs=6) # And we can perform onion-clustering lens_onion = lens_aver.get_onion_smooth(delta_t=10) diff --git a/examples/lens.py b/examples/lens.py index 25fc2004..27f75776 100644 --- a/examples/lens.py +++ b/examples/lens.py @@ -63,8 +63,8 @@ def main() -> None: logger.info(natoms) neigcounts = dynsight.lens.list_neighbours_along_trajectory( - input_universe=universe, - cutoff=cutoff, + universe=universe, + r_cut=cutoff, ) lens, nn, *_ = dynsight.lens.neighbour_change_in_time(neigcounts) diff --git a/info_gain/trj_2.npy b/info_gain/trj_2.npy new file mode 100644 index 00000000..7952b4bb Binary files /dev/null and b/info_gain/trj_2.npy differ diff --git a/info_gain/trj_4.npy b/info_gain/trj_4.npy new file mode 100644 index 00000000..13a9d4dc Binary files /dev/null and b/info_gain/trj_4.npy differ diff --git a/pyproject.toml b/pyproject.toml index f5d2ee5d..9a3b8ff4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ maintainers = [ ] dependencies = [ - "numpy", + "numpy<2", "dscribe", "tropea-clustering", "MDAnalysis", @@ -19,6 +19,7 @@ dependencies = [ "trackpy", "ultralytics", "deeptime", + "infomeasure", ] # Set by cpctools. @@ -131,5 +132,6 @@ module = [ 'trackpy.*', 'deeptime.*', 'sklearn.decomposition.*', + 'infomeasure.*', ] ignore_missing_imports = true diff --git a/src/dynsight/_internal/analysis/entropy.py b/src/dynsight/_internal/analysis/entropy.py index 3e3d15ba..25b03382 100644 --- a/src/dynsight/_internal/analysis/entropy.py +++ b/src/dynsight/_internal/analysis/entropy.py @@ -5,6 +5,7 @@ if TYPE_CHECKING: from numpy.typing import NDArray +import infomeasure as im import numpy as np import numpy.typing as npt from scipy.spatial import cKDTree @@ -12,6 +13,268 @@ from scipy.special import digamma, gamma +def shannon( + data: NDArray[np.float64], + method: Literal["histo", "kl"], + base: float = 2.0, + n_neigh: int = 4, +) -> float: + """Compute the Shannon entropy of a data distribution. + + Parameters: + data: + The data for which the entropy is to be computed. Has shape + (n_samples, n_features). + + method: + How the Shannon entropy is computed. You should use "histo" for + discrete variables, and "kl" for continuous variables. If "histo" + is chosen, the "n_neigh" arg is irrelevant. See the documentation + of the infomeasure package for more details (link in the notes + below). + + base: + The units of measure of the returned value. Use "2" for bits, + "np.e" for nats. + + n_neigh: + The number of neighbors considered in the KL estimator. The + default value n_neigh = 4 is recommended in the literature. + + Returns: + The value of the Shannon entropy of the data. + + Notes: + This function uses the ``infomeasure.entropy()`` function, see + https://infomeasure.readthedocs.io/en/latest/guide/entropy/. 
+ + Example: + + .. testcode:: shannon-test + + import numpy as np + from dynsight.analysis import shannon + rng = np.random.default_rng(seed=42) + + ### Discrete case: fair coin. H = 1 bit. ### + int_data = rng.integers(low=0, high=2, size=100000) + h_int = shannon(data=int_data, method="histo") + + ### Bivariate case: 2 fair coins. H = 2 bit. ### + int_data = rng.integers(low=0, high=2, size=(100000, 2)) + h_int_2 = shannon(data=int_data, method="histo") + + ### Continuous case: uniform distribution in [0, 10]. ### + float_data = rng.random(200000) * 10 + h_float = shannon(data=float_data, method="kl") + + .. testcode:: shannon-test + :hide: + + assert np.isclose(h_int, 1.0, atol=1e-3) + assert np.isclose(h_int_2, 2.0, atol=1e-3) + assert np.isclose(h_float, np.log2(10), atol=5e-3) + """ + if method not in ("histo", "kl"): + msg = "method must be 'histo' or 'kl'." + raise ValueError(msg) + if method == "histo": + return im.entropy(data, approach="discrete", base=base) + # If instead method == "kl": + min_samples = 2 + if data.shape[0] <= min_samples: + return 0.0 # can't compute KL entropy with only 2 samples + return im.entropy(data, approach="metric", k=n_neigh, base=base) + + +def compute_negentropy( + data: NDArray[np.float64], + units: Literal["bit", "nat"] = "bit", +) -> float: + """Estimate negentropy of a dataset. + + Negentropy is a measure of non-Gaussianity representing the distance + from a Gaussian distribution; it's used to quantify the amount of + information in a signal, the Gaussian being the less informative + distribution for a given variance. + + .. math:: + + Neg(X) = H(X_{Gauss}) - H(X) + + Parameters: + data: + The dataset for which the entropy is to be computed. + + units: + The units of measure of the output negentropy. If "bit", it is + computed with log base 2, if "nat" with natural log. + + Returns: + The negentropy of the dataset. + + + Example: + + .. testcode:: negentropy-test + + import numpy as np + from dynsight.analysis import compute_negentropy + + np.random.seed(1234) + data = np.random.rand(10000) + + negentropy = compute_negentropy(data) + + .. testcode:: negentropy-test + :hide: + + assert np.isclose(negentropy, 0.268, rtol=5e-3) + + """ + if units not in ("bit", "nat"): + msg = "units must be bit or nat." + raise ValueError(msg) + base = 2 if units == "bit" else np.e + data = data.flatten() + rng = np.random.default_rng(seed=1234) + data_norm = (data - np.mean(data)) / np.std(data, ddof=1) + sigma = np.std(data_norm, ddof=1) + data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size) + h_gauss = shannon(data_gauss, method="kl", base=base) + h_data = shannon(data_norm, method="kl", base=base) + return h_gauss - h_data + + +def info_gain( + data: npt.NDArray[np.float64], + labels: npt.NDArray[np.int64], + method: Literal["histo", "kl"], + base: float = 2.0, + n_neigh: int = 4, +) -> tuple[float, float, float, float]: + """Compute the information gained by the clustering. + + Parameters: + data: + The dataset over which the clustering is performed. Has shape + (n_samples, n_features). + + labels: + The clustering labels. Has shape (n_samples,). + + method: + How the Shannon entropy is computed. You should use "histo" for + discrete variables, and "kl" for continuous variables. If "histo" + is chosen, the "n_neigh" arg is irrelevant. See the documentation + of the infomeasure package for more details (link in the notes + below). + + base: + The units of measure of the returned value. Use "2" for bits, + "np.e" for nats. 
+ + n_neigh: + The number of neighbors considered in the KL estimator. The + default value n_neigh = 4 is recommended in the literature. + + Returns: + * The absolute information gain :math:`H_0 - H_{clust}` + * The relative information gain :math:`(H_0 - H_{clust}) / H_0` + * The Shannon entropy of the initial data :math:`H_0` + * The shannon entropy of the clustered data :math:`H_{clust}` + + Notes: + This function uses the ``infomeasure.entropy()`` function, see + https://infomeasure.readthedocs.io/en/latest/guide/entropy/. + + Example: + + .. testcode:: info-gain-test + + import numpy as np + from dynsight.analysis import info_gain + rng = np.random.default_rng(seed=42) + + ### Descrete case ### + int_data = rng.integers(low=0, high=4, size=100000) + labels = (int_data < 2).astype(int) + delta_i, *_ = info_gain( + data=int_data, + labels=labels, + method="histo", + ) + + .. testcode:: info-gain-test + :hide: + + assert np.isclose(delta_i, 1.0) + + .. testcode:: info-gain-test + + ### Continuous case ### + float_data = rng.random(200000) + labels = (float_data < 0.5).astype(int) + delta_i, *_ = info_gain( + data=float_data, + labels=labels, + method="kl", + ) + + .. testcode:: info-gain-test + :hide: + + assert np.isclose(delta_i, 1.0) + """ + if data.shape[0] != labels.shape[0]: + msg = ( + f"data ({data.shape}) and labels ({labels.shape}) " + "must have same shape[0]" + ) + raise RuntimeError(msg) + if method not in ("histo", "kl"): + msg = "method must be 'histo' or 'kl'." + raise ValueError(msg) + + n_clusters = np.unique(labels).size + frac, entr = np.zeros(n_clusters), np.zeros(n_clusters) + + if method == "histo": + # Compute the total entropy of the data + total_entropy = shannon(data, method="histo", base=base) + + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = shannon(data[mask], method="histo", base=base) + else: # method == "kl" + # Compute the total entropy of the data + total_entropy = shannon(data, method="kl", base=base, n_neigh=n_neigh) + + # Compute the fraction and the entropy of the single clusters + for i, label in enumerate(np.unique(labels)): + mask = labels == label + frac[i] = np.sum(mask) / labels.size + entr[i] = shannon( + data[mask], + method="kl", + base=base, + n_neigh=n_neigh, + ) + + # Compute the entropy of the clustered data + clustered_entropy = np.dot(frac, entr) + info_gain = total_entropy - clustered_entropy + + return ( + info_gain, + info_gain / total_entropy, + total_entropy, + clustered_entropy, + ) + + def compute_shannon( data: NDArray[np.float64], data_range: tuple[float, float], @@ -22,6 +285,10 @@ def compute_shannon( It is normalized so that a uniform distribution has unitary entropy. + .. deprecated:: v2025.08.27 + This function is deprecated and will be removed after June 2026. + Use ``analysis.shannon()`` instead. + Parameters: data: The dataset for which the entropy is to be computed. @@ -96,6 +363,10 @@ def compute_kl_entropy( differential entropy based on distances to nearest neighbors in the sample space. It's main advantage is being parameter-free. + .. deprecated:: v2025.08.27 + This function is deprecated and will be removed after June 2026. + Use ``analysis.shannon()`` instead. + Parameters: data: The dataset for which the entropy is to be computed. 
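For reference, a sketch of migrating from the deprecated ``compute_shannon()`` to ``shannon()``, based on the dice example in the entropy recipe; note that the two calls are not numerically interchangeable, since the deprecated function is normalized so that a uniform distribution has unitary entropy:

    import numpy as np

    import dynsight

    rng = np.random.default_rng(42)
    rolls = rng.integers(1, 7, size=10000)  # fair six-sided die

    # Deprecated call (histogram entropy, normalized so that a uniform
    # distribution has unitary entropy).
    h_old = dynsight.analysis.compute_shannon(
        data=rolls.astype(float),
        data_range=(1, 6),
        n_bins=6,
        units="bit",
    )

    # Replacement: entropy in the chosen base, so a fair die gives ~log2(6) bits.
    h_new = dynsight.analysis.shannon(data=rolls, method="histo", base=2)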
@@ -144,64 +415,6 @@ def compute_kl_entropy( return const + np.mean(np.log2(eps)) -def compute_negentropy( - data: NDArray[np.float64], - units: Literal["bit", "nat"] = "bit", -) -> float: - """Estimate negentropy of a dataset. - - Negentropy is a measure of non-Gaussianity representing the distance - from a Gaussian distribution; it's used to quantify the amount of - information in a signal, the Gaussian being the less informative - distribution for a given variance. - - .. math:: - - Neg(X) = H(X_{Gauss}) - H(X) - - Parameters: - data: - The dataset for which the entropy is to be computed. - - units: - The units of measure of the output negentropy. If "bit", it is - computed with log base 2, if "nat" with natural log. - - Returns: - The negentropy of the dataset. - - - Example: - - .. testcode:: negentropy-test - - import numpy as np - from dynsight.analysis import compute_negentropy - - np.random.seed(1234) - data = np.random.rand(10000) - - negentropy = compute_negentropy(data) - - .. testcode:: negentropy-test - :hide: - - assert np.isclose(negentropy, 0.2609932580146541) - - """ - if units not in ("bit", "nat"): - msg = "units must be bit or nat." - raise ValueError(msg) - data = data.flatten() - rng = np.random.default_rng(seed=1234) - data_norm = (data - np.mean(data)) / np.std(data, ddof=1) - sigma = np.std(data_norm, ddof=1) - data_gauss = rng.normal(loc=0.0, scale=sigma, size=data.size) - h_gauss = compute_kl_entropy(data_gauss, units=units) - h_data = compute_kl_entropy(data_norm, units=units) - return h_gauss - h_data - - def compute_shannon_multi( data: NDArray[np.float64], data_ranges: list[tuple[float, float]], @@ -212,6 +425,10 @@ def compute_shannon_multi( It is normalized so that a uniform distribution has unitary entropy. + .. deprecated:: v2025.08.27 + This function is deprecated and will be removed after June 2026. + Use ``analysis.shannon()`` instead. + Parameters: data: shape (n_samples, n_dimensions) @@ -292,6 +509,10 @@ def compute_kl_entropy_multi( differential entropy based on distances to nearest neighbors in the sample space. It's main advantage is being parameter-free. + .. deprecated:: v2025.08.27 + This function is deprecated and will be removed after June 2026. + Use ``analysis.shannon()`` instead. + Parameters: data: The dataset for which the entropy is to be computed. @@ -356,6 +577,10 @@ def compute_entropy_gain( ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. + .. deprecated:: v2025.08.27 + This function is deprecated and will be removed after June 2026. + Use ``analysis.info_gain()`` instead. + Parameters: data: The dataset over which the clustering is performed. @@ -468,6 +693,10 @@ def compute_entropy_gain_multi( ) -> tuple[float, float, float, float]: """Compute the relative information gained by the clustering. + .. deprecated:: v2025.08.27 + This function is deprecated and will be removed after June 2026. + Use ``analysis.info_gain()`` instead. 
+ + Parameters: data: shape (n_samples, n_dimensions) diff --git a/src/dynsight/_internal/analysis/spatial_average.py b/src/dynsight/_internal/analysis/spatial_average.py index 2ff594f8..64b21870 100644 --- a/src/dynsight/_internal/analysis/spatial_average.py +++ b/src/dynsight/_internal/analysis/spatial_average.py @@ -67,9 +67,9 @@ def spatialaverage( universe: Universe, descriptor_array: NDArray[np.float64], selection: str, - cutoff: float, + r_cut: float, trajslice: slice | None = None, - num_processes: int = 1, + n_jobs: int = 1, ) -> NDArray[np.float64]: """Compute spatially averaged descriptor values over neighboring particles. @@ -78,7 +78,7 @@ as a physical property for each particle in each frame of the simulation). For each particle in the system, the function calculates the average of its descriptor values with the descriptor values of its neighboring - particles within a specified cutoff radius. The calculation is parallelized + particles within the specified cutoff radius ``r_cut``. The calculation is parallelized across multiple processes for efficiency. .. caution:: @@ -111,13 +111,13 @@ An atom selection string compatible with MDAnalysis. This defines the subset of atoms for which the spatial averaging will be computed. - cutoff: - The distance cutoff (in the same units as the trajectory) that + r_cut: + The cutoff distance (in the same units as the trajectory) that defines the neighborhood radius within which particles are considered as neighbors. traj_cut: The number of frames to exclude from the end of the trajectory. - num_processes: + n_jobs: The number of processes to use for parallel computation. **Warning:** Adjust this based on the available cores. @@ -149,11 +149,12 @@ universe=u, descriptor_array=descriptor, selection='name CA', - cutoff=5.0, - num_processes=8) + r_cut=5.0, + n_jobs=8, + ) This example computes the spatial averages of the descriptor values - for atoms selected as `CA` atoms, within a cutoff of 5.0 units, using 8 + for atoms selected as `CA` atoms, within a cutoff radius of 5.0 units, using 8 processes in parallel. The result is stored in `averaged_values`, a NumPy array. All supported input file formats by MDAnalysis can be used to set up the Universe.
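For reference, a sketch of the renamed keyword arguments (``r_cut``, ``n_jobs``) at the ``Trj``/``Insight`` level, mirroring ``examples/analysis_workflow.py``; the trajectory file name is a placeholder:

    from pathlib import Path

    import MDAnalysis

    from dynsight.trajectory import Trj

    # Placeholder file; any topology/trajectory readable by MDAnalysis works.
    trj = Trj(MDAnalysis.Universe(Path("trajectory.xyz")))

    # LENS with the renamed arguments (r_cut instead of cutoff,
    # n_jobs instead of num_processes).
    neigcounts, lens = trj.get_lens(r_cut=7.5, n_jobs=4)

    # Following examples/analysis_workflow.py, slice off the last frame
    # before spatially averaging the LENS descriptor.
    trj_lens = trj.with_slice(slice(0, -1, 1))
    lens_aver = lens.spatial_average(trj=trj_lens, r_cut=7.5, n_jobs=4)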
@@ -184,7 +185,7 @@ def spatialaverage( raise ValueError(msg) pool = Pool( - processes=num_processes, + processes=n_jobs, initializer=initworker, initargs=(shared_array, shape, dtype), ) @@ -193,7 +194,7 @@ def spatialaverage( range(*trajslice.indices(universe.trajectory.n_frames)) ) args = [ - (universe, selection, cutoff, traj_frame, i, is_vector) + (universe, selection, r_cut, traj_frame, i, is_vector) for i, traj_frame in enumerate(frame_indices) ] results = pool.map(processframe, args) diff --git a/src/dynsight/_internal/lens/lens.py b/src/dynsight/_internal/lens/lens.py index 0da9ea9f..f4d5b82d 100644 --- a/src/dynsight/_internal/lens/lens.py +++ b/src/dynsight/_internal/lens/lens.py @@ -17,7 +17,7 @@ def _process_neighbour_frame( args: tuple[Universe, AtomGroup, float, int, int], ) -> tuple[int, list[NDArray[np.float64]]]: - universe, selection, cutoff, traj_frame, result_index = args + universe, selection, r_cut, traj_frame, result_index = args universe.trajectory[traj_frame] neigh_search = AtomNeighborSearch( @@ -25,27 +25,27 @@ def _process_neighbour_frame( ) neigh_list_per_atom = [ - neigh_search.search(atom, cutoff).ix for atom in selection + neigh_search.search(atom, r_cut).ix for atom in selection ] return result_index, neigh_list_per_atom def list_neighbours_along_trajectory( - input_universe: Universe, - cutoff: float, + universe: Universe, + r_cut: float, selection: str = "all", trajslice: slice | None = None, - num_processes: int = 1, + n_jobs: int = 1, ) -> list[list[AtomGroup]]: """Produce a per-frame list of the neighbors, atom by atom. * Original author: Martina Crippa Parameters: - input_universe: + universe: The universe, or the atom group containing the trajectory. - cutoff: + r_cut: The maximum neighbor distance. selection: Selection of atoms taken from the Universe for the computation. @@ -53,7 +53,7 @@ def list_neighbours_along_trajectory( `here `_ trajslice: The slice of the trajectory to consider. Defaults to slice(None). - num_processes: + n_jobs: The number of processes to use for parallel computation. **Warning:** Adjust this based on the available cores. @@ -76,11 +76,11 @@ def list_neighbours_along_trajectory( from dynsight.lens import list_neighbours_along_trajectory univ = MDAnalysis.Universe(path / "trajectory.xyz") - cutoff = 2.0 + r_cut = 2.0 neigh_counts = list_neighbours_along_trajectory( - input_universe=univ, - cutoff=cutoff, + universe=univ, + r_cut=r_cut, ) .. testcode:: lens1-test @@ -93,35 +93,35 @@ def list_neighbours_along_trajectory( """ if trajslice is None: trajslice = slice(None) - selected_atoms = input_universe.select_atoms(selection) + selected_atoms = universe.select_atoms(selection) frame_indices = list( - range(*trajslice.indices(input_universe.trajectory.n_frames)) + range(*trajslice.indices(universe.trajectory.n_frames)) ) - if num_processes < 1: - msg = "num_processes cannot be negative or zero." + if n_jobs < 1: + msg = "n_jobs cannot be negative or zero." 
raise ValueError(msg) - if num_processes == 1: + if n_jobs == 1: neigh_list_per_frame = [] for traj_frame in frame_indices: - input_universe.trajectory[traj_frame] + universe.trajectory[traj_frame] neigh_search = AtomNeighborSearch( - input_universe.atoms, - box=input_universe.trajectory.ts.dimensions, + universe.atoms, + box=universe.trajectory.ts.dimensions, ) neigh_list_per_atom = [ - neigh_search.search(atom, cutoff).ix for atom in selected_atoms + neigh_search.search(atom, r_cut).ix for atom in selected_atoms ] neigh_list_per_frame.append(neigh_list_per_atom) return neigh_list_per_frame args = [ - (input_universe, selected_atoms, cutoff, frame, i) + (universe, selected_atoms, r_cut, frame, i) for i, frame in enumerate(frame_indices) ] - with Pool(processes=num_processes) as pool: + with Pool(processes=n_jobs) as pool: results = pool.map(_process_neighbour_frame, args) ordered_results = dict(results) @@ -180,11 +180,11 @@ def neighbour_change_in_time( import dynsight.lens as lens univ = MDAnalysis.Universe(path / "trajectory.xyz") - cutoff = 3.0 + r_cut = 3.0 neig_counts = lens.list_neighbours_along_trajectory( - input_universe=univ, - cutoff=cutoff, + universe=univ, + r_cut=r_cut, ) lens, n_neigh, *_ = lens.neighbour_change_in_time(neig_counts) diff --git a/src/dynsight/_internal/timesoap/timesoap.py b/src/dynsight/_internal/timesoap/timesoap.py index 4a41b73a..e940c5e5 100644 --- a/src/dynsight/_internal/timesoap/timesoap.py +++ b/src/dynsight/_internal/timesoap/timesoap.py @@ -10,13 +10,11 @@ import numpy as np -def normalize_soap( - soaptrajectory: NDArray[np.float64], -) -> NDArray[np.float64]: +def normalize_soap(soap: NDArray[np.float64]) -> NDArray[np.float64]: """Returns the SOAP spectra normalized to unitary length. Parameters: - soaptrajectory : shape (n_particles, n_frames, n_comp) + soap : shape (n_particles, n_frames, n_comp) The SOAP spctra for the trajectory. Returns: @@ -53,10 +51,10 @@ def normalize_soap( np.sum(unitary_soap[0]), 21.987915602216525, atol=1e-6, rtol=1e-3) """ - norms = np.linalg.norm(soaptrajectory, axis=-1) + norms = np.linalg.norm(soap, axis=-1) mask = norms > 0.0 - norm_soap = np.zeros(soaptrajectory.shape) - norm_soap[mask] = soaptrajectory[mask] / norms[..., np.newaxis][mask] + norm_soap = np.zeros(soap.shape) + norm_soap[mask] = soap[mask] / norms[..., np.newaxis][mask] return norm_soap @@ -129,17 +127,14 @@ def soap_distance( return np.sqrt(tsoap) -def timesoap( - soaptrajectory: NDArray[np.float64], - delay: int = 1, -) -> NDArray[np.float64]: +def timesoap(soap: NDArray[np.float64], delay: int = 1) -> NDArray[np.float64]: """Performs the 'timeSOAP' analysis on the given SOAP trajectory. timeSOAP was developed by Cristina Caurso. See for reference the paper https://doi.org/10.1063/5.0147025. Parameters: - soaptrajectory: shape (n_particles, n_frames, n_comp) + soap: shape (n_particles, n_frames, n_comp) The SOAP spctra for the trajectory. 
delay: @@ -178,9 +173,7 @@ def timesoap( atol=1e-6, rtol=1e-3) """ value_error = "delay value outside bounds" - if delay < 1 or delay >= soaptrajectory.shape[1]: + if delay < 1 or delay >= soap.shape[1]: raise ValueError(value_error) - return soap_distance( - soaptrajectory[:, :-delay, :], soaptrajectory[:, delay:, :] - ) + return soap_distance(soap[:, :-delay, :], soap[:, delay:, :]) diff --git a/src/dynsight/_internal/trajectory/insight.py b/src/dynsight/_internal/trajectory/insight.py index 577a7cce..6cfd3c86 100644 --- a/src/dynsight/_internal/trajectory/insight.py +++ b/src/dynsight/_internal/trajectory/insight.py @@ -92,7 +92,7 @@ def spatial_average( trj: Trj, r_cut: float, selection: str = "all", - num_processes: int = 1, + n_jobs: int = 1, ) -> Insight: """Average the descriptor over the neighboring particles. @@ -103,9 +103,9 @@ def spatial_average( universe=trj.universe, descriptor_array=self.dataset, selection=selection, - cutoff=r_cut, + r_cut=r_cut, trajslice=trj.trajslice, - num_processes=num_processes, + n_jobs=n_jobs, ) attr_dict = {"sp_av_r_cut": r_cut, "selection": selection} diff --git a/src/dynsight/_internal/trajectory/trajectory.py b/src/dynsight/_internal/trajectory/trajectory.py index d45d3e29..99a1be23 100644 --- a/src/dynsight/_internal/trajectory/trajectory.py +++ b/src/dynsight/_internal/trajectory/trajectory.py @@ -131,9 +131,9 @@ def get_slice(self, start: int, stop: int, step: int) -> Trj: def get_coord_number( self, r_cut: float, - delay: int = 1, selection: str = "all", neigcounts: list[list[AtomGroup]] | None = None, + n_jobs: int = 1, ) -> tuple[list[list[AtomGroup]], Insight]: """Compute coordination number on the trajectory. @@ -146,17 +146,18 @@ def get_coord_number( """ if neigcounts is None: neigcounts = dynsight.lens.list_neighbours_along_trajectory( - input_universe=self.universe, - cutoff=r_cut, + universe=self.universe, + r_cut=r_cut, selection=selection, trajslice=self.trajslice, + n_jobs=n_jobs, ) _, nn, *_ = dynsight.lens.neighbour_change_in_time( neigh_list_per_frame=neigcounts, - delay=delay, + delay=1, ) - attr_dict = {"r_cut": r_cut, "delay": delay, "selection": selection} + attr_dict = {"r_cut": r_cut, "selection": selection} logger.log(f"Computed coord_number using args {attr_dict}.") return neigcounts, Insight( dataset=nn.astype(np.float64), @@ -169,7 +170,7 @@ def get_lens( delay: int = 1, selection: str = "all", neigcounts: list[list[AtomGroup]] | None = None, - num_processes: int = 1, + n_jobs: int = 1, ) -> tuple[list[list[AtomGroup]], Insight]: """Compute LENS on the trajectory. @@ -182,11 +183,11 @@ def get_lens( """ if neigcounts is None: neigcounts = dynsight.lens.list_neighbours_along_trajectory( - input_universe=self.universe, - cutoff=r_cut, + universe=self.universe, + r_cut=r_cut, selection=selection, trajslice=self.trajslice, - num_processes=num_processes, + n_jobs=n_jobs, ) lens, *_ = dynsight.lens.neighbour_change_in_time( neigh_list_per_frame=neigcounts, @@ -209,7 +210,7 @@ def get_soap( selection: str = "all", centers: str = "all", respect_pbc: bool = True, - n_core: int = 1, + n_jobs: int = 1, ) -> Insight: """Compute SOAP on the trajectory. 
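For reference, a sketch of the updated ``Trj.get_coord_number()`` signature (``delay`` removed, ``n_jobs`` added) and of reusing the returned neighbor lists; the trajectory file name and cutoff value are placeholders:

    from pathlib import Path

    import MDAnalysis

    from dynsight.trajectory import Trj

    # Placeholder file name and cutoff.
    trj = Trj(MDAnalysis.Universe(Path("trajectory.xyz")))

    # get_coord_number() no longer takes `delay`; parallelism is set via n_jobs.
    neigcounts, coord_num = trj.get_coord_number(r_cut=2.0, n_jobs=4)

    # The returned neighbor lists can be passed to other descriptors computed
    # with the same cutoff, so the neighbor search is not repeated.
    _, lens = trj.get_lens(r_cut=2.0, neigcounts=neigcounts, n_jobs=4)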
@@ -224,7 +225,7 @@ def get_soap( selection=selection, soap_respectpbc=respect_pbc, centers=centers, - n_core=n_core, + n_core=n_jobs, trajslice=self.trajslice, ) attr_dict = { @@ -244,6 +245,7 @@ def get_orientational_op( order: int = 6, selection: str = "all", neigcounts: list[list[AtomGroup]] | None = None, + n_jobs: int = 1, ) -> tuple[list[list[AtomGroup]], Insight]: """Compute the magnitude of the orientational order parameter. @@ -256,10 +258,11 @@ def get_orientational_op( """ if neigcounts is None: neigcounts = dynsight.lens.list_neighbours_along_trajectory( - input_universe=self.universe, - cutoff=r_cut, + universe=self.universe, + r_cut=r_cut, selection=selection, trajslice=self.trajslice, + n_jobs=n_jobs, ) psi = dynsight.descriptors.orientational_order_param( self.universe, @@ -283,6 +286,7 @@ def get_velocity_alignment( r_cut: float, selection: str = "all", neigcounts: list[list[AtomGroup]] | None = None, + n_jobs: int = 1, ) -> tuple[list[list[AtomGroup]], Insight]: """Compute the average velocity alignment. @@ -295,10 +299,11 @@ def get_velocity_alignment( """ if neigcounts is None: neigcounts = dynsight.lens.list_neighbours_along_trajectory( - input_universe=self.universe, - cutoff=r_cut, + universe=self.universe, + r_cut=r_cut, selection=selection, trajslice=self.trajslice, + n_jobs=n_jobs, ) phi = dynsight.descriptors.velocity_alignment( self.universe, @@ -357,3 +362,44 @@ def get_rdf( } logger.log(f"Computed g(r) with args {attr_dict}.") return bins, rdf + + def dump_colored_trj( + self, + labels: NDArray[np.int64], + file_path: Path, + ) -> None: + """Save an .xyz file with the labels for each atom. + + The output file has columns: atom_type, x, y, z, label. + """ + trajslice = slice(None) if self.trajslice is None else self.trajslice + + if labels.shape != (self.n_atoms, self.n_frames): + msg = ( + f"Shape mismatch: ClusterInsight should have " + f"{self.n_atoms} atoms, {self.n_frames} frames, but has " + f"{labels.shape[0]} atoms, {labels.shape[1]} frames." 
+ ) + logger.log(msg) + raise ValueError(msg) + + lab_new = labels + 2 + with file_path.open("w") as f: + for i, ts in enumerate(self.universe.trajectory[trajslice]): + f.write(f"{self.n_atoms}\n") + if ts.dimensions is not None: + box_str = " ".join(f"{x:.5f}" for x in ts.dimensions) + else: + box_str = "0.0 0.0 0.0 0.0 0.0 0.0" + f.write( + f"Lattice={box_str} " + f"Properties=species:S:1:pos:R:3:type:I:1\n" + ) + for atom_idx in range(self.n_atoms): + label = str(lab_new[atom_idx, i]) + x, y, z = ts.positions[atom_idx] + f.write( + f"{self.universe.atoms[atom_idx].name} {x:.5f}" + f" {y:.5f} {z:.5f} {label}\n" + ) + logger.log(f"Colored trj saved to {file_path}.") diff --git a/src/dynsight/_internal/utilities/utilities.py b/src/dynsight/_internal/utilities/utilities.py index 19f3b45f..43f8363b 100644 --- a/src/dynsight/_internal/utilities/utilities.py +++ b/src/dynsight/_internal/utilities/utilities.py @@ -94,7 +94,7 @@ def load_or_compute_soap( selection=selection, centers=centers, respect_pbc=respect_pbc, - n_core=n_core, + n_jobs=n_core, ) if soap_path is not None: diff --git a/src/dynsight/analysis.py b/src/dynsight/analysis.py index 4f08dc8a..a76d3946 100644 --- a/src/dynsight/analysis.py +++ b/src/dynsight/analysis.py @@ -8,7 +8,9 @@ compute_negentropy, compute_shannon, compute_shannon_multi, + info_gain, sample_entropy, + shannon, ) from dynsight._internal.analysis.rdf import compute_rdf from dynsight._internal.analysis.spatial_average import ( @@ -29,7 +31,9 @@ "compute_shannon", "compute_shannon_multi", "cross_time_correlation", + "info_gain", "sample_entropy", "self_time_correlation", + "shannon", "spatialaverage", ] diff --git a/tests/analysis/test_shannon.py b/tests/analysis/test_shannon.py index 185c10de..2bfb51a9 100644 --- a/tests/analysis/test_shannon.py +++ b/tests/analysis/test_shannon.py @@ -1,4 +1,4 @@ -"""Pytest for dynsight.analysis.compute_entropy_gain.""" +"""Pytest for dynsight.analysis.entropy.""" import numpy as np import pytest @@ -42,9 +42,8 @@ def labels(rng: np.random.Generator) -> NDArray[np.int64]: @pytest.fixture -def bad_labels() -> NDArray[np.int64]: +def bad_labels(rng: np.random.Generator) -> NDArray[np.int64]: """Mismatched label shape.""" - rng = np.random.default_rng(12345) return rng.integers(0, 5, (200, 50), dtype=np.int64) @@ -56,18 +55,25 @@ def test_wrong_input( labels: NDArray[np.int64], ) -> None: """Check wrong input raises Errors.""" + with pytest.raises(ValueError, match=r"method must be 'histo' or 'kl'."): + dynsight.analysis.shannon(data_2d, method="wrong") # type: ignore[arg-type] + with pytest.raises(ValueError, match="data is empty"): dynsight.analysis.compute_shannon(np.array([]), (0.0, 1.0), n_bins=20) with pytest.raises(ValueError, match="data is empty"): dynsight.analysis.compute_shannon_multi( - np.array([]), [(0.0, 1.0), (0.0, 1.0)], n_bins=[20, 20] + np.array([]), + [(0.0, 1.0), (0.0, 1.0)], + n_bins=[20, 20], ) with pytest.raises( ValueError, match="Mismatch between data dimensions, data_ranges, and n_bins", ): dynsight.analysis.compute_shannon_multi( - data_2d, [(0.0, 1.0)], n_bins=[20] + data_2d, + [(0.0, 1.0)], + n_bins=[20], ) with pytest.raises( RuntimeError, @@ -76,7 +82,9 @@ def test_wrong_input( r"must have same shape\[0\]", ): dynsight.analysis.compute_entropy_gain_multi( - data_2d, labels[:50], n_bins=[20, 20] + data_2d, + labels[:50], + n_bins=[20, 20], ) @@ -86,12 +94,28 @@ def test_bad_shape( """Bad label shape should raise RuntimeError.""" with pytest.raises(RuntimeError): 
dynsight.analysis.compute_entropy_gain(data, bad_labels, n_bins=20) + with pytest.raises(RuntimeError): + dynsight.analysis.info_gain(data, bad_labels, "kl") + + +def test_shannon(data_2d: NDArray[np.float64]) -> None: + """Check Shannon-based functions.""" + entropy = dynsight.analysis.shannon(data_2d, method="histo") + assert np.isclose(entropy, 6.643856189774724, rtol=1e-3, atol=1e-3) + entropy = dynsight.analysis.shannon(data_2d, method="kl") + assert np.isclose(entropy, 0.0755109447796211, rtol=1e-3, atol=1e-3) + entropy = dynsight.analysis.shannon(np.array([0.0, 1.0]), method="kl") + assert entropy == 0.0 + negentropy = dynsight.analysis.compute_negentropy(data_2d) + assert np.isclose(negentropy, 0.40485364974320515, rtol=1e-3, atol=1e-3) def test_gain(data: NDArray[np.float64], labels: NDArray[np.int64]) -> None: """Check entropy gain value.""" _, gain, *_ = dynsight.analysis.compute_entropy_gain( - data, labels, n_bins=20 + data, + labels, + n_bins=20, ) ref = 0.0010842808402454819 assert np.isclose(gain, ref) diff --git a/tests/analysis/test_spatialaverage.py b/tests/analysis/test_spatialaverage.py index 5c2c1ce6..31b2f6e7 100644 --- a/tests/analysis/test_spatialaverage.py +++ b/tests/analysis/test_spatialaverage.py @@ -56,7 +56,7 @@ def test_spavg(trj: Trj, insight: Insight, files: dict[str, Path]) -> None: trj, r_cut=5.0, selection="type O", - num_processes=1, + n_jobs=1, ) expected: NDArray[np.float64] = np.load(files["ref"]) diff --git a/tests/lens/test_lens.py b/tests/lens/test_lens.py index b01dd0ee..3149e162 100644 --- a/tests/lens/test_lens.py +++ b/tests/lens/test_lens.py @@ -42,7 +42,7 @@ def test_lens_signals(case_data: LENSCaseData) -> None: # Run LENS (and nn) calculation for different r_cuts for i, r_cut in enumerate(lens_cutoffs): neigcounts, test_lens = example_trj.get_lens( - r_cut=r_cut, num_processes=case_data.num_processes + r_cut=r_cut, n_jobs=case_data.num_processes ) test_lens_ds = np.array( [np.concatenate(([0.0], tmp)) for tmp in test_lens.dataset] diff --git a/tests/lens/test_special_lens.py b/tests/lens/test_special_lens.py index 3b822908..c10bb92b 100644 --- a/tests/lens/test_special_lens.py +++ b/tests/lens/test_special_lens.py @@ -22,8 +22,8 @@ def test_special_lens(lensfixtures: MDAnalysis.Universe) -> None: universe = lensfixtures[0] coff = 1.1 nnlistperframe = dynsight.lens.list_neighbours_along_trajectory( - input_universe=universe, - cutoff=coff, + universe=universe, + r_cut=coff, ) ( mynconttot, diff --git a/tests/lens/to_remove_test_count_neigh.py b/tests/lens/to_remove_test_count_neigh.py index c929ef2a..9a18f563 100644 --- a/tests/lens/to_remove_test_count_neigh.py +++ b/tests/lens/to_remove_test_count_neigh.py @@ -14,7 +14,7 @@ def test_count_neigh_for_lens( - hdf5_file: tuple[pathlib.Path, MDAnalysis.Universe], input1_2: int + hdf5_file: tuple[pathlib.Path, MDAnalysis.Universe], r_cut1_2: int ) -> None: """Test :class:`.list_neighbours_along_trajectory`. 
@@ -32,7 +32,7 @@ def test_count_neigh_for_lens( # this is the original version by Martina Crippa inputuniverse: MDAnalysis.Universe = hdf5_file[1] - wantedslice = slice(0, len(inputuniverse.trajectory) // input1_2, 1) + wantedslice = slice(0, len(inputuniverse.trajectory) // r_cut1_2, 1) coff = 10.0 init = wantedslice.start end = wantedslice.stop @@ -49,8 +49,8 @@ def test_count_neigh_for_lens( cont_list.append([nsearch.search(i, coff, level="A") for i in beads]) for selection in [inputuniverse, beads]: neigh_list_per_frame = dynsight.lens.list_neighbours_along_trajectory( - input_universe=selection, - cutoff=coff, + universe=selection, + r_cut=coff, trajslice=wantedslice, ) diff --git a/tests/lens/to_remove_test_emulate_lens.py b/tests/lens/to_remove_test_emulate_lens.py index 80a8afe6..51ab7237 100644 --- a/tests/lens/to_remove_test_emulate_lens.py +++ b/tests/lens/to_remove_test_emulate_lens.py @@ -36,8 +36,8 @@ def test_emulate_lens( wantedslice = slice(0, len(inputuniverse.trajectory) // input1_2, 1) coff = 4.0 nnlistperframe = dynsight.lens.list_neighbours_along_trajectory( - input_universe=inputuniverse, - cutoff=coff, + universe=inputuniverse, + r_cut=coff, trajslice=wantedslice, ) # def local_dynamics(list_sum): diff --git a/tests/trajectory/test_trj.py b/tests/trajectory/test_trj.py index 5480cbdb..6e674a9d 100644 --- a/tests/trajectory/test_trj.py +++ b/tests/trajectory/test_trj.py @@ -47,7 +47,8 @@ def universe(file_paths: dict[str, Path]) -> MDAnalysis.Universe: def test_trj_inits( - universe: MDAnalysis.Universe, file_paths: dict[str, Path] + universe: MDAnalysis.Universe, + file_paths: dict[str, Path], ) -> None: """Test initialization methods for Trj class.""" n_frames_xyz = 21 @@ -107,7 +108,9 @@ def test_get_descriptors(file_paths: dict[str, Path]) -> None: def test_insight( - tmp_path: Path, file_paths: dict[str, Path], universe: MDAnalysis.Universe + tmp_path: Path, + file_paths: dict[str, Path], + universe: MDAnalysis.Universe, ) -> None: """Test Insight methods.""" trj = Trj(universe)
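For reference, the new ``Trj.dump_colored_trj()`` method has no usage example in this changeset; a minimal sketch with a placeholder trajectory file and dummy labels (in practice the labels would come from a clustering step):

    from pathlib import Path

    import MDAnalysis
    import numpy as np

    from dynsight.trajectory import Trj

    # Placeholder trajectory file.
    trj = Trj(MDAnalysis.Universe(Path("trajectory.xyz")))

    # Dummy labels standing in for a clustering output; dump_colored_trj()
    # requires shape (n_atoms, n_frames) and writes one extra column per atom.
    labels = np.zeros((trj.n_atoms, trj.n_frames), dtype=np.int64)

    trj.dump_colored_trj(labels=labels, file_path=Path("colored_trj.xyz"))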