2 changes: 1 addition & 1 deletion .github/workflows/build_and_publish.yml
@@ -24,4 +24,4 @@ jobs:
run: uv build

- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
uses: pypa/gh-action-pypi-publish@release/v1
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,8 +1,8 @@
.eggs/*
.tox/*
*/__pycache__/*
__pycache__/
*.egg-info/
build/*
dist/*
*/*/AGENTS.md
**/.DS_Store
**/.DS_Store
1 change: 0 additions & 1 deletion LICENSE
@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

85 changes: 81 additions & 4 deletions README.md
@@ -29,6 +29,10 @@ uv pip install sdcpy

## Usage

The main class is `SDCAnalysis`, which takes two time series as input and performs the SDC analysis. If only one time series is provided, it is compared with itself (see the short sketch below the plot).

In this example we generate two synthetic time series with a transient pattern between indices 63 and 169 to showcase the detection of transient correlations.

```python
import numpy as np
import pandas as pd
@@ -46,14 +50,87 @@ ts1 = pd.Series([tc_signal(i) for i in range(250)])
ts2 = pd.Series([tc_signal(i) for i in range(250)])

# Run SDC analysis
sdc = SDCAnalysis(ts1, ts2, fragment_size=50, n_permutations=99)
sdc = SDCAnalysis(
ts1=ts1, # First time series (pd.Series or np.ndarray)
ts2=ts2, # Second time series (pd.Series or np.ndarray)
fragment_size=50, # Size of the sliding window fragment
n_permutations=99, # Number of permutations for significance testing
method="pearson", # Correlation method ("pearson", "spearman" or Callable)
two_tailed=True, # Whether to use two-tailed test
permutations=True, # Whether to compute p-values via permutation
min_lag=-np.inf, # Minimum lag to compute
max_lag=np.inf, # Maximum lag to compute
max_memory_gb=2.0, # Max memory usage for vectorized ops before chunking
)

# Generate 2-way SDC combi plot
fig = sdc.combi_plot(xlabel="TS1", ylabel="TS2")
fig.savefig("sdc_plot.png", dpi=150, bbox_inches="tight")
fig = sdc.combi_plot(
xlabel="$TS_1$", # Label for top axis (TS1)
ylabel="$TS_2$", # Label for left axis (TS2)
title=None, # Plot title (None = auto-generated)
max_r=None, # Max correlation for color scale (None = auto)
date_fmt="%Y-%m-%d", # Date format for axes
align="center", # Alignment of heatmap cells ("center", "left", "right")
min_lag=-np.inf, # Start of lag range to plot
max_lag=np.inf, # End of lag range to plot
fontsize=9, # Base font size
figsize=(7, 7), # Figure size (width, height)
show_colorbar=True, # Whether to show the colorbar
show_ts2=True, # Whether to show TS2 time series panel
dpi=250, # Resolution for saving/displaying
)

```
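`combi_plot` returns a matplotlib `Figure` (as used in `examples/basic_usage.py`), so saving it is the standard call; the file name here is only an example:

```python
fig.savefig("sdc_plot.png", dpi=150, bbox_inches="tight")
```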

<img src="sdc_example.png" width="600"/>
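As noted above, `SDCAnalysis` can also be run on a single time series, which is then compared with itself. A minimal sketch reusing `ts1` from the example, assuming `ts2` may simply be omitted:

```python
# Self-comparison: look for scale-dependent self-similarity in ts1
sdc_self = SDCAnalysis(ts1=ts1, fragment_size=50, n_permutations=99)
```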

### Access Detailed Results

You can access the detailed SDC results via the `sdc.sdc_df` DataFrame. It contains the coordinates of each fragment pair along with the corresponding lag, correlation, and p-value.

```python
sdc.sdc_df.head()
```

<img src="sdc_example.png" width="500" />
```text
start_1 stop_1 start_2 stop_2 lag r p_value
0.0 50.0 0.0 50.0 0.0 0.2267 0.0495
0.0 50.0 1.0 51.0 -1.0 -0.0047 1.0000
0.0 50.0 2.0 52.0 -2.0 0.0579 0.6238
0.0 50.0 3.0 53.0 -3.0 0.0602 0.6337
0.0 50.0 4.0 54.0 -4.0 -0.1660 0.2475
```
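`sdc_df` is a regular `pandas.DataFrame`, so the usual filtering applies. For example, a minimal sketch (using the column names shown above) to keep only the significant positive correlations at alpha = 0.05:

```python
alpha = 0.05  # significance threshold for the permutation p-values
sig = sdc.sdc_df[sdc.sdc_df["p_value"] <= alpha]
sig_pos = sig[sig["r"] > 0]
print(f"{len(sig_pos)} of {len(sdc.sdc_df)} fragment pairs are significantly positively correlated")
```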

### Correlation by Value Range

To check whether synchronies occur in specific value ranges of the time series, use `get_ranges_df()`. It aggregates each fragment's values (`agg_func`), bins the fragments by that value, and computes counts and frequencies of Positive/Negative/Non-significant (NS) correlations per bin, returning a `pandas.DataFrame` with the results.

```python
sdc.get_ranges_df(
ts=1, # Which TS to bin by (1 or 2)
bin_size=0.5, # Size of value bins
agg_func="mean", # Aggregation function for fragment values
alpha=0.05, # Significance threshold
min_bin=None, # Manual lower bound for bins (None = auto)
max_bin=None, # Manual upper bound for bins (None = auto)
min_lag=0, # Minimum lag to include in stats
max_lag=10, # Maximum lag to include in stats
)
```

```text
cat_value direction counts n freq label
(-0.6, 0.0] Positive 504 1232 0.4091 40.9 %
(-0.6, 0.0] Negative 0 1232 0.0000 0.0 %
(-0.6, 0.0] NS 728 1232 0.5909 59.1 %
(0.0, 0.5] Positive 218 883 0.2469 24.7 %
(0.0, 0.5] Negative 10 883 0.0111 1.1 %
(0.0, 0.5] NS 655 883 0.7418 74.2 %
(0.5, 1.0] Positive 1 19 0.0526 5.3 %
(0.5, 1.0] Negative 0 19 0.0000 0.0 %
(0.5, 1.0] NS 18 19 0.9474 94.7 %
(1.0, 1.5] Positive 0 0 0.0000 0.0 %
```
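The returned DataFrame is in long format (one row per bin and direction), so it can be reshaped for a quick overview; a sketch assuming the column names shown above:

```python
ranges_df = sdc.get_ranges_df(ts=1, bin_size=0.5, min_lag=0, max_lag=10)
# One row per value bin, one column per direction (Positive/Negative/NS)
freq_table = ranges_df.pivot(index="cat_value", columns="direction", values="freq")
print(freq_table.round(3))
```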

See [examples/basic_usage.py](examples/basic_usage.py) for a complete example with synthetic data showing transient correlations.

2 changes: 1 addition & 1 deletion examples/basic_usage.py
@@ -31,5 +31,5 @@ def tc_signal(i):

# Generate combination plot
fig = sdc.combi_plot(xlabel="$TS_1$", ylabel="$TS_2$")
fig.savefig("sdc_example.png", dpi=300, bbox_inches="tight")
fig.savefig("sdc_example.png", bbox_inches="tight")
print("Saved: sdc_example.png")
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "sdcpy"
version = "0.5.2"
version = "0.7.0"
description = "Scale Dependent Correlation in Python"
readme = "README.md"
license = "MIT"
@@ -88,7 +88,7 @@ ignore = [
known-first-party = ["sdcpy"]

[tool.bumpversion]
current_version = "0.5.0"
current_version = "0.7.0"
commit = true
tag = true

Binary file modified sdc_example.png
180 changes: 140 additions & 40 deletions sdcpy/core.py
@@ -14,7 +14,51 @@
"spearman": lambda x, y: stats.spearmanr(x, y),
}

CONSTANT_WARNING = {"pearson": stats.ConstantInputWarning, "spearman": stats.ConstantInputWarning}

# Default maximum memory threshold (in GB) for full vectorized computation.
# Above this, chunked processing is used automatically.
DEFAULT_MAX_MEMORY_GB = 2.0
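# For intuition (hypothetical sizes): n1 = n2 = 200 fragments with
# n_permutations = 99 give n_root = 10, i.e. 100 effective permutations, so
# the permuted correlation matrices alone take 100 * 200 * 200 * 8 bytes,
# roughly 0.03 GB, far below this default, and the fully vectorized path is used.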


def _estimate_vectorized_memory(
n1: int, n2: int, n_permutations: int, dtype: np.dtype = np.float64
) -> float:
"""
Estimate peak memory usage (in GB) for the fully vectorized SDC computation.

The dominant memory consumers are:
- Permuted correlation matrices: (n_root^2, n1, n2), where n_root = sqrt(n_permutations)
- Fragment matrices: (n1, fragment_size) + (n2, fragment_size) (small; not counted below)
- Correlation matrix: (n1, n2)
- Grid/lag matrices: 3 * (n1, n2)

Parameters
----------
n1
Number of fragments from first time series
n2
Number of fragments from second time series
n_permutations
Number of permutations for the randomization test
dtype
Data type (default float64 = 8 bytes)

Returns
-------
float
Estimated peak memory usage in gigabytes
"""
bytes_per_element = np.dtype(dtype).itemsize
n_root = int(np.sqrt(n_permutations).round())
n_actual_perms = n_root * n_root

# Main memory consumers
perm_matrices = n_actual_perms * n1 * n2 * bytes_per_element
corr_matrix = n1 * n2 * bytes_per_element
grid_matrices = 3 * n1 * n2 * bytes_per_element # start_1_grid, start_2_grid, lag_matrix

total_bytes = perm_matrices + corr_matrix + grid_matrices
return total_bytes / (1024**3) # Convert to GB


def generate_correlation_map(x: np.ndarray, y: np.ndarray, method: str = "pearson") -> np.ndarray:
@@ -94,6 +138,7 @@ def compute_sdc(
permutations: bool = True,
min_lag: int = -np.inf,
max_lag: int = np.inf,
max_memory_gb: float = DEFAULT_MAX_MEMORY_GB,
) -> pd.DataFrame:
"""
Computes the scale dependent correlation (https://doi.org/10.1007/s00382-005-0106-4) matrix between two time series
@@ -124,6 +169,10 @@
Lower limit of the lags between ts1 and ts2 that will be computed.
max_lag
Upper limit of the lags between ts1 and ts2 that will be computed.
max_memory_gb
Maximum memory (in GB) to use for full vectorized computation. If estimated memory exceeds
this limit, the computation falls back to chunked processing which uses constant memory but
may be slightly slower. Default is 2.0 GB.

Returns
-------
@@ -147,6 +196,7 @@
permutations,
min_lag,
max_lag,
max_memory_gb,
)
else:
# Fall back to original loop-based implementation for custom callables
@@ -173,8 +223,16 @@
permutations: bool,
min_lag: int,
max_lag: int,
max_memory_gb: float = DEFAULT_MAX_MEMORY_GB,
) -> pd.DataFrame:
"""Vectorized implementation for built-in correlation methods."""
"""Vectorized implementation for built-in correlation methods.

Parameters
----------
max_memory_gb
Maximum memory (in GB) to use for full vectorization. If estimated
memory exceeds this, chunked processing is used automatically.
"""
n1 = len(ts1) - fragment_size
n2 = len(ts2) - fragment_size

@@ -210,44 +268,86 @@
n_root = int(np.sqrt(n_permutations).round())
n_actual_perms = n_root * n_root

# OPTIMIZED: Pre-shuffle all fragments and compute full permuted correlation matrices
# This is much faster than shuffling per-pair because we compute n_root^2 full
# correlation matrices instead of n_valid individual permutation tests

# Pre-compute shuffled versions of all fragments
# Shape: (n_root, n_fragments, fragment_size)
shuffled_frags1 = np.array(
[shuffle_along_axis(frags1.copy(), axis=1) for _ in range(n_root)]
)
shuffled_frags2 = np.array(
[shuffle_along_axis(frags2.copy(), axis=1) for _ in range(n_root)]
)

# Compute permuted correlation matrices for all combinations of shuffled fragments
# Shape: (n_root, n_root, n1, n2)
perm_corr_matrices = np.zeros((n_root, n_root, n1, n2))
for i in tqdm(range(n_root), desc="Computing permutations", leave=False):
for j in range(n_root):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
perm_corr_matrices[i, j] = generate_correlation_map(
shuffled_frags1[i], shuffled_frags2[j], method=method
)

# Reshape to (n_actual_perms, n1, n2)
perm_corrs_flat = perm_corr_matrices.reshape(n_actual_perms, n1, n2)

# Compute p-values vectorized
if two_tailed:
# Count how many abs(perm) >= abs(observed) for each position
abs_observed = np.abs(corr_matrix)
abs_perms = np.abs(perm_corrs_flat)
counts = (abs_perms >= abs_observed[np.newaxis, :, :]).sum(axis=0)
# Estimate memory and decide strategy
estimated_memory = _estimate_vectorized_memory(n1, n2, n_permutations)
use_chunked = estimated_memory > max_memory_gb

if use_chunked:
warnings.warn(
f"Estimated memory ({estimated_memory:.2f} GB) exceeds limit "
f"({max_memory_gb:.2f} GB). Using chunked processing.",
UserWarning,
stacklevel=3,
)
# Chunked approach: accumulate counts without storing all permutation matrices
counts = np.zeros((n1, n2), dtype=np.int32)

# Pre-compute shuffled versions of all fragments
shuffled_frags1 = np.array(
[shuffle_along_axis(frags1.copy(), axis=1) for _ in range(n_root)]
)
shuffled_frags2 = np.array(
[shuffle_along_axis(frags2.copy(), axis=1) for _ in range(n_root)]
)

# Process one permutation pair at a time, accumulating counts
abs_observed = np.abs(corr_matrix) if two_tailed else None
with tqdm(
total=n_actual_perms, desc="Computing permutations (chunked)", leave=False
) as pbar:
for i in range(n_root):
for j in range(n_root):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
perm_corr = generate_correlation_map(
shuffled_frags1[i], shuffled_frags2[j], method=method
)
if two_tailed:
counts += (np.abs(perm_corr) >= abs_observed).astype(np.int32)
else:
counts += (perm_corr >= corr_matrix).astype(np.int32)
pbar.update(n_root)

# P-value: (count + 1) / (n_perms + 1) for proper permutation test
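# (The +1 in numerator and denominator counts the observed statistic as one
# of the permutations, guaranteeing p > 0 and a valid permutation test.)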
p_value_matrix = (counts + 1) / (n_actual_perms + 1)
else:
counts = (perm_corrs_flat >= corr_matrix[np.newaxis, :, :]).sum(axis=0)

# P-value: (count + 1) / (n_perms + 1) for proper permutation test
p_value_matrix = (counts + 1) / (n_actual_perms + 1)
# Full vectorized approach: store all permutation matrices
# Pre-compute shuffled versions of all fragments
# Shape: (n_root, n_fragments, fragment_size)
shuffled_frags1 = np.array(
[shuffle_along_axis(frags1.copy(), axis=1) for _ in range(n_root)]
)
shuffled_frags2 = np.array(
[shuffle_along_axis(frags2.copy(), axis=1) for _ in range(n_root)]
)

# Compute permuted correlation matrices for all combinations of shuffled fragments
# Shape: (n_root, n_root, n1, n2)
perm_corr_matrices = np.zeros((n_root, n_root, n1, n2))
with tqdm(total=n_actual_perms, desc="Computing permutations", leave=False) as pbar:
for i in range(n_root):
for j in range(n_root):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
perm_corr_matrices[i, j] = generate_correlation_map(
shuffled_frags1[i], shuffled_frags2[j], method=method
)
pbar.update(n_root)

# Reshape to (n_actual_perms, n1, n2)
perm_corrs_flat = perm_corr_matrices.reshape(n_actual_perms, n1, n2)

# Compute p-values vectorized
if two_tailed:
# Count how many abs(perm) >= abs(observed) for each position
abs_observed = np.abs(corr_matrix)
abs_perms = np.abs(perm_corrs_flat)
counts = (abs_perms >= abs_observed[np.newaxis, :, :]).sum(axis=0)
else:
counts = (perm_corrs_flat >= corr_matrix[np.newaxis, :, :]).sum(axis=0)

# P-value: (count + 1) / (n_perms + 1) for proper permutation test
p_value_matrix = (counts + 1) / (n_actual_perms + 1)

# Extract p-values for valid entries
p_values = p_value_matrix[valid_mask]
@@ -291,7 +391,7 @@ def _compute_sdc_loop(
"""Original loop-based implementation for custom callable methods."""
method_fun = method
n_iterations = (len(ts1) - fragment_size) * (len(ts2) - fragment_size)
int(np.sqrt(n_permutations).round())

sdc_array = np.empty(shape=(n_iterations, 7))
sdc_array[:] = np.nan
i = 0