Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions chemap/fingerprint_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,12 @@ def _rdkit_unfolded(

if cfg.count:
out: UnfoldedCount = []
for s, mol in zip(smiles, mols):
for s, mol in tqdm(
zip(smiles, mols),
disable=not show_progress,
desc="Compute fingerprints",
total=len(mols)
):
if mol is None:
_handle_invalid(cfg.invalid_policy, s)
if cfg.invalid_policy == "keep":
Expand All @@ -356,7 +361,12 @@ def _rdkit_unfolded(
return out

out: UnfoldedBinary = []
for s, mol in zip(smiles, mols):
for s, mol in tqdm(
zip(smiles, mols),
disable=not show_progress,
desc="Compute fingerprints",
total=len(mols)
):
if mol is None:
_handle_invalid(cfg.invalid_policy, s)
if cfg.invalid_policy == "keep":
Expand Down Expand Up @@ -386,7 +396,12 @@ def _rdkit_folded_dense(
n_features: Optional[int] = None
pending_invalid: List[int] = [] # indices in `rows` that need backfill after we learn D

for s, mol in zip(smiles, mols):
for s, mol in tqdm(
zip(smiles, mols),
disable=not show_progress,
desc="Compute fingerprints",
total=len(mols)
):
if mol is None:
_handle_invalid(cfg.invalid_policy, s)
if cfg.invalid_policy == "keep":
Expand Down Expand Up @@ -450,7 +465,12 @@ def _rdkit_folded_csr(
if cfg.folded_weights is not None:
w = np.asarray(cfg.folded_weights, dtype=np.float32).ravel()

for s, mol in zip(smiles, mols):
for s, mol in tqdm(
zip(smiles, mols),
disable=not show_progress,
desc="Compute fingerprints",
total=len(mols)
):
if mol is None:
_handle_invalid(cfg.invalid_policy, s)

Expand Down
36 changes: 18 additions & 18 deletions chemap/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,55 +219,55 @@ def tanimoto_similarity_matrix_dense(references: np.ndarray, queries: np.ndarray
# This is O(R*Q*avg_nnz_merge) and can be expensive for large R,Q.
# For huge datasets prefer ANN (PyNNDescent/UMAP) with `tanimoto_distance_sparse`.
@numba.njit(parallel=True, fastmath=True, cache=True)
def tanimoto_similarity_matrix_sparse_binary(fingerprints_1, fingerprints_2) -> np.ndarray:
    """
    Pairwise Tanimoto similarity between two sets of unfolded or sparse binary fingerprints.

    Parameters
    ----------
    fingerprints_1
        List of 1D numpy arrays of sorted bit indices (unique).
    fingerprints_2
        List of 1D numpy arrays of sorted bit indices (unique).

    Returns
    -------
    np.ndarray
        float32 array of shape (len(fingerprints_1), len(fingerprints_2)).
        Entry [i, j] is the result of ``tanimoto_similarity_sparse_binary``
        for ``fingerprints_1[i]`` and ``fingerprints_2[j]``.
    """
    R = len(fingerprints_1)
    Q = len(fingerprints_2)
    out = np.empty((R, Q), dtype=np.float32)
    # Each output row is written by exactly one outer iteration, so the
    # outer loop can be distributed with prange.
    for i in numba.prange(R):
        for j in range(Q):
            out[i, j] = tanimoto_similarity_sparse_binary(fingerprints_1[i], fingerprints_2[j])
    return out


@numba.njit(parallel=True, fastmath=True, cache=True)
def tanimoto_similarity_matrix_sparse(
    fingerprints_1_bits,
    fingerprints_1_vals,
    fingerprints_2_bits,
    fingerprints_2_vals
) -> np.ndarray:
    """
    Pairwise generalized Tanimoto similarity between two sets of unfolded count/weight fingerprints.

    Parameters
    ----------
    fingerprints_1_bits
        List of 1D numpy arrays of sorted bit indices (unique) for the first set of fingerprints.
    fingerprints_1_vals
        List of 1D numpy arrays of counts/weights for the first set of fingerprints.
    fingerprints_2_bits
        List of 1D numpy arrays of sorted bit indices (unique) for the second set of fingerprints.
    fingerprints_2_vals
        List of 1D numpy arrays of counts/weights for the second set of fingerprints.

    Returns
    -------
    np.ndarray
        float32 array of shape (len(fingerprints_1_bits), len(fingerprints_2_bits)).
        Entry [i, j] is the result of ``tanimoto_similarity_sparse`` for
        fingerprint i of the first set and fingerprint j of the second set.
    """
    # The matrix dimensions are taken from the bit lists; the value lists are
    # indexed with the same i / j (presumably one entry per fingerprint --
    # this is not validated here).
    R = len(fingerprints_1_bits)
    Q = len(fingerprints_2_bits)
    out = np.empty((R, Q), dtype=np.float32)
    # Each output row is written by exactly one outer iteration, so the
    # outer loop can be distributed with prange.
    for i in numba.prange(R):
        for j in range(Q):
            out[i, j] = tanimoto_similarity_sparse(
                fingerprints_1_bits[i], fingerprints_1_vals[i],
                fingerprints_2_bits[j], fingerprints_2_vals[j],
            )
    return out

Expand Down
3 changes: 3 additions & 0 deletions chemap/plotting/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .chem_space_umap import create_chem_space_umap, create_chem_space_umap_gpu
from .cleveland import ClevelandStyle, cleveland_dotplot
from .colormap_handling import (
LabelMapConfig,
PaletteConfig,
Expand All @@ -19,12 +20,14 @@


__all__ = [
"ClevelandStyle",
"LabelMapConfig",
"PaletteConfig",
"PresentPairsConfig",
"build_hier_label_map",
"build_selected_label_column",
"build_selected_palette",
"cleveland_dotplot",
"create_chem_space_umap",
"create_chem_space_umap_gpu",
"make_hier_palette",
Expand Down
7 changes: 2 additions & 5 deletions chemap/plotting/chem_space_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,9 @@ def create_chem_space_umap_gpu(
fpgen: Optional[Any] = None,
fingerprint_config: Optional[FingerprintConfig] = None,
show_progress: bool = True,
log_count: bool = True,
log_count: bool = False,
# UMAP (GPU / cuML)
n_neighbors: int = 15,
n_neighbors: int = 100,
min_dist: float = 0.25,
) -> pd.DataFrame:
"""Compute fingerprints and create 2D UMAP coordinates using cuML (GPU).
Expand Down Expand Up @@ -221,9 +221,6 @@ def create_chem_space_umap_gpu(
show_progress=show_progress,
)

# Convert to sparse array
# fps_csr = fingerprints_to_csr(fingerprints).X

# Reduce memory footprint (works well for count fingerprints)
if not log_count:
# stays integer-like
Expand Down
Loading