From 3894b18b7b3acfc11d37a131669efcf27e8790a8 Mon Sep 17 00:00:00 2001 From: Luca Dovichi Date: Sat, 15 Nov 2025 15:32:54 -0600 Subject: [PATCH 1/4] [ndm] added warnings regarding inconsistent random_state usage --- src/netml/ndm/ae.py | 8 ++++++-- src/netml/ndm/kde.py | 8 ++++++-- src/netml/ndm/model.py | 10 ++++++++-- src/netml/ndm/ocsvm.py | 10 ++++++---- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/netml/ndm/ae.py b/src/netml/ndm/ae.py index 10ffa39..cc86a96 100644 --- a/src/netml/ndm/ae.py +++ b/src/netml/ndm/ae.py @@ -71,7 +71,7 @@ def __init__(self, epochs=100, batch_size=32, lr=1e-3, loss='mse', dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1, - verbose=1, random_state=42, contamination=0.1, hid_dim=16, lat_dim=8): + verbose=1, contamination=0.1, hid_dim=16, lat_dim=8, **kwargs): """AutoEncoder Parameters @@ -120,7 +120,6 @@ def __init__(self, epochs=100, batch_size=32, lr=1e-3, self.l2_regularizer = l2_regularizer self.validation_size = validation_size self.verbose = verbose - self.random_state = random_state self.lr = lr self.contamination = contamination self.hid_dim = hid_dim @@ -128,6 +127,11 @@ def __init__(self, epochs=100, batch_size=32, lr=1e-3, check_parameter(dropout_rate, 0, 1, param_name='dropout_rate', include_left=True) + if "random_state" in kwargs and verbose > 5: + print( + "Warning: 'random_state' passed to AutoEncoder. Use torch.manual_seed() instead." + ) + if self.loss == 'mse' or (not self.loss): self.criterion = nn.MSELoss() diff --git a/src/netml/ndm/kde.py b/src/netml/ndm/kde.py index 8f4655e..45ccb4f 100644 --- a/src/netml/ndm/kde.py +++ b/src/netml/ndm/kde.py @@ -16,7 +16,7 @@ class KDE(KernelDensity, BaseDetector): def __init__(self, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric="euclidean", atol=0, rtol=0, contamination=0.1, - breadth_first=True, leaf_size=40, metric_params=None, random_state=42): + breadth_first=True, leaf_size=40, metric_params=None, verbose=0, **kwargs): """Kernel density estimation (KDE) Parameters ---------- @@ -63,7 +63,11 @@ def __init__(self, bandwidth=1.0, algorithm='auto', self.leaf_size = leaf_size self.metric_params = metric_params self.contamination = contamination - self.random_state = random_state + + if "random_state" in kwargs and verbose > 5: + print( + "Warning: 'random_state' passed to KDE has no effect." + ) # run the choose algorithm code so that exceptions will happen here # we're using clone() in the GenerativeBayes classifier, diff --git a/src/netml/ndm/model.py b/src/netml/ndm/model.py index a4a5532..8df29c0 100644 --- a/src/netml/ndm/model.py +++ b/src/netml/ndm/model.py @@ -13,7 +13,7 @@ class MODEL: - def __init__(self, model=None, *, score_metric='auc', verbose=1, random_state=42): + def __init__(self, model=None, *, score_metric='auc', verbose=1, **kwargs): """Train and test a model on a given data. Parameters @@ -40,10 +40,16 @@ def __init__(self, model=None, *, score_metric='auc', verbose=1, random_state=42 self.model_name = model.name self.score_metric = score_metric self.verbose = verbose - self.random_state = random_state # store all data generated during training and testing the model. self.history = {} + if "random_state" in kwargs: + self.random_state = kwargs["random_state"] + if verbose > 5: + print("Warning: setting random_state for a model wrapper doesn't affect the underlying predictions.") + else: + self.random_state = 42 + @timing def _train(self, X_train, y_train=None): """fit the model on the train set diff --git a/src/netml/ndm/ocsvm.py b/src/netml/ndm/ocsvm.py index bc4bcc6..c98d250 100644 --- a/src/netml/ndm/ocsvm.py +++ b/src/netml/ndm/ocsvm.py @@ -12,7 +12,7 @@ class OCSVM(OneClassSVM): def __init__(self, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, - verbose=False, max_iter=-1, random_state=100): + verbose=False, max_iter=-1, **kwargs): """One Class SVM (OCSVM) Parameters @@ -49,10 +49,13 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', verbose: bool (default is False) Enable verbose output. + """ - random_state: int (default is 42) + if "random_state" in kwargs and verbose > 5: + print( + "Warning: 'random_state' passed to OCSVM has no effect." + ) - """ super(OCSVM, self).__init__( kernel=kernel, degree=degree, @@ -66,7 +69,6 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', max_iter=max_iter, ) - self.random_state = random_state self.verbose = verbose # override decision_function. because test and grid_search will use decision_function first From 245741817ba02d57aefa543b10cea9063129310c Mon Sep 17 00:00:00 2001 From: Luca Dovichi Date: Sat, 15 Nov 2025 15:40:51 -0600 Subject: [PATCH 2/4] clean up random_state documentation within source files --- src/netml/ndm/ae.py | 4 +--- src/netml/ndm/gmm.py | 2 ++ src/netml/ndm/kde.py | 2 +- src/netml/ndm/model.py | 5 ----- src/netml/ndm/ocsvm.py | 2 +- src/netml/ndm/pca.py | 4 ++-- 6 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/netml/ndm/ae.py b/src/netml/ndm/ae.py index cc86a96..b8c0e00 100644 --- a/src/netml/ndm/ae.py +++ b/src/netml/ndm/ae.py @@ -109,9 +109,7 @@ def __init__(self, epochs=100, batch_size=32, lr=1e-3, verbose: int (default is 1) A print level is to control what information should be printed according to the given value. The higher the value is, the more info is printed. - - random_state: int (default is 42) - + """ self.epochs = epochs self.batch_size = batch_size diff --git a/src/netml/ndm/gmm.py b/src/netml/ndm/gmm.py index cba4278..724eda2 100644 --- a/src/netml/ndm/gmm.py +++ b/src/netml/ndm/gmm.py @@ -71,6 +71,8 @@ def __init__(self, n_components=1, covariance_type='full', tol=1e-3, contamination: float (default is 0.1) It's in range (0,1). A threshold used to decide the normal score (not used). + random_state: int (default is 42) + """ self.n_components = n_components self.covariance_type = covariance_type diff --git a/src/netml/ndm/kde.py b/src/netml/ndm/kde.py index 45ccb4f..d670597 100644 --- a/src/netml/ndm/kde.py +++ b/src/netml/ndm/kde.py @@ -66,7 +66,7 @@ def __init__(self, bandwidth=1.0, algorithm='auto', if "random_state" in kwargs and verbose > 5: print( - "Warning: 'random_state' passed to KDE has no effect." + "Warning: argument 'random_state' passed to KDE has no effect." ) # run the choose algorithm code so that exceptions will happen here diff --git a/src/netml/ndm/model.py b/src/netml/ndm/model.py index 8df29c0..72d108f 100644 --- a/src/netml/ndm/model.py +++ b/src/netml/ndm/model.py @@ -28,9 +28,6 @@ def __init__(self, model=None, *, score_metric='auc', verbose=1, **kwargs): a print level is to control what information should be printed according to the given value. The higher the value is, the more info is printed. - random_state: int - a value is to make your experiments more reproducible. - Returns ------- a MODEL instance @@ -47,8 +44,6 @@ def __init__(self, model=None, *, score_metric='auc', verbose=1, **kwargs): self.random_state = kwargs["random_state"] if verbose > 5: print("Warning: setting random_state for a model wrapper doesn't affect the underlying predictions.") - else: - self.random_state = 42 @timing def _train(self, X_train, y_train=None): diff --git a/src/netml/ndm/ocsvm.py b/src/netml/ndm/ocsvm.py index c98d250..00bbc4b 100644 --- a/src/netml/ndm/ocsvm.py +++ b/src/netml/ndm/ocsvm.py @@ -53,7 +53,7 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', if "random_state" in kwargs and verbose > 5: print( - "Warning: 'random_state' passed to OCSVM has no effect." + "Warning: argument 'random_state' passed to OCSVM has no effect." ) super(OCSVM, self).__init__( diff --git a/src/netml/ndm/pca.py b/src/netml/ndm/pca.py index 53aa3ee..d505366 100644 --- a/src/netml/ndm/pca.py +++ b/src/netml/ndm/pca.py @@ -14,7 +14,7 @@ class PCA(BaseDetector): def __init__(self, n_components=None, n_selected_components=None, contamination=0.1, copy=True, whiten=False, svd_solver='auto', - tol=0.0, iterated_power='auto', random_state=None, + tol=0.0, iterated_power='auto', random_state=42, weighted=True, standardization=True): """Principal component analysis (PCA) @@ -48,7 +48,7 @@ def __init__(self, n_components=None, n_selected_components=None, Number of iterations for the power method computed by svd_solver == 'randomized'. - random_state : int + random_state: int (default is 42) weighted : bool, optional (default=True) If True, the eigenvalues are used in score computation. From 3900e4e6d6e55252bf7938e7cdff51177f778dc3 Mon Sep 17 00:00:00 2001 From: Luca Dovichi Date: Sat, 15 Nov 2025 15:49:04 -0600 Subject: [PATCH 3/4] add expected verbose arg to PCA --- src/netml/ndm/pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/netml/ndm/pca.py b/src/netml/ndm/pca.py index d505366..821fbf6 100644 --- a/src/netml/ndm/pca.py +++ b/src/netml/ndm/pca.py @@ -15,7 +15,7 @@ class PCA(BaseDetector): def __init__(self, n_components=None, n_selected_components=None, contamination=0.1, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=42, - weighted=True, standardization=True): + weighted=True, standardization=True, verbose=1): """Principal component analysis (PCA) Parameters @@ -70,6 +70,7 @@ def __init__(self, n_components=None, n_selected_components=None, self.weighted = weighted self.standardization = standardization self.score_name = "reconstructed" # the way to obtain outlier scores + self.verbose = verbose self.contamination = contamination From 1ab3ea688c8c0bd833767e999b00ff1c2af9909a Mon Sep 17 00:00:00 2001 From: Luca Dovichi Date: Sat, 15 Nov 2025 16:07:53 -0600 Subject: [PATCH 4/4] ensure that verbose defaults to 0 for signatures where it's recently been added --- src/netml/ndm/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/netml/ndm/pca.py b/src/netml/ndm/pca.py index 821fbf6..68f8915 100644 --- a/src/netml/ndm/pca.py +++ b/src/netml/ndm/pca.py @@ -15,7 +15,7 @@ class PCA(BaseDetector): def __init__(self, n_components=None, n_selected_components=None, contamination=0.1, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=42, - weighted=True, standardization=True, verbose=1): + weighted=True, standardization=True, verbose=0): """Principal component analysis (PCA) Parameters