From 6c6a9f8749ba5d1b6b95d5a192bc387c3d64cc54 Mon Sep 17 00:00:00 2001 From: Julius Schwartz Date: Thu, 19 Aug 2021 20:33:12 +0100 Subject: [PATCH 01/12] new file --- GP/random_walk.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 GP/random_walk.py diff --git a/GP/random_walk.py b/GP/random_walk.py new file mode 100644 index 0000000..a2e83c4 --- /dev/null +++ b/GP/random_walk.py @@ -0,0 +1,74 @@ +# Author: Henry Moss & Ryan-Rhys Griffiths +""" +Molecule kernels for Gaussian Process Regression implemented in GPflow. +""" + +import gpflow +import tensorflow as tf + + +class RandomWalk(gpflow.kernels.Kernel): + def __init__(self, uniform_probabilities=False, geometric=True): + super().__init__() + self.uniform_probabilities=uniform_probabilities + self.geometric = geometric + + def K(self, X, X2=None): + if X2 is None: + X2 = X + + X_is_X2 = X == X2 + + eigenvals, eigenvecs = tf.linalg.eigh(X) + flanking_factors = self._generate_flanking_factors(eigenvecs) + + if X_is_X2: + eigenvals_2, eigenvecs_2 = eigenvals, eigenvecs + flanking_factors_2 = flanking_factors + else: + eigenvals_2, eigenvecs_2 = tf.linalg.eigh(X2) + flanking_factors_2 = self._generate_flanking_factors(eigenvecs_2) + + k_matrix = tf.zeros((X.shape[0], X2.shape[0])) + + for i in k_matrix.shape[0]: + for j in k_matrix.shape[1]: + + if X_is_X2 and j < i: + k_matrix[i, j] = k_matrix[j, i] + continue + + flanking_factor = tf.linalg.LinearOperatorKronecker(flanking_factors[i], flanking_factors_2[j]) + diagonal = tf.linalg.LinearOperatorKronecker(eigenvecs[i], eigenvecs_2[j]) + + if self.geometric: + power_series = tf.linalg.diag(1 / 1 - diagonal) + else: + power_series = tf.linalg.diag(tf.exp(diagonal)) + + k_matrix[i, j] = tf.linalg.matmul( + flanking_factor, + tf.linalg.matmul( + power_series, + tf.transpose(flanking_factor, perm=[-2, -1]) + ) + ) + + return k_matrix + + def K_diag(self, X): + return tf.linalg.diag(X) ## TODO: implement + + def 
_generate_flanking_factors(self, eigenvecs): + flanking_factors = [] + + for eigenvec in eigenvecs: + start_stop_probs = tf.ones((1, eigenvec.shape[0])) + if self.uniform_probabilities: + start_stop_probs = tf.divide(start_stop_probs, eigenvec.shape(0)) + + flanking_factors.append( + tf.linalg.matmul(start_stop_probs, eigenvec) + ) + + return flanking_factors From edd15f11a14830b33b3bca1c352ad2ef24abb28b Mon Sep 17 00:00:00 2001 From: Julius Schwartz Date: Thu, 19 Aug 2021 20:38:32 +0100 Subject: [PATCH 02/12] fix new dir location --- GP/{ => kernels}/random_walk.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename GP/{ => kernels}/random_walk.py (100%) diff --git a/GP/random_walk.py b/GP/kernels/random_walk.py similarity index 100% rename from GP/random_walk.py rename to GP/kernels/random_walk.py From c4f49c50580a48872d221ebbb1061ca2d4dd8bf2 Mon Sep 17 00:00:00 2001 From: Julius Date: Sun, 5 Sep 2021 15:40:38 +0100 Subject: [PATCH 03/12] modified sys.path in test file --- GP/__init__.py | 0 GP/kernels/__init__.py | 0 GP/kernels/random_walk.py | 7 ++++--- tests/__init__.py | 0 tests/kernels/__init__.py | 0 tests/kernels/random_walk.py | 29 +++++++++++++++++++++++++++++ 6 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 GP/__init__.py create mode 100644 GP/kernels/__init__.py create mode 100644 tests/__init__.py create mode 100644 tests/kernels/__init__.py create mode 100644 tests/kernels/random_walk.py diff --git a/GP/__init__.py b/GP/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GP/kernels/__init__.py b/GP/kernels/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GP/kernels/random_walk.py b/GP/kernels/random_walk.py index a2e83c4..3e3d5b3 100644 --- a/GP/kernels/random_walk.py +++ b/GP/kernels/random_walk.py @@ -8,10 +8,11 @@ class RandomWalk(gpflow.kernels.Kernel): - def __init__(self, uniform_probabilities=False, geometric=True): + def __init__(self, uniform_probabilities=False, geometric=True, 
weight=0.1): super().__init__() self.uniform_probabilities=uniform_probabilities self.geometric = geometric + self.weight = weight def K(self, X, X2=None): if X2 is None: @@ -39,7 +40,7 @@ def K(self, X, X2=None): continue flanking_factor = tf.linalg.LinearOperatorKronecker(flanking_factors[i], flanking_factors_2[j]) - diagonal = tf.linalg.LinearOperatorKronecker(eigenvecs[i], eigenvecs_2[j]) + diagonal = self.weight * tf.linalg.LinearOperatorKronecker(eigenvecs[i], eigenvecs_2[j]) if self.geometric: power_series = tf.linalg.diag(1 / 1 - diagonal) @@ -57,7 +58,7 @@ def K(self, X, X2=None): return k_matrix def K_diag(self, X): - return tf.linalg.diag(X) ## TODO: implement + return tf.linalg.tensor_diag_part(self.K(X)) def _generate_flanking_factors(self, eigenvecs): flanking_factors = [] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/kernels/__init__.py b/tests/kernels/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/kernels/random_walk.py b/tests/kernels/random_walk.py new file mode 100644 index 0000000..83f978a --- /dev/null +++ b/tests/kernels/random_walk.py @@ -0,0 +1,29 @@ +# Author: Henry Moss & Ryan-Rhys Griffiths +""" +Molecule kernels for Gaussian Process Regression implemented in GPflow. 
+""" + +import gpflow +import tensorflow as tf +import grakel +import pytest +from rdkit.Chem import MolFromSmiles +from rdkit.Chem.rdmolops import GetAdjacencyMatrix +import sys + +sys.path.append('/Users/juliusschwartz/Mystuff/FlowMO') + +from GP.kernels.random_walk import RandomWalk + +test_smiles = [] +test_dataset_file = '../../smiles_enumeration/enumerated_datasets/ESOL/X_test_split_aug_x2_split_0.txt' +lines = open(test_dataset_file).readlines() +for line in open(test_dataset_file).readlines(): + test_smiles.append(line.strip('\n')) + +adj_mats = [GetAdjacencyMatrix(MolFromSmiles(smiles)) for smiles in test_smiles] +tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] +grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] + +random_walk_grakel = grakel.kernels.RandomWalk() +grakel_results = random_walk_grakel.fit_transform(grakel_graphs) From 783f2099942fc0fbdb360a2bc3c01fd304e24fdd Mon Sep 17 00:00:00 2001 From: Julius Date: Wed, 29 Sep 2021 18:16:24 +0100 Subject: [PATCH 04/12] pass basic tests --- GP/{kernels => kernel_modules}/__init__.py | 0 GP/kernel_modules/kernel_utils.py | 14 +++ GP/kernel_modules/random_walk.py | 102 +++++++++++++++++++++ GP/kernels/random_walk.py | 75 --------------- tests/kernels/random_walk.py | 29 ------ tests/kernels/test_random_walk.py | 55 +++++++++++ 6 files changed, 171 insertions(+), 104 deletions(-) rename GP/{kernels => kernel_modules}/__init__.py (100%) create mode 100644 GP/kernel_modules/kernel_utils.py create mode 100644 GP/kernel_modules/random_walk.py delete mode 100644 GP/kernels/random_walk.py delete mode 100644 tests/kernels/random_walk.py create mode 100644 tests/kernels/test_random_walk.py diff --git a/GP/kernels/__init__.py b/GP/kernel_modules/__init__.py similarity index 100% rename from GP/kernels/__init__.py rename to GP/kernel_modules/__init__.py diff --git a/GP/kernel_modules/kernel_utils.py b/GP/kernel_modules/kernel_utils.py new file mode 100644 index 0000000..9899b15 
--- /dev/null +++ b/GP/kernel_modules/kernel_utils.py @@ -0,0 +1,14 @@ +# Author: Henry Moss & Ryan-Rhys Griffiths +""" +Molecule kernels for Gaussian Process Regression implemented in GPflow. +""" + +import tensorflow as tf + + +def normalize(k_matrix): + k_matrix_diagonal = tf.linalg.diag_part(k_matrix) + squared_normalization_factor = tf.multiply(tf.expand_dims(k_matrix_diagonal, 1), + tf.expand_dims(k_matrix_diagonal, 0)) + + return tf.divide(k_matrix, tf.sqrt(squared_normalization_factor)) diff --git a/GP/kernel_modules/random_walk.py b/GP/kernel_modules/random_walk.py new file mode 100644 index 0000000..549217a --- /dev/null +++ b/GP/kernel_modules/random_walk.py @@ -0,0 +1,102 @@ +# Author: Henry Moss & Ryan-Rhys Griffiths +""" +Molecule kernels for Gaussian Process Regression implemented in GPflow. +""" + +import gpflow +import numpy as np +import tensorflow as tf + +from .kernel_utils import normalize + + +class RandomWalk(gpflow.kernels.Kernel): + def __init__(self, uniform_probabilities=False, series_type='geometric', weight=0.1, normalize=True): + super().__init__() + self.uniform_probabilities=uniform_probabilities + if series_type == 'geometric': + self.geometric = True + elif series_type == 'exponential': + self.geometric = False + self.weight = weight + self.normalize = normalize + + def K(self, X, X2=None): + if X2 is None: + X2 = X + + X_is_X2 = X == X2 + + eigenvecs, eigenvals = [], [] + eigenvecs_2, eigenvals_2 = [], [] + + for adj_mat in X: + val, vec = tf.linalg.eigh(tf.cast(adj_mat, tf.float64)) + eigenvals.append(val) + eigenvecs.append(vec) + + flanking_factors = self._generate_flanking_factors(eigenvecs) + + if X_is_X2: + eigenvals_2, eigenvecs_2 = eigenvals, eigenvecs + flanking_factors_2 = flanking_factors + else: + for adj_mat in X2: + val, vec = tf.linalg.eigh(tf.cast(adj_mat, tf.float64)) + eigenvals_2.append(val) + eigenvecs_2.append(vec) + flanking_factors_2 = self._generate_flanking_factors(eigenvecs_2) + + k_matrix = 
np.zeros((len(X), len(X2))) + + for idx_1 in range(k_matrix.shape[0]): + for idx_2 in range(k_matrix.shape[1]): + + if X_is_X2 and idx_2 < idx_1: + k_matrix[idx_1, idx_2] = k_matrix[idx_2, idx_1] + continue + + flanking_factor = tf.linalg.LinearOperatorKronecker( + [tf.linalg.LinearOperatorFullMatrix(flanking_factors[idx_1]), + tf.linalg.LinearOperatorFullMatrix(flanking_factors_2[idx_2]) + ]).to_dense() + + diagonal = self.weight * tf.linalg.LinearOperatorKronecker( + [tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenvals[idx_1], axis=0)), + tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenvals_2[idx_2], axis=0)) + ]).to_dense() + + if self.geometric: + power_series = tf.linalg.diag(1 / (1 - diagonal)) + else: + power_series = tf.linalg.diag(tf.exp(diagonal)) + + k_matrix[idx_1, idx_2] = tf.linalg.matmul( + flanking_factor, + tf.linalg.matmul( + power_series, + tf.transpose(flanking_factor, perm=[1, 0]) + ) + ) + + if self.normalize: + return tf.convert_to_tensor(normalize(k_matrix)) + + return tf.convert_to_tensor(k_matrix) + + def K_diag(self, X): + return tf.linalg.tensor_diag_part(self.K(X)) + + def _generate_flanking_factors(self, eigenvecs): + flanking_factors = [] + + for eigenvec in eigenvecs: + start_stop_probs = tf.ones((1, eigenvec.shape[0]), tf.float64) + if self.uniform_probabilities: + start_stop_probs = tf.divide(start_stop_probs, eigenvec.shape(0)) + + flanking_factors.append( + tf.linalg.matmul(start_stop_probs, eigenvec) + ) + + return flanking_factors diff --git a/GP/kernels/random_walk.py b/GP/kernels/random_walk.py deleted file mode 100644 index 3e3d5b3..0000000 --- a/GP/kernels/random_walk.py +++ /dev/null @@ -1,75 +0,0 @@ -# Author: Henry Moss & Ryan-Rhys Griffiths -""" -Molecule kernels for Gaussian Process Regression implemented in GPflow. 
-""" - -import gpflow -import tensorflow as tf - - -class RandomWalk(gpflow.kernels.Kernel): - def __init__(self, uniform_probabilities=False, geometric=True, weight=0.1): - super().__init__() - self.uniform_probabilities=uniform_probabilities - self.geometric = geometric - self.weight = weight - - def K(self, X, X2=None): - if X2 is None: - X2 = X - - X_is_X2 = X == X2 - - eigenvals, eigenvecs = tf.linalg.eigh(X) - flanking_factors = self._generate_flanking_factors(eigenvecs) - - if X_is_X2: - eigenvals_2, eigenvecs_2 = eigenvals, eigenvecs - flanking_factors_2 = flanking_factors - else: - eigenvals_2, eigenvecs_2 = tf.linalg.eigh(X2) - flanking_factors_2 = self._generate_flanking_factors(eigenvecs_2) - - k_matrix = tf.zeros((X.shape[0], X2.shape[0])) - - for i in k_matrix.shape[0]: - for j in k_matrix.shape[1]: - - if X_is_X2 and j < i: - k_matrix[i, j] = k_matrix[j, i] - continue - - flanking_factor = tf.linalg.LinearOperatorKronecker(flanking_factors[i], flanking_factors_2[j]) - diagonal = self.weight * tf.linalg.LinearOperatorKronecker(eigenvecs[i], eigenvecs_2[j]) - - if self.geometric: - power_series = tf.linalg.diag(1 / 1 - diagonal) - else: - power_series = tf.linalg.diag(tf.exp(diagonal)) - - k_matrix[i, j] = tf.linalg.matmul( - flanking_factor, - tf.linalg.matmul( - power_series, - tf.transpose(flanking_factor, perm=[-2, -1]) - ) - ) - - return k_matrix - - def K_diag(self, X): - return tf.linalg.tensor_diag_part(self.K(X)) - - def _generate_flanking_factors(self, eigenvecs): - flanking_factors = [] - - for eigenvec in eigenvecs: - start_stop_probs = tf.ones((1, eigenvec.shape[0])) - if self.uniform_probabilities: - start_stop_probs = tf.divide(start_stop_probs, eigenvec.shape(0)) - - flanking_factors.append( - tf.linalg.matmul(start_stop_probs, eigenvec) - ) - - return flanking_factors diff --git a/tests/kernels/random_walk.py b/tests/kernels/random_walk.py deleted file mode 100644 index 83f978a..0000000 --- a/tests/kernels/random_walk.py +++ /dev/null 
@@ -1,29 +0,0 @@ -# Author: Henry Moss & Ryan-Rhys Griffiths -""" -Molecule kernels for Gaussian Process Regression implemented in GPflow. -""" - -import gpflow -import tensorflow as tf -import grakel -import pytest -from rdkit.Chem import MolFromSmiles -from rdkit.Chem.rdmolops import GetAdjacencyMatrix -import sys - -sys.path.append('/Users/juliusschwartz/Mystuff/FlowMO') - -from GP.kernels.random_walk import RandomWalk - -test_smiles = [] -test_dataset_file = '../../smiles_enumeration/enumerated_datasets/ESOL/X_test_split_aug_x2_split_0.txt' -lines = open(test_dataset_file).readlines() -for line in open(test_dataset_file).readlines(): - test_smiles.append(line.strip('\n')) - -adj_mats = [GetAdjacencyMatrix(MolFromSmiles(smiles)) for smiles in test_smiles] -tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] -grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] - -random_walk_grakel = grakel.kernels.RandomWalk() -grakel_results = random_walk_grakel.fit_transform(grakel_graphs) diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py new file mode 100644 index 0000000..bf9fd45 --- /dev/null +++ b/tests/kernels/test_random_walk.py @@ -0,0 +1,55 @@ +# Author: Henry Moss & Ryan-Rhys Griffiths +""" +Molecule kernels for Gaussian Process Regression implemented in GPflow. 
+""" + +import sys + +import grakel +import numpy.testing as npt +import pandas as pd +import pytest +import tensorflow as tf +from rdkit.Chem import MolFromSmiles +from rdkit.Chem.rdmolops import GetAdjacencyMatrix + +from GP.kernel_modules.random_walk import RandomWalk + +sys.path.append('/Users/juliusschwartz/Mystuff/FlowMO') + +@pytest.fixture +def load_data(): + test_dataset_file = '../../datasets/ESOL.csv' + df = pd.read_csv(test_dataset_file) + return df["smiles"].to_list() + + +@pytest.mark.parametrize( + 'weight, series_type, p', + [ + (0.1, 'geometric', None), + (0.1, 'exponential', None), + #(0.3, 'geometric', None), TODO: Fix + (0.5, 'exponential', None), + #(0.2, 'geometric', 3), TODO: Implement + #(0.8, 'exponential', 3), TODO: Implement + ] +) +def test_random_walk_unlabelled(weight, series_type, p, load_data): + adj_mats = [GetAdjacencyMatrix(MolFromSmiles(smiles)) for smiles in load_data[:50]] + + tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] + grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] + + random_walk_grakel = grakel.kernels.RandomWalk(normalize=False, lamda=weight, kernel_type=series_type) + grakel_results = random_walk_grakel.fit_transform(grakel_graphs) + + random_walk_FlowMo = RandomWalk(series_type=series_type, weight=weight, normalize=False) + FlowMo_results = random_walk_FlowMo.K(tensor_adj_mats, tensor_adj_mats) + + import pdb; pdb.set_trace() + + npt.assert_almost_equal( + grakel_results, FlowMo_results.numpy(), + decimal=2 + ) From d608c16507afc2a18f14941908c8a18564595d4d Mon Sep 17 00:00:00 2001 From: Julius Date: Wed, 29 Sep 2021 18:34:26 +0100 Subject: [PATCH 05/12] remove debugger --- tests/kernels/test_random_walk.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py index bf9fd45..d116946 100644 --- a/tests/kernels/test_random_walk.py +++ b/tests/kernels/test_random_walk.py @@ -47,8 +47,6 @@ def 
test_random_walk_unlabelled(weight, series_type, p, load_data): random_walk_FlowMo = RandomWalk(series_type=series_type, weight=weight, normalize=False) FlowMo_results = random_walk_FlowMo.K(tensor_adj_mats, tensor_adj_mats) - import pdb; pdb.set_trace() - npt.assert_almost_equal( grakel_results, FlowMo_results.numpy(), decimal=2 From d16f465f453a65e535de0912e84e7126f56d2f2a Mon Sep 17 00:00:00 2001 From: Julius Date: Wed, 29 Sep 2021 19:48:13 +0100 Subject: [PATCH 06/12] change method_type --- tests/kernels/test_random_walk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py index d116946..82cad03 100644 --- a/tests/kernels/test_random_walk.py +++ b/tests/kernels/test_random_walk.py @@ -29,8 +29,8 @@ def load_data(): [ (0.1, 'geometric', None), (0.1, 'exponential', None), - #(0.3, 'geometric', None), TODO: Fix - (0.5, 'exponential', None), + (0.3, 'geometric', None), #NB: requires `method_type="baseline" in grakel kernel constructor + (0.3, 'exponential', None), #(0.2, 'geometric', 3), TODO: Implement #(0.8, 'exponential', 3), TODO: Implement ] @@ -41,10 +41,10 @@ def test_random_walk_unlabelled(weight, series_type, p, load_data): tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] - random_walk_grakel = grakel.kernels.RandomWalk(normalize=False, lamda=weight, kernel_type=series_type) + random_walk_grakel = grakel.kernels.RandomWalk(normalize=True, lamda=weight, kernel_type=series_type, method_type="baseline") grakel_results = random_walk_grakel.fit_transform(grakel_graphs) - random_walk_FlowMo = RandomWalk(series_type=series_type, weight=weight, normalize=False) + random_walk_FlowMo = RandomWalk(series_type=series_type, weight=weight, normalize=True) FlowMo_results = random_walk_FlowMo.K(tensor_adj_mats, tensor_adj_mats) npt.assert_almost_equal( From 
7e7cc5c6d4075b7f8f8ec1fa8009fa9fff6453d1 Mon Sep 17 00:00:00 2001 From: Julius Date: Fri, 1 Oct 2021 09:19:24 +0100 Subject: [PATCH 07/12] remove local file path --- tests/kernels/test_random_walk.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py index 82cad03..130b2a3 100644 --- a/tests/kernels/test_random_walk.py +++ b/tests/kernels/test_random_walk.py @@ -3,7 +3,7 @@ Molecule kernels for Gaussian Process Regression implemented in GPflow. """ -import sys +import os import grakel import numpy.testing as npt @@ -15,13 +15,21 @@ from GP.kernel_modules.random_walk import RandomWalk -sys.path.append('/Users/juliusschwartz/Mystuff/FlowMO') - @pytest.fixture def load_data(): - test_dataset_file = '../../datasets/ESOL.csv' - df = pd.read_csv(test_dataset_file) - return df["smiles"].to_list() + benchmark_path = os.path.abspath( + os.path.join( + os.getcwd(), '..', '..', 'datasets', 'ESOL.csv' + ) + ) + df = pd.read_csv(benchmark_path) + smiles = df["smiles"].to_list() + + adj_mats = [GetAdjacencyMatrix(MolFromSmiles(smiles)) for smiles in smiles[:50]] + tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] + grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] + + return tensor_adj_mats, grakel_graphs @pytest.mark.parametrize( @@ -36,10 +44,7 @@ def load_data(): ] ) def test_random_walk_unlabelled(weight, series_type, p, load_data): - adj_mats = [GetAdjacencyMatrix(MolFromSmiles(smiles)) for smiles in load_data[:50]] - - tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] - grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] + tensor_adj_mats, grakel_graphs = load_data random_walk_grakel = grakel.kernels.RandomWalk(normalize=True, lamda=weight, kernel_type=series_type, method_type="baseline") grakel_results = random_walk_grakel.fit_transform(grakel_graphs) From 
517e9351c32ebdf2e252b60e863fced2ce4f6970 Mon Sep 17 00:00:00 2001 From: Julius Date: Sat, 2 Oct 2021 21:48:48 +0100 Subject: [PATCH 08/12] implement finite length random walks --- GP/kernel_modules/random_walk.py | 23 +++++++++++++++++++---- tests/kernels/test_random_walk.py | 10 +++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/GP/kernel_modules/random_walk.py b/GP/kernel_modules/random_walk.py index 549217a..373bd7b 100644 --- a/GP/kernel_modules/random_walk.py +++ b/GP/kernel_modules/random_walk.py @@ -7,13 +7,16 @@ import numpy as np import tensorflow as tf +from math import factorial + from .kernel_utils import normalize class RandomWalk(gpflow.kernels.Kernel): - def __init__(self, uniform_probabilities=False, series_type='geometric', weight=0.1, normalize=True): + def __init__(self, uniform_probabilities=False, p=None, series_type='geometric', weight=0.1, normalize=True): super().__init__() self.uniform_probabilities=uniform_probabilities + self.p = p if series_type == 'geometric': self.geometric = True elif series_type == 'exponential': @@ -66,10 +69,22 @@ def K(self, X, X2=None): tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenvals_2[idx_2], axis=0)) ]).to_dense() - if self.geometric: - power_series = tf.linalg.diag(1 / (1 - diagonal)) + if self.p is not None: + power_series = tf.ones_like(diagonal) + temp_diagonal = tf.ones_like(diagonal) + + for k in range(self.p): + temp_diagonal = tf.multiply(temp_diagonal, diagonal) + if not self.geometric: + temp_diagonal = tf.divide(temp_diagonal, factorial(k+1)) + power_series = tf.add(power_series, temp_diagonal) + + power_series = tf.linalg.diag(power_series) else: - power_series = tf.linalg.diag(tf.exp(diagonal)) + if self.geometric: + power_series = tf.linalg.diag(1 / (1 - diagonal)) + else: + power_series = tf.linalg.diag(tf.exp(diagonal)) k_matrix[idx_1, idx_2] = tf.linalg.matmul( flanking_factor, diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py 
index 130b2a3..814718e 100644 --- a/tests/kernels/test_random_walk.py +++ b/tests/kernels/test_random_walk.py @@ -37,19 +37,19 @@ def load_data(): [ (0.1, 'geometric', None), (0.1, 'exponential', None), - (0.3, 'geometric', None), #NB: requires `method_type="baseline" in grakel kernel constructor + #(0.3, 'geometric', None), #NB: requires `method_type="baseline" in grakel kernel constructor (0.3, 'exponential', None), - #(0.2, 'geometric', 3), TODO: Implement - #(0.8, 'exponential', 3), TODO: Implement + (0.3, 'geometric', 3), + (0.8, 'exponential', 3), ] ) def test_random_walk_unlabelled(weight, series_type, p, load_data): tensor_adj_mats, grakel_graphs = load_data - random_walk_grakel = grakel.kernels.RandomWalk(normalize=True, lamda=weight, kernel_type=series_type, method_type="baseline") + random_walk_grakel = grakel.kernels.RandomWalk(normalize=True, lamda=weight, kernel_type=series_type, p=p) grakel_results = random_walk_grakel.fit_transform(grakel_graphs) - random_walk_FlowMo = RandomWalk(series_type=series_type, weight=weight, normalize=True) + random_walk_FlowMo = RandomWalk(p=p, series_type=series_type, weight=weight, normalize=True) FlowMo_results = random_walk_FlowMo.K(tensor_adj_mats, tensor_adj_mats) npt.assert_almost_equal( From 1fcad8f52fefa5f6e219f98d084c6b8616e26218 Mon Sep 17 00:00:00 2001 From: Julius Date: Mon, 4 Oct 2021 18:35:47 +0100 Subject: [PATCH 09/12] added documentation, moved arguments around --- GP/kernel_modules/kernel_utils.py | 2 +- GP/kernel_modules/random_walk.py | 31 ++++++++++++++++++++++++++----- tests/kernels/test_random_walk.py | 11 ++++++----- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/GP/kernel_modules/kernel_utils.py b/GP/kernel_modules/kernel_utils.py index 9899b15..5cff6a0 100644 --- a/GP/kernel_modules/kernel_utils.py +++ b/GP/kernel_modules/kernel_utils.py @@ -1,6 +1,6 @@ # Author: Henry Moss & Ryan-Rhys Griffiths """ -Molecule kernels for Gaussian Process Regression implemented in GPflow. 
+Utility methods for graph-based kernels """ import tensorflow as tf diff --git a/GP/kernel_modules/random_walk.py b/GP/kernel_modules/random_walk.py index 373bd7b..937e8ec 100644 --- a/GP/kernel_modules/random_walk.py +++ b/GP/kernel_modules/random_walk.py @@ -13,18 +13,28 @@ class RandomWalk(gpflow.kernels.Kernel): - def __init__(self, uniform_probabilities=False, p=None, series_type='geometric', weight=0.1, normalize=True): + def __init__(self, normalize=True, weight=0.1, series_type='geometric', p=None, uniform_probabilities=False): super().__init__() - self.uniform_probabilities=uniform_probabilities - self.p = p + self.normalize = normalize + self.weight = weight if series_type == 'geometric': self.geometric = True elif series_type == 'exponential': self.geometric = False - self.weight = weight - self.normalize = normalize + self.p = p + self.uniform_probabilities = uniform_probabilities def K(self, X, X2=None): + """ + Compute the random walk graph kernel (Gartner et al. 2003), + specifically using the spectral decomposition approach + given by https://www.jmlr.org/papers/volume11/vishwanathan10a/vishwanathan10a.pdf + + :param X: array of N graph objects (represented as adjacency matrices of varying sizes) + :param X2: array of M graph objects (represented as adjacency matrices of varying sizes) + If None, compute the N x N kernel matrix for X. + :return: The kernel matrix of dimension N x M. + """ if X2 is None: X2 = X @@ -100,9 +110,20 @@ def K(self, X, X2=None): return tf.convert_to_tensor(k_matrix) def K_diag(self, X): + """ + Compute the diagonal of the N x N kernel matrix of X. + :param X: array of N graph objects (represented as adjacency matrices of varying sizes). + :return: N x 1 array. + """ return tf.linalg.tensor_diag_part(self.K(X)) def _generate_flanking_factors(self, eigenvecs): + """ + Helper method to calculate intermediate terms in the expression for random + walk kernel evaluated for two graphs. 
+ :param eigenvecs: array of N matrices of varying sizes + :return: array of N matrices of varying sizes + """ flanking_factors = [] for eigenvec in eigenvecs: diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py index 814718e..cf0b164 100644 --- a/tests/kernels/test_random_walk.py +++ b/tests/kernels/test_random_walk.py @@ -1,6 +1,7 @@ # Author: Henry Moss & Ryan-Rhys Griffiths """ -Molecule kernels for Gaussian Process Regression implemented in GPflow. +Verifies the FlowMO implementation of the Random Walk graph kernel +against GraKel """ import os @@ -37,10 +38,10 @@ def load_data(): [ (0.1, 'geometric', None), (0.1, 'exponential', None), - #(0.3, 'geometric', None), #NB: requires `method_type="baseline" in grakel kernel constructor + #(0.3, 'geometric', None), #Requires `method_type="baseline" in GraKel kernel constructor (0.3, 'exponential', None), - (0.3, 'geometric', 3), - (0.8, 'exponential', 3), + (0.3, 'geometric', 3), #Doesn't pass due to suspected GraKel bug, see https://github.com/ysig/GraKeL/issues/71 + (0.8, 'exponential', 3), #Same issue as above test ] ) def test_random_walk_unlabelled(weight, series_type, p, load_data): @@ -49,7 +50,7 @@ def test_random_walk_unlabelled(weight, series_type, p, load_data): random_walk_grakel = grakel.kernels.RandomWalk(normalize=True, lamda=weight, kernel_type=series_type, p=p) grakel_results = random_walk_grakel.fit_transform(grakel_graphs) - random_walk_FlowMo = RandomWalk(p=p, series_type=series_type, weight=weight, normalize=True) + random_walk_FlowMo = RandomWalk(normalize=True, weight=weight, series_type=series_type, p=p) FlowMo_results = random_walk_FlowMo.K(tensor_adj_mats, tensor_adj_mats) npt.assert_almost_equal( From eccb9185a52ef08f0d9f21173296ef3fe5e3e7a9 Mon Sep 17 00:00:00 2001 From: Julius Date: Thu, 14 Oct 2021 22:21:07 +0100 Subject: [PATCH 10/12] refactor to work with gpflow --- GP/kernel_modules/kernel_utils.py | 29 ++++ GP/kernel_modules/random_walk.py | 108 
++++++------- .../Gaussian_Process_RandomWalk_Kernel.ipynb | 152 ++++++++++++++++++ tests/kernels/test_random_walk.py | 8 +- 4 files changed, 239 insertions(+), 58 deletions(-) create mode 100644 examples/Gaussian_Process_RandomWalk_Kernel.ipynb diff --git a/GP/kernel_modules/kernel_utils.py b/GP/kernel_modules/kernel_utils.py index 5cff6a0..20f92e5 100644 --- a/GP/kernel_modules/kernel_utils.py +++ b/GP/kernel_modules/kernel_utils.py @@ -4,6 +4,7 @@ """ import tensorflow as tf +import numpy as np def normalize(k_matrix): @@ -12,3 +13,31 @@ def normalize(k_matrix): tf.expand_dims(k_matrix_diagonal, 0)) return tf.divide(k_matrix, tf.sqrt(squared_normalization_factor)) + + +def pad_tensor(tensor, target_dim): + return tf.pad(tensor, [[0, target_dim - tensor.shape[0]], [0, target_dim - tensor.shape[0]]], 'CONSTANT') + + +def pad_tensors(tensor_list): + max_dim = max(tensor_list, key=lambda x: x.shape[0]).shape[0] + return [pad_tensor(tensor, max_dim) for tensor in tensor_list] + + +def unpad_tensor(tensor): + mask = tf.reduce_sum(tensor, 0) != 0 + rows_unpadded = tf.boolean_mask(tensor, mask, axis=0) + fully_unpadded = tf.boolean_mask(rows_unpadded, mask, axis=1) + return fully_unpadded + + +def preprocess_adjacency_matrix_inputs(adj_mat_list): + padded_adj_mats = pad_tensors(adj_mat_list) + flattened_padded_adj_mats = tf.reshape(padded_adj_mats, (len(padded_adj_mats), padded_adj_mats[0].shape[0]**2)) + return flattened_padded_adj_mats + + +def extract_adj_mats_from_vector_inputs(preprocessed_data): + adj_mat_dim = int(np.sqrt(preprocessed_data.shape[1])) + rehydrated_adj_mats = tf.reshape(preprocessed_data, (len(preprocessed_data), adj_mat_dim, adj_mat_dim)) + return rehydrated_adj_mats diff --git a/GP/kernel_modules/random_walk.py b/GP/kernel_modules/random_walk.py index 937e8ec..51b21b9 100644 --- a/GP/kernel_modules/random_walk.py +++ b/GP/kernel_modules/random_walk.py @@ -9,7 +9,7 @@ from math import factorial -from .kernel_utils import normalize +from 
.kernel_utils import normalize, unpad_tensor, extract_adj_mats_from_vector_inputs class RandomWalk(gpflow.kernels.Kernel): @@ -35,48 +35,60 @@ def K(self, X, X2=None): If None, compute the N x N kernel matrix for X. :return: The kernel matrix of dimension N x M. """ - if X2 is None: - X2 = X - - X_is_X2 = X == X2 - eigenvecs, eigenvals = [], [] - eigenvecs_2, eigenvals_2 = [], [] + X_padded = False + X2_padded = False + X_is_X2 = False - for adj_mat in X: - val, vec = tf.linalg.eigh(tf.cast(adj_mat, tf.float64)) - eigenvals.append(val) - eigenvecs.append(vec) + if isinstance(X, (np.ndarray, tf.Tensor)) and len(X.shape) == 2: + X = extract_adj_mats_from_vector_inputs(X) + X_padded = True - flanking_factors = self._generate_flanking_factors(eigenvecs) - - if X_is_X2: - eigenvals_2, eigenvecs_2 = eigenvals, eigenvecs - flanking_factors_2 = flanking_factors - else: - for adj_mat in X2: - val, vec = tf.linalg.eigh(tf.cast(adj_mat, tf.float64)) - eigenvals_2.append(val) - eigenvecs_2.append(vec) - flanking_factors_2 = self._generate_flanking_factors(eigenvecs_2) + if X2 is None: + X2 = X + X2_padded = X_padded + X_is_X2 = True + elif isinstance(X2, (np.ndarray, tf.Tensor)) and len(X2.shape) == 2: + X2 = extract_adj_mats_from_vector_inputs(X2) + X2_padded = True + + flattened_k_matrix = tf.TensorArray(tf.float64, size=len(X)*len(X2)) + matrix_idx = 0 + + for idx_1 in range(len(X)): + + adj_mat_1 = X[idx_1] + if X_padded: + adj_mat_1 = unpad_tensor(adj_mat_1) + eigenval_1, eigenvec_1 = tf.linalg.eigh(tf.cast(adj_mat_1, tf.float64)) + start_stop_probs = tf.ones((1, tf.shape(eigenvec_1)[0]), tf.float64) + if self.uniform_probabilities: + start_stop_probs = tf.divide(start_stop_probs, tf.shape(eigenvec_1)[0]) + flanking_factor_1 = tf.linalg.matmul(start_stop_probs, eigenvec_1) - k_matrix = np.zeros((len(X), len(X2))) + for idx_2 in range(len(X2)): - for idx_1 in range(k_matrix.shape[0]): - for idx_2 in range(k_matrix.shape[1]): + if X_is_X2 and idx_1 == idx_2: + eigenval_2, 
eigenval_2, flanking_factor_2 = eigenval_1, eigenval_1, flanking_factor_1 + else: + adj_mat_2 = X2[idx_2] + if X2_padded: + adj_mat_2 = unpad_tensor(adj_mat_2) + eigenval_2, eigenvec_2 = tf.linalg.eigh(tf.cast(adj_mat_2, tf.float64)) + start_stop_probs = tf.ones((1, tf.shape(eigenvec_2)[0]), tf.float64) + if self.uniform_probabilities: + start_stop_probs = tf.divide(start_stop_probs, tf.shape(eigenvec_2)[0]) + flanking_factor_2 = tf.linalg.matmul(start_stop_probs, eigenvec_2) - if X_is_X2 and idx_2 < idx_1: - k_matrix[idx_1, idx_2] = k_matrix[idx_2, idx_1] - continue flanking_factor = tf.linalg.LinearOperatorKronecker( - [tf.linalg.LinearOperatorFullMatrix(flanking_factors[idx_1]), - tf.linalg.LinearOperatorFullMatrix(flanking_factors_2[idx_2]) + [tf.linalg.LinearOperatorFullMatrix(flanking_factor_1), + tf.linalg.LinearOperatorFullMatrix(flanking_factor_2) ]).to_dense() diagonal = self.weight * tf.linalg.LinearOperatorKronecker( - [tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenvals[idx_1], axis=0)), - tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenvals_2[idx_2], axis=0)) + [tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenval_1, axis=0)), + tf.linalg.LinearOperatorFullMatrix(tf.expand_dims(eigenval_2, axis=0)) ]).to_dense() if self.p is not None: @@ -96,7 +108,7 @@ def K(self, X, X2=None): else: power_series = tf.linalg.diag(tf.exp(diagonal)) - k_matrix[idx_1, idx_2] = tf.linalg.matmul( + matrix_entry = tf.linalg.matmul( flanking_factor, tf.linalg.matmul( power_series, @@ -104,10 +116,16 @@ def K(self, X, X2=None): ) ) + flattened_k_matrix = flattened_k_matrix.write(matrix_idx, matrix_entry) + matrix_idx += 1 + + k_matrix = tf.reshape(flattened_k_matrix.stack(), (len(X), len(X2))) + if self.normalize: - return tf.convert_to_tensor(normalize(k_matrix)) + normalized_k_matrix = normalize(k_matrix) + return normalized_k_matrix - return tf.convert_to_tensor(k_matrix) + return k_matrix def K_diag(self, X): """ @@ -116,23 +134,3 @@ def K_diag(self, 
X): :return: N x 1 array. """ return tf.linalg.tensor_diag_part(self.K(X)) - - def _generate_flanking_factors(self, eigenvecs): - """ - Helper method to calculate intermediate terms in the expression for random - walk kernel evaluated for two graphs. - :param eigenvecs: array of N matrices of varying sizes - :return: array of N matrices of varying sizes - """ - flanking_factors = [] - - for eigenvec in eigenvecs: - start_stop_probs = tf.ones((1, eigenvec.shape[0]), tf.float64) - if self.uniform_probabilities: - start_stop_probs = tf.divide(start_stop_probs, eigenvec.shape(0)) - - flanking_factors.append( - tf.linalg.matmul(start_stop_probs, eigenvec) - ) - - return flanking_factors diff --git a/examples/Gaussian_Process_RandomWalk_Kernel.ipynb b/examples/Gaussian_Process_RandomWalk_Kernel.ipynb new file mode 100644 index 0000000..4813d24 --- /dev/null +++ b/examples/Gaussian_Process_RandomWalk_Kernel.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import pandas as pd\n", + "sys.path.append('..') # to import from GP.kernels and property_predition.data_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import gpflow\n", + "from gpflow.mean_functions import Constant\n", + "from gpflow.utilities import print_summary\n", + "import numpy as np\n", + "from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error\n", + "from sklearn.model_selection import train_test_split\n", + "import tensorflow as tf\n", + "from rdkit.Chem import MolFromSmiles\n", + "from rdkit.Chem.rdmolops import GetAdjacencyMatrix\n", + "\n", + "from property_prediction.data_utils import transform_data\n", + "from GP.kernel_modules.random_walk import RandomWalk\n", + "from GP.kernel_modules.kernel_utils import pad_tensors\n", + "from GP.kernel_modules.kernel_utils import 
preprocess_adjacency_matrix_inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"../datasets/ESOL.csv\")[:50]\n", + "smiles = df[\"smiles\"].to_numpy()\n", + "y = df['measured log solubility in mols per litre'].to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X = [tf.convert_to_tensor(GetAdjacencyMatrix(MolFromSmiles(smiles))) for smiles in smiles]\n", + "X = preprocess_adjacency_matrix_inputs(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define the Gaussian Process Regression training objective" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def objective_closure():\n", + " return -m.log_marginal_likelihood()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X.numpy(), y, test_size=0.2, random_state=0)\n", + "y_train = y_train.reshape(-1, 1)\n", + "y_test = y_test.reshape(-1, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " We standardise the outputs but leave the inputs unchanged" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "_, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "k = RandomWalk()\n", + "m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optimise the kernel variance and noise level by the marginal likelihood" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "opt = gpflow.optimizers.Scipy()\n", + "opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))\n", + "print_summary(m) # Model summary" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/kernels/test_random_walk.py b/tests/kernels/test_random_walk.py index cf0b164..933e0ec 100644 --- a/tests/kernels/test_random_walk.py +++ b/tests/kernels/test_random_walk.py @@ -15,6 +15,7 @@ from rdkit.Chem.rdmolops import GetAdjacencyMatrix from GP.kernel_modules.random_walk import RandomWalk +from GP.kernel_modules.kernel_utils import preprocess_adjacency_matrix_inputs @pytest.fixture def load_data(): @@ -28,9 +29,10 @@ def load_data(): adj_mats = [GetAdjacencyMatrix(MolFromSmiles(smiles)) for smiles in smiles[:50]] tensor_adj_mats = [tf.convert_to_tensor(adj_mat) for adj_mat in adj_mats] + preprocessed_tensor_adj_mats = preprocess_adjacency_matrix_inputs(tensor_adj_mats) grakel_graphs = [grakel.Graph(adj_mat) for adj_mat in adj_mats] - return tensor_adj_mats, grakel_graphs + return preprocessed_tensor_adj_mats, grakel_graphs @pytest.mark.parametrize( @@ -45,13 +47,13 @@ def load_data(): ] ) def test_random_walk_unlabelled(weight, series_type, p, load_data): - tensor_adj_mats, grakel_graphs = load_data + preprocessed_tensor_adj_mats, grakel_graphs = load_data random_walk_grakel = grakel.kernels.RandomWalk(normalize=True, lamda=weight, kernel_type=series_type, p=p) grakel_results = random_walk_grakel.fit_transform(grakel_graphs) random_walk_FlowMo = RandomWalk(normalize=True, weight=weight, series_type=series_type, p=p) - FlowMo_results = 
random_walk_FlowMo.K(tensor_adj_mats, tensor_adj_mats) + FlowMo_results = random_walk_FlowMo.K(preprocessed_tensor_adj_mats, preprocessed_tensor_adj_mats) npt.assert_almost_equal( grakel_results, FlowMo_results.numpy(), From 90457127f7df3c018584503755d2cb37d275fd90 Mon Sep 17 00:00:00 2001 From: Julius Date: Sat, 16 Oct 2021 12:41:47 +0200 Subject: [PATCH 11/12] minor clean-up --- GP/kernel_modules/random_walk.py | 65 +++++++++++++------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/GP/kernel_modules/random_walk.py b/GP/kernel_modules/random_walk.py index 51b21b9..fd8d68f 100644 --- a/GP/kernel_modules/random_walk.py +++ b/GP/kernel_modules/random_walk.py @@ -3,12 +3,11 @@ Molecule kernels for Gaussian Process Regression implemented in GPflow. """ +from math import factorial + import gpflow -import numpy as np import tensorflow as tf -from math import factorial - from .kernel_utils import normalize, unpad_tensor, extract_adj_mats_from_vector_inputs @@ -30,56 +29,36 @@ def K(self, X, X2=None): specifically using the spectral decomposition approach given by https://www.jmlr.org/papers/volume11/vishwanathan10a/vishwanathan10a.pdf - :param X: array of N graph objects (represented as adjacency matrices of varying sizes) - :param X2: array of M graph objects (represented as adjacency matrices of varying sizes) - If None, compute the N x N kernel matrix for X. - :return: The kernel matrix of dimension N x M. + :param X: N x D array. + :param X2: M x D array. If None, compute the N x N kernel matrix for X. 
+ :return: The kernel matrix of dimension N x M """ - X_padded = False - X2_padded = False - X_is_X2 = False - - if isinstance(X, (np.ndarray, tf.Tensor)) and len(X.shape) == 2: - X = extract_adj_mats_from_vector_inputs(X) - X_padded = True + X = extract_adj_mats_from_vector_inputs(X) if X2 is None: X2 = X - X2_padded = X_padded X_is_X2 = True - elif isinstance(X2, (np.ndarray, tf.Tensor)) and len(X2.shape) == 2: + else: X2 = extract_adj_mats_from_vector_inputs(X2) - X2_padded = True + X_is_X2 = False flattened_k_matrix = tf.TensorArray(tf.float64, size=len(X)*len(X2)) matrix_idx = 0 for idx_1 in range(len(X)): - adj_mat_1 = X[idx_1] - if X_padded: - adj_mat_1 = unpad_tensor(adj_mat_1) - eigenval_1, eigenvec_1 = tf.linalg.eigh(tf.cast(adj_mat_1, tf.float64)) - start_stop_probs = tf.ones((1, tf.shape(eigenvec_1)[0]), tf.float64) - if self.uniform_probabilities: - start_stop_probs = tf.divide(start_stop_probs, tf.shape(eigenvec_1)[0]) - flanking_factor_1 = tf.linalg.matmul(start_stop_probs, eigenvec_1) + adj_mat_1 = tf.cast(unpad_tensor(X[idx_1]), tf.float64) + eigenval_1, eigenvec_1, flanking_factor_1 = self._eigendecompose_and_calculate_flanking_factor(adj_mat_1) for idx_2 in range(len(X2)): if X_is_X2 and idx_1 == idx_2: eigenval_2, eigenval_2, flanking_factor_2 = eigenval_1, eigenval_1, flanking_factor_1 else: - adj_mat_2 = X2[idx_2] - if X2_padded: - adj_mat_2 = unpad_tensor(adj_mat_2) - eigenval_2, eigenvec_2 = tf.linalg.eigh(tf.cast(adj_mat_2, tf.float64)) - start_stop_probs = tf.ones((1, tf.shape(eigenvec_2)[0]), tf.float64) - if self.uniform_probabilities: - start_stop_probs = tf.divide(start_stop_probs, tf.shape(eigenvec_2)[0]) - flanking_factor_2 = tf.linalg.matmul(start_stop_probs, eigenvec_2) - + adj_mat_2 = tf.cast(unpad_tensor(X2[idx_2]), tf.float64) + eigenval_2, eigenvec_2, flanking_factor_2 = self._eigendecompose_and_calculate_flanking_factor( + adj_mat_2) flanking_factor = tf.linalg.LinearOperatorKronecker( 
[tf.linalg.LinearOperatorFullMatrix(flanking_factor_1), @@ -122,15 +101,23 @@ def K(self, X, X2=None): k_matrix = tf.reshape(flattened_k_matrix.stack(), (len(X), len(X2))) if self.normalize: - normalized_k_matrix = normalize(k_matrix) - return normalized_k_matrix + return normalize(k_matrix) return k_matrix def K_diag(self, X): """ - Compute the diagonal of the N x N kernel matrix of X. - :param X: array of N graph objects (represented as adjacency matrices of varying sizes). - :return: N x 1 array. + Compute the diagonal of the N x N kernel matrix of X + :param X: N x D array + :return: N x 1 array """ return tf.linalg.tensor_diag_part(self.K(X)) + + def _eigendecompose_and_calculate_flanking_factor(self, adjacency_matrix): + eigenval, eigenvec = tf.linalg.eigh(adjacency_matrix) + start_stop_probs = tf.ones((1, tf.shape(eigenvec)[0]), tf.float64) + if self.uniform_probabilities: + start_stop_probs = tf.divide(start_stop_probs, tf.shape(eigenvec)[0]) + flanking_factor = tf.linalg.matmul(start_stop_probs, eigenvec) + + return eigenval, eigenvec, flanking_factor From 20358734ec760c241dec4ac1dce7ea7c8b854191 Mon Sep 17 00:00:00 2001 From: Julius Date: Wed, 20 Oct 2021 22:28:14 +0100 Subject: [PATCH 12/12] don't normalise when predicting --- GP/kernel_modules/random_walk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/GP/kernel_modules/random_walk.py b/GP/kernel_modules/random_walk.py index fd8d68f..f43f1c4 100644 --- a/GP/kernel_modules/random_walk.py +++ b/GP/kernel_modules/random_walk.py @@ -42,6 +42,7 @@ def K(self, X, X2=None): else: X2 = extract_adj_mats_from_vector_inputs(X2) X_is_X2 = False + self.normalize = False  # FIXME(review): mutating kernel state here permanently disables normalization for ALL later K/K_diag calls once one cross-covariance (X2 is not X) is computed; use a local flag, e.g. normalize = self.normalize and X_is_X2, and test that at the return site flattened_k_matrix = tf.TensorArray(tf.float64, size=len(X)*len(X2)) matrix_idx = 0