From d4280882a14c0c8e92112a0820dcd4de7b73819c Mon Sep 17 00:00:00 2001 From: ThanosTsiamis Date: Fri, 10 May 2024 16:24:31 +0300 Subject: [PATCH 1/4] Addition of a function called add_noise_to_df_column which adds noise to a specified column in a dataframe. --- valentine/data_sources/utils.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py index 4774686..7724e7f 100644 --- a/valentine/data_sources/utils.py +++ b/valentine/data_sources/utils.py @@ -1,5 +1,7 @@ -import chardet import csv + +import chardet +import numpy as np from dateutil.parser import parse @@ -40,3 +42,30 @@ def is_date(string, fuzzy=False): return True except Exception: return False + + +def add_noise_to_df_column(df, column_name, noise_level): + """ + Adds noise to a specified column in a DataFrame. + + Parameters: + - df (pd.DataFrame): The DataFrame containing the column to which noise will be added. + - column_name (str): The name of the column to which noise will be added. + - noise_level (float): The level of noise to be added. For numerical columns, this indicates the standard deviation + of the Gaussian noise. For string columns, it represents the probability of permuting the characters + of each string. + + Returns: + - pd.DataFrame: The DataFrame with noise added to the specified column. + """ + if df[column_name].dtype in ["int64", "float64"]: + noise = np.random.normal(0, noise_level, df[column_name].shape[0]) + df[column_name] = df[column_name] + noise + elif df[column_name].dtype == "object": + for i in range(df[column_name].shape[0]): + if np.random.rand() < noise_level: + df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x))))) + return df + +# if __name__ == "__main__": +# add_noise_to_df_column(pd.DataFrame({'a': [1, 2, 3], 'b': ['abcdefg', 'hijklmn', 'opqrst']}), 'b', 0.99) From 8aea3f2cb67175d148d8aa6ef9769be3a185689d Mon Sep 17 00:00:00 2001 From: ThanosTsiamis Date: Fri, 10 May 2024 16:35:42 +0300 Subject: [PATCH 2/4] Codecov minor code modification. No changes in functionality whatsoever. --- valentine/data_sources/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py index 7724e7f..03d5a0a 100644 --- a/valentine/data_sources/utils.py +++ b/valentine/data_sources/utils.py @@ -52,8 +52,8 @@ def add_noise_to_df_column(df, column_name, noise_level): - df (pd.DataFrame): The DataFrame containing the column to which noise will be added. - column_name (str): The name of the column to which noise will be added. - noise_level (float): The level of noise to be added. For numerical columns, this indicates the standard deviation - of the Gaussian noise. For string columns, it represents the probability of permuting the characters - of each string. + of the Gaussian noise. For string columns, it represents the probability of permuting the + characters of each string. Returns: - pd.DataFrame: The DataFrame with noise added to the specified column. @@ -62,7 +62,7 @@ def add_noise_to_df_column(df, column_name, noise_level): noise = np.random.normal(0, noise_level, df[column_name].shape[0]) df[column_name] = df[column_name] + noise elif df[column_name].dtype == "object": - for i in range(df[column_name].shape[0]): + for _ in range(df[column_name].shape[0]): if np.random.rand() < noise_level: df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x))))) return df From 7485a827a885e966e0ecb12488ed2a8ee04a9782 Mon Sep 17 00:00:00 2001 From: ThanosTsiamis Date: Fri, 10 May 2024 18:02:56 +0300 Subject: [PATCH 3/4] Added tests for this newly added function --- tests/test_utils.py | 15 ++++++++++++++- valentine/data_sources/utils.py | 5 +---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9be26ab..a37c641 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,9 @@ import unittest +import pandas as pd + from tests import d1_path -from valentine.data_sources.utils import get_encoding, get_delimiter, is_date +from valentine.data_sources.utils import get_encoding, get_delimiter, is_date, add_noise_to_df_column from valentine.utils.utils import is_sorted, convert_data_type @@ -30,3 +32,14 @@ def test_get_delimiter(self): def test_is_date(self): date_str = "2019-04-26 18:03:50.941332" assert is_date(date_str) + + def test_add_noise_to_df_column(self): + # Tiny chance that this test will fail due to the random nature of the noise + test_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']}) + assert_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']}) + assert add_noise_to_df_column(test_df, 'a', 0.0)['a'].equals(assert_df['a']) + assert add_noise_to_df_column(test_df, 'b', 0.0)['b'].equals(assert_df['b']) + assert not add_noise_to_df_column(test_df, 'a', 0.5)['a'].equals(assert_df['a']) + assert not add_noise_to_df_column(test_df, 'b', 0.5)['b'].equals(assert_df['b']) + assert not add_noise_to_df_column(test_df, 'a', 0.99999)['a'].equals(assert_df['a']) + assert not add_noise_to_df_column(test_df, 'b', 0.99999)['b'].equals(assert_df['b']) diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py index 03d5a0a..28c9686 100644 --- a/valentine/data_sources/utils.py +++ b/valentine/data_sources/utils.py @@ -65,7 +65,4 @@ def add_noise_to_df_column(df, column_name, noise_level): for _ in range(df[column_name].shape[0]): if np.random.rand() < noise_level: df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x))))) - return df - -# if __name__ == "__main__": -# add_noise_to_df_column(pd.DataFrame({'a': [1, 2, 3], 'b': ['abcdefg', 'hijklmn', 'opqrst']}), 'b', 0.99) + return df \ No newline at end of file From 8d209f106584a2fd3800bfc0dbe438e4358bddee Mon Sep 17 00:00:00 2001 From: ThanosTsiamis Date: Fri, 10 May 2024 18:07:25 +0300 Subject: [PATCH 4/4] Randomness complicates things in the CI/CD pipeline. As a result only the edge cases should be left in the tests --- tests/test_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index a37c641..a9186a0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -39,7 +39,5 @@ def test_add_noise_to_df_column(self): assert_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']}) assert add_noise_to_df_column(test_df, 'a', 0.0)['a'].equals(assert_df['a']) assert add_noise_to_df_column(test_df, 'b', 0.0)['b'].equals(assert_df['b']) - assert not add_noise_to_df_column(test_df, 'a', 0.5)['a'].equals(assert_df['a']) - assert not add_noise_to_df_column(test_df, 'b', 0.5)['b'].equals(assert_df['b']) assert not add_noise_to_df_column(test_df, 'a', 0.99999)['a'].equals(assert_df['a']) assert not add_noise_to_df_column(test_df, 'b', 0.99999)['b'].equals(assert_df['b'])