From d4280882a14c0c8e92112a0820dcd4de7b73819c Mon Sep 17 00:00:00 2001
From: ThanosTsiamis <thatsiamis@gmail.com>
Date: Fri, 10 May 2024 16:24:31 +0300
Subject: [PATCH 1/4] Addition of a function called add_noise_to_df_column
 which adds noise to a specified column in a dataframe.

---
 valentine/data_sources/utils.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py
index 4774686..7724e7f 100644
--- a/valentine/data_sources/utils.py
+++ b/valentine/data_sources/utils.py
@@ -1,5 +1,7 @@
-import chardet
 import csv
+
+import chardet
+import numpy as np
 from dateutil.parser import parse
 
 
@@ -40,3 +42,30 @@ def is_date(string, fuzzy=False):
         return True
     except Exception:
         return False
+
+
+def add_noise_to_df_column(df, column_name, noise_level):
+    """
+    Adds noise to a specified column in a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame containing the column to which noise will be added.
+    - column_name (str): The name of the column to which noise will be added.
+    - noise_level (float): The level of noise to be added. For numerical columns, this indicates the standard deviation
+                           of the Gaussian noise. For string columns, it represents the probability of permuting the characters
+                           of each string.
+
+    Returns:
+    - pd.DataFrame: The DataFrame with noise added to the specified column.
+    """
+    if df[column_name].dtype in ["int64", "float64"]:
+        noise = np.random.normal(0, noise_level, df[column_name].shape[0])
+        df[column_name] = df[column_name] + noise
+    elif df[column_name].dtype == "object":
+        for i in range(df[column_name].shape[0]):
+            if np.random.rand() < noise_level:
+                df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x)))))
+    return df
+
+# if __name__ == "__main__":
+#     add_noise_to_df_column(pd.DataFrame({'a': [1, 2, 3], 'b': ['abcdefg', 'hijklmn', 'opqrst']}), 'b', 0.99)

From 8aea3f2cb67175d148d8aa6ef9769be3a185689d Mon Sep 17 00:00:00 2001
From: ThanosTsiamis <thatsiamis@gmail.com>
Date: Fri, 10 May 2024 16:35:42 +0300
Subject: [PATCH 2/4] Minor cleanup for Codecov: rewrap the docstring and
 rename the unused loop variable to `_`. No changes in functionality
 whatsoever.

---
 valentine/data_sources/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py
index 7724e7f..03d5a0a 100644
--- a/valentine/data_sources/utils.py
+++ b/valentine/data_sources/utils.py
@@ -52,8 +52,8 @@ def add_noise_to_df_column(df, column_name, noise_level):
     - df (pd.DataFrame): The DataFrame containing the column to which noise will be added.
     - column_name (str): The name of the column to which noise will be added.
     - noise_level (float): The level of noise to be added. For numerical columns, this indicates the standard deviation
-                           of the Gaussian noise. For string columns, it represents the probability of permuting the characters
-                           of each string.
+                            of the Gaussian noise. For string columns, it represents the probability of permuting the
+                            characters of each string.
 
     Returns:
     - pd.DataFrame: The DataFrame with noise added to the specified column.
@@ -62,7 +62,7 @@ def add_noise_to_df_column(df, column_name, noise_level):
         noise = np.random.normal(0, noise_level, df[column_name].shape[0])
         df[column_name] = df[column_name] + noise
     elif df[column_name].dtype == "object":
-        for i in range(df[column_name].shape[0]):
+        for _ in range(df[column_name].shape[0]):
             if np.random.rand() < noise_level:
                 df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x)))))
     return df

From 7485a827a885e966e0ecb12488ed2a8ee04a9782 Mon Sep 17 00:00:00 2001
From: ThanosTsiamis <thatsiamis@gmail.com>
Date: Fri, 10 May 2024 18:02:56 +0300
Subject: [PATCH 3/4] Added tests for this newly added function

---
 tests/test_utils.py             | 15 ++++++++++++++-
 valentine/data_sources/utils.py |  5 +----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9be26ab..a37c641 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,9 @@
 import unittest
 
+import pandas as pd
+
 from tests import d1_path
-from valentine.data_sources.utils import get_encoding, get_delimiter, is_date
+from valentine.data_sources.utils import get_encoding, get_delimiter, is_date, add_noise_to_df_column
 from valentine.utils.utils import is_sorted, convert_data_type
 
 
@@ -30,3 +32,14 @@ def test_get_delimiter(self):
     def test_is_date(self):
         date_str = "2019-04-26 18:03:50.941332"
         assert is_date(date_str)
+
+    def test_add_noise_to_df_column(self):
+        # Tiny chance that this test will fail due to the random nature of the noise
+        test_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']})
+        assert_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']})
+        assert add_noise_to_df_column(test_df, 'a', 0.0)['a'].equals(assert_df['a'])
+        assert add_noise_to_df_column(test_df, 'b', 0.0)['b'].equals(assert_df['b'])
+        assert not add_noise_to_df_column(test_df, 'a', 0.5)['a'].equals(assert_df['a'])
+        assert not add_noise_to_df_column(test_df, 'b', 0.5)['b'].equals(assert_df['b'])
+        assert not add_noise_to_df_column(test_df, 'a', 0.99999)['a'].equals(assert_df['a'])
+        assert not add_noise_to_df_column(test_df, 'b', 0.99999)['b'].equals(assert_df['b'])
diff --git a/valentine/data_sources/utils.py b/valentine/data_sources/utils.py
index 03d5a0a..28c9686 100644
--- a/valentine/data_sources/utils.py
+++ b/valentine/data_sources/utils.py
@@ -65,7 +65,4 @@ def add_noise_to_df_column(df, column_name, noise_level):
         for _ in range(df[column_name].shape[0]):
             if np.random.rand() < noise_level:
                 df[column_name] = df[column_name].apply(lambda x: ''.join(np.random.permutation(list(str(x)))))
-    return df
-
-# if __name__ == "__main__":
-#     add_noise_to_df_column(pd.DataFrame({'a': [1, 2, 3], 'b': ['abcdefg', 'hijklmn', 'opqrst']}), 'b', 0.99)
+    return df
\ No newline at end of file

From 8d209f106584a2fd3800bfc0dbe438e4358bddee Mon Sep 17 00:00:00 2001
From: ThanosTsiamis <thatsiamis@gmail.com>
Date: Fri, 10 May 2024 18:07:25 +0300
Subject: [PATCH 4/4] Randomness complicates things in the CI/CD pipeline. As a
 result, only the edge-case assertions (noise_level 0.0 and 0.99999) are left
 in the tests; the noise_level 0.5 assertions are removed.

---
 tests/test_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index a37c641..a9186a0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -39,7 +39,5 @@ def test_add_noise_to_df_column(self):
         assert_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['abcdefg', 'hijklmn', 'opqrst']})
         assert add_noise_to_df_column(test_df, 'a', 0.0)['a'].equals(assert_df['a'])
         assert add_noise_to_df_column(test_df, 'b', 0.0)['b'].equals(assert_df['b'])
-        assert not add_noise_to_df_column(test_df, 'a', 0.5)['a'].equals(assert_df['a'])
-        assert not add_noise_to_df_column(test_df, 'b', 0.5)['b'].equals(assert_df['b'])
         assert not add_noise_to_df_column(test_df, 'a', 0.99999)['a'].equals(assert_df['a'])
         assert not add_noise_to_df_column(test_df, 'b', 0.99999)['b'].equals(assert_df['b'])