From 4b3eee09882217c3053cdcedd05e04b6b16e6cdc Mon Sep 17 00:00:00 2001
From: TDerig23 <89554971+TDerig23@users.noreply.github.com>
Date: Wed, 26 Oct 2022 23:26:34 -0700
Subject: [PATCH 1/2] midterm_wednesday_upload

---
 bda602_midterm.py | 197 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 bda602_midterm.py

diff --git a/bda602_midterm.py b/bda602_midterm.py
new file mode 100644
index 0000000..68d2240
--- /dev/null
+++ b/bda602_midterm.py
@@ -0,0 +1,197 @@
+import random
+from typing import List
+from scipy import stats
+import pandas as pd
+import seaborn
+from sklearn import datasets
+import sys
+
+import numpy
+from plotly import express as px
+from plotly import figure_factory as ff
+from plotly import graph_objects as go
+from sklearn.metrics import confusion_matrix
+
+titanic_df = pd.read_csv(
+    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
+
+
+TITANIC_PREDICTORS = [
+    "pclass",
+    "sex",
+    "age",
+    "sibsp",
+    "embarked",
+    "parch",
+    "fare",
+    "who",
+    "adult_male",
+    "deck",
+    "embark_town",
+    "alone",
+    "class",
+]
+
+dataframe = pd.DataFrame()
+
+
+def get_test_data_set(data_set_name: str = None) -> (pd.DataFrame, List[str], str):
+    """Function to load a few test data sets
+
+    :param:
+    data_set_name : string, optional
+        Data set to load
+
+    :return:
+    data_set : :class:`pandas.DataFrame`
+        Tabular data, possibly with some preprocessing applied.
+    predictors : list[str]
+        List of predictor variables
+    response : str
+        Response variable
+    """
+    seaborn_data_sets = ["mpg", "tips", "titanic", "titanic_2"]
+    sklearn_data_sets = ["boston", "diabetes", "breast_cancer"]
+    all_data_sets = seaborn_data_sets + sklearn_data_sets
+
+    if data_set_name is None:
+        data_set_name = random.choice(all_data_sets)
+    else:
+        if data_set_name not in all_data_sets:
+            raise Exception(f"Data set choice not valid: {data_set_name}")
+
+    if data_set_name in seaborn_data_sets:
+        if data_set_name == "mpg":
+            data_set = seaborn.load_dataset(name="mpg").dropna().reset_index()
+            predictors = [
+                "cylinders",
+                "displacement",
+                "horsepower",
+                "weight",
+                "acceleration",
+                "origin",
+                "name",
+            ]
+            response = "mpg"
+        elif data_set_name == "tips":
+            data_set = seaborn.load_dataset(name="tips").dropna().reset_index()
+            predictors = [
+                "total_bill",
+                "sex",
+                "smoker",
+                "day",
+                "time",
+                "size",
+            ]
+            response = "tip"
+        elif data_set_name == "titanic":
+            data_set = seaborn.load_dataset(name="titanic").dropna()
+            predictors = TITANIC_PREDICTORS
+            response = "survived"
+        elif data_set_name == "titanic_2":
+            data_set = seaborn.load_dataset(name="titanic").dropna()
+            predictors = TITANIC_PREDICTORS
+            response = "alive"
+    elif data_set_name in sklearn_data_sets:
+        if data_set_name == "boston":
+            data = datasets.load_boston()
+            data_set = pd.DataFrame(data.data, columns=data.feature_names)
+            data_set["CHAS"] = data_set["CHAS"].astype(str)
+        elif data_set_name == "diabetes":
+            data = datasets.load_diabetes()
+            data_set = pd.DataFrame(data.data, columns=data.feature_names)
+            data_set["gender"] = ["1" if i > 0 else "0" for i in data_set["sex"]]
+        elif data_set_name == "breast_cancer":
+            data = datasets.load_breast_cancer()
+            data_set = pd.DataFrame(data.data, columns=data.feature_names)
+
+        data_set["target"] = data.target
+        predictors = data.feature_names
+        response = "target"
+
+    print(f"Data set selected: {data_set_name}")
+    return data_set, predictors, response
+
+
+dataframe, predictors, response = get_test_data_set("breast_cancer")
+
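Note that the "boston" branch above relies on `datasets.load_boston()`, which newer scikit-learn releases deprecate and then remove; it still exists under the scikit-learn==1.1.3 pin in requirements.txt. A rough alternative sketch, assuming the OpenML copy of the dataset (the name and version below are assumptions, not part of this patch):

# Sketch only: fetch the Boston housing data from OpenML instead of the removed loader.
# The OpenML dataset name/version are assumed; column names mirror the original loader.
from sklearn.datasets import fetch_openml

boston = fetch_openml(name="boston", version=1, as_frame=True)
data_set = boston.data
data_set["CHAS"] = data_set["CHAS"].astype(str)  # keep CHAS categorical, as the original branch does
data_set["target"] = boston.target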
+
+sibsp = dataframe['worst texture'].tolist()
+
+
+# don't separate the dataframe; determine if predictors are continuous or categorical and make a list from it.
+## also use a separate function to determine if the response has nunique == 2, then it is boolean (use a label encoder)
+## loop through to find the datatype of each predictor and create a list of the types.
+def column_sep(dataframe):
+    cat_df = pd.DataFrame()
+    cont_df = pd.DataFrame()
+    for column in dataframe.columns:
+        if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len(
+                pd.unique(dataframe[column])) == 2:
+            cat_df[column] = dataframe[column]
+        else:
+            cont_df[column] = dataframe[column]
+
+    return cat_df, cont_df
+
+
+cat_df, cont_df = column_sep(dataframe)
+
+
+# same idea as above, but return a {column: "continuous"/"categorical"} dict instead of two dataframes.
+def column_sep(dataframe):
+    predictor_dict = {}
+    for column in dataframe.columns:
+        if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len(
+                pd.unique(dataframe[column])) == 2:
+            predictor_dict[column] = "categorical"
+        else:
+            predictor_dict[column] = "continuous"
+
+    return predictor_dict
+
+
+predictors = column_sep(dataframe)
+
+## print the name and detected datatype of each predictor
+for key, value in predictors.items():
+    print(key, value)
+
+
+def type_chooser(predictors):
+    # Pearson's r for every continuous / continuous predictor pair
+    columns = list(predictors.keys())
+    rows = []
+    for i, col1 in enumerate(columns):
+        for col2 in columns[i + 1:]:
+            if predictors[col1] == "continuous" and predictors[col2] == "continuous":
+                res = stats.pearsonr(dataframe[col1], dataframe[col2])
+                rows.append({"Predictor 1": col1, "Predictor 2": col2, "Pearsons_R": res[0]})
+            # continuous / categorical pairs need the correlation ratio instead of Pearson's r (see notes below)
+    return pd.DataFrame(rows)
+
+
+res = type_chooser(predictors)
+
+print(res)
+
+res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
+print(res)
+
+## for Pearson's r (continuous v continuous): loop on the columns of cont_df (double for loop), get the Pearson correlation, store it in a dataframe
+## enumerate the index
+## Continuous / Categorical pairs: use the correlation ratio
+# Categorical / Categorical pairs: use cat_correlation
+
+## use a for loop to determine the graphing types: for loop, then if/else to determine whether the response is boolean or not,
+## then map to the correct correlation type.
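The notes above sketch the plan: tag each predictor as continuous or categorical, treat a two-value response as boolean via a label encoder, and pick the correlation metric by pair type (Pearson's r for cont/cont, correlation ratio for cont/cat, Cramér's V for cat/cat). A minimal sketch of that dispatch, assuming the `cat_correlation` and `cat_cont_correlation_ratio` helpers introduced in the second commit below are in scope:

# Sketch of the per-pair metric dispatch described in the notes above.
# Assumes `predictors` is the {column: "continuous"/"categorical"} dict from column_sep,
# and that cat_correlation / cat_cont_correlation_ratio (second commit) are importable here.
from sklearn import preprocessing


def response_is_boolean(series):
    # two unique values -> treat the response as boolean
    return series.nunique() == 2


def encode_response(series):
    # label-encode a boolean/categorical response into 0/1 integers
    return preprocessing.LabelEncoder().fit_transform(series)


def pair_correlation(col1, col2, predictors, dataframe):
    type1, type2 = predictors[col1], predictors[col2]
    if type1 == "continuous" and type2 == "continuous":
        return stats.pearsonr(dataframe[col1], dataframe[col2])[0]
    if type1 == "categorical" and type2 == "categorical":
        return cat_correlation(dataframe[col1], dataframe[col2])
    # mixed pair: the correlation ratio expects (categories, values)
    cat_col, cont_col = (col1, col2) if type1 == "categorical" else (col2, col1)
    return cat_cont_correlation_ratio(dataframe[cat_col].to_numpy(), dataframe[cont_col].to_numpy())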
+ + +# for predictor1 in cat df + + + From 2afbb17ca04cb16700729e6fcff0bd7ec84065d9 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Sat, 29 Oct 2022 20:29:31 -0700 Subject: [PATCH 2/2] midterm_upload --- bda602_midterm.py | 276 ++++++++++++++++++++++++++++++++----------- requirements.dev.in | 16 +-- requirements.dev.txt | 119 ++++++++++--------- requirements.in | 11 +- requirements.txt | 80 ++++++++++++- 5 files changed, 365 insertions(+), 137 deletions(-) diff --git a/bda602_midterm.py b/bda602_midterm.py index 68d2240..6c18082 100644 --- a/bda602_midterm.py +++ b/bda602_midterm.py @@ -5,15 +5,13 @@ import seaborn from sklearn import datasets import sys - +import warnings import numpy from plotly import express as px from plotly import figure_factory as ff from plotly import graph_objects as go from sklearn.metrics import confusion_matrix -titanic_df = pd.read_csv( - "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") TITANIC_PREDICTORS = [ @@ -116,82 +114,220 @@ def get_test_data_set(data_set_name: str = None) -> (pd.DataFrame, List[str], st dataframe, predictors, response = get_test_data_set("breast_cancer") - -sibsp = dataframe['worst texture'].tolist() - - -# dont separate the dataframe, determine if predictors are continious or categroical and make a list from it. -## also use a separate function to determine if response has n.unique == 2 then boolean (use label encoder) -## loop through to find the datatype of each predictor create a list of the types. def column_sep(dataframe): - cat_df = pd.DataFrame() - cont_df = pd.DataFrame() + predictor_name = [] + predictor_type = [] for column in dataframe.columns: if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len( - pd.unique(dataframe[column])) == 2: - cat_df[column] = dataframe[column] + pd.unique(dataframe[column])) < 5: + predictor_name.append(column) + predictor_type.append("categorical") else: - cont_df[column] = dataframe[column] - - return cat_df, cont_df - - -cat_df, cont_df = column_sep(dataframe) - - -# dont separate the dataframe, determine if predictors are continious or categroical and make a list from it. -## also use a separate function to determine if response has n.unique == 2 then boolean (use label encoder) -## loop through to find the datatype of each predictor create a list of the types. 
-def column_sep(dataframe): - predictor_dict = {} - for column in dataframe.columns: - if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len( - pd.unique(dataframe[column])) == 2: - predictor_dict[column] = "categorical" - else: - predictor_dict[column] = "continuous" - - return predictor_dict - + predictor_name.append(column) + predictor_type.append("continuous") -predictors = column_sep(dataframe) + predictor_list = list(map(list, zip(predictor_name, predictor_type))) -for key, value in predictors.items(): - print + return predictor_list -## get datatype and dataname -predictors +predictor_list = column_sep(dataframe) -def type_chooser(predictors): - cont_cont_df = pd.DataFrame() - for key1 in range(len(predictors)): - for key2 in range(key1, len(predictors)): - if predictors.get(key1) == "continuous" and predictors.get(key2) == "continuous": - res = stats.pearsonr(dataframe[key1], dataframe[key2]) - cont_cont_df["Pearsons_R"] = res - elif predictors.get(key1) == "continuous" and predictors.get(key2) == "categorical": - res = stats.pearsonr(dataframe[key1], dataframe[key2]) - cont_cont_df["Pearsons_R"] = res - - -res = type_chooser(predictors) - -print(res) - -res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4]) -res - -## for pearsons r continious v continious loop on columns for cont_df (double for loop) get pearson correlation. store in dataframe -## enumerate the index -## Continuous / Categorical pairs df.corr[] -# Categorical / Categorical pairs us cat_correlation - -## use for loop to determine the graphing types. for loop then if else to determine if response is boolean or not -## then correspond to correct correlation type. - - -# for predictor1 in cat df +def fill_na(data): + if isinstance(data, pd.Series): + return data.fillna(0) + else: + return numpy.array([value if value is not None else 0 for value in data]) +def cat_correlation(x, y, bias_correction=True, tschuprow=False): + """ + Calculates correlation statistic for categorical-categorical association. + The two measures supported are: + 1. Cramer'V ( default ) + 2. Tschuprow'T + + SOURCES: + 1.) CODE: https://github.com/MavericksDS/pycorr + 2.) Used logic from: + https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix + to ignore yates correction factor on 2x2 + 3.) 
Haven't validated Tschuprow + + Bias correction and formula's taken from : https://www.researchgate.net/publication/270277061_A_bias-correction_for_Cramer's_V_and_Tschuprow's_T + + Wikipedia for Cramer's V: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V + Wikipedia for Tschuprow' T: https://en.wikipedia.org/wiki/Tschuprow%27s_T + Parameters: + ----------- + x : list / ndarray / Pandas Series + A sequence of categorical measurements + y : list / NumPy ndarray / Pandas Series + A sequence of categorical measurements + bias_correction : Boolean, default = True + tschuprow : Boolean, default = False + For choosing Tschuprow as measure + Returns: + -------- + float in the range of [0,1] + """ + corr_coeff = numpy.nan + try: + x, y = fill_na(x), fill_na(y) + crosstab_matrix = pd.crosstab(x, y) + n_observations = crosstab_matrix.sum().sum() + + yates_correct = True + if bias_correction: + if crosstab_matrix.shape == (2, 2): + yates_correct = False + + chi2, _, _, _ = stats.chi2_contingency( + crosstab_matrix, correction=yates_correct + ) + phi2 = chi2 / n_observations + + # r and c are number of categories of x and y + r, c = crosstab_matrix.shape + if bias_correction: + phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1)) + r_corrected = r - ((r - 1) ** 2) / (n_observations - 1) + c_corrected = c - ((c - 1) ** 2) / (n_observations - 1) + if tschuprow: + corr_coeff = numpy.sqrt( + phi2_corrected / numpy.sqrt((r_corrected - 1) * (c_corrected - 1)) + ) + return corr_coeff + corr_coeff = numpy.sqrt( + phi2_corrected / min((r_corrected - 1), (c_corrected - 1)) + ) + return corr_coeff + if tschuprow: + corr_coeff = numpy.sqrt(phi2 / numpy.sqrt((r - 1) * (c - 1))) + return corr_coeff + corr_coeff = numpy.sqrt(phi2 / min((r - 1), (c - 1))) + return corr_coeff + except Exception as ex: + print(ex) + if tschuprow: + warnings.warn("Error calculating Tschuprow's T", RuntimeWarning) + else: + warnings.warn("Error calculating Cramer's V", RuntimeWarning) + return corr_coeff +def cat_cont_correlation_ratio(categories, values): + """ + Correlation Ratio: https://en.wikipedia.org/wiki/Correlation_ratio + SOURCE: + 1.) 
https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9 + :param categories: Numpy array of categories + :param values: Numpy array of values + :return: correlation + """ + f_cat, _ = pd.factorize(categories) + cat_num = numpy.max(f_cat) + 1 + y_avg_array = numpy.zeros(cat_num) + n_array = numpy.zeros(cat_num) + for i in range(0, cat_num): + cat_measures = values[numpy.argwhere(f_cat == i).flatten()] + n_array[i] = len(cat_measures) + y_avg_array[i] = numpy.average(cat_measures) + y_total_avg = numpy.sum(numpy.multiply(y_avg_array, n_array)) / numpy.sum(n_array) + numerator = numpy.sum( + numpy.multiply( + n_array, numpy.power(numpy.subtract(y_avg_array, y_total_avg), 2) + ) + ) + denominator = numpy.sum(numpy.power(numpy.subtract(values, y_total_avg), 2)) + if numerator == 0: + eta = 0.0 + else: + eta = numpy.sqrt(numerator / denominator) + return eta + + +def type_sep(predictor_list, dataframe): + cont_cont_preds1 = [] # done + cont_cont_preds2 = [] # done + cont_cont_pearsons = [] # done + cont_cont_html = [] # done + # cont_cont_preds1_values = [] #done + # cont_cont_preds2_values = [] #done + + cat_cont_preds1 = [] + cat_cont_preds2 = [] + cat_cont_html = [] + for reg_predictor, reg_type in predictor_list: + for sorted_pred, sorted_type in predictor_list[1:]: + if reg_type == "continuous" and sorted_type == "continuous": + pearsons_r = stats.pearsonr(dataframe[reg_predictor].values, dataframe[sorted_pred].values) + cont_cont_preds1.append(reg_predictor) + cont_cont_preds2.append(sorted_pred) + cont_cont_pearsons.append(pearsons_r[0]) + fig = px.scatter(dataframe, x=dataframe[reg_predictor].values, y=dataframe[sorted_pred].values, + trendline="ols") + fig.update_layout(title=f"chart{reg_predictor}_{sorted_pred}", + xaxis_title=f"Variable: {reg_predictor}", yaxis_title=f"Variable:{sorted_pred}") + html = "C:/Users/thoma\OneDrive/Documents/bda602/midterm/html_links/{0}_{1}_file.html".format( + reg_predictor, sorted_pred) + fig.write_html(html) + cont_cont_html.append(html) + + # elif reg_type == "continuous" and sorted_type == "categorical" or reg_type == "categorical" and sorted_type == "continuous": : + # cat_preds = np.array([sorted_pred]) + # cont_preds = np.array([reg_predictor]) + # cat_cont_array = np.concatenate((cat_preds,cont_preds)) + # cat_array = dataframe[sorted_pred].to_numpy() + # flattened_cat = cat_array.flatten() + + # cat_array = dataframe[sorted_pred].values.flatten() + # le = preprocessing.LabelEncoder() + # le.fit([dataframe[sorted_pred].flatten()]) + # classes = le.classes_ + # transformed_predictors = le.transform(classes) + fig1 = px.scatter(dataframe, x=dataframe[reg_predictor].values, y=dataframe[sorted_pred].values, + trendline="ols") + fig1.update_layout(title=f"chart{reg_predictor}_{sorted_pred}", + xaxis_title=f"Variable: {reg_predictor}", yaxis_title=f"Variable:{sorted_pred}") + html_cat_cont = "C:/Users/thoma\OneDrive/Documents/bda602/midterm/html_links/{0}_{1}_file.html".format( + reg_predictor, sorted_pred) + fig1.write_html(html_cat_cont) + cat_cont_html.append(html) + + # cat_cont_values = np.array([dataframe[reg_predictors],dataframe[sorted_pred]]) + # eta = cat_cont_correlation_ratio(cat_cont_categories,cat_cont_values) + + # else: + + # x = dataframe[sorted_pred] + # le = preprocessing.LabelEncoder() + # le.fit([cat_array]) + # classes = le.classes_ + # transformed_predictors = le.transform(classes) + # transformed_predictors + + cont_cont_df = pd.DataFrame( + {"Predictor 1": cont_cont_preds1, "Predictor 2": 
cont_cont_preds2, "Pearsons R": cont_cont_pearsons, + "HTML_LinregGraph": cont_cont_html}) + cont_cont_html = cont_cont_df.to_html() + text_file = open("C:/Users/thoma\OneDrive/Documents/bda602/midterm/html_links/test_dataframe.html", "w") + text_file.write(cont_cont_html) + text_file.close() + + return cont_cont_df + + +cont_cont_df = type_sep(predictor_list, dataframe) + + +data = cont_cont_df["Pearsons R"].values +heatmap = px.imshow(data,labels=dict(x=cont_cont_df["Predictor 1"], y=cont_cont_df["Predictor 2"])) +heatmap.show() + + +def main(): + + return + +if __name__ == "__main__": + sys.exit(main()) diff --git a/requirements.dev.in b/requirements.dev.in index a975411..bd93aea 100644 --- a/requirements.dev.in +++ b/requirements.dev.in @@ -1,8 +1,8 @@ -black==22.6.0 -detect-secrets==1.3.0 -flake8==4.0.1 -isort[requirements]==5.10.1 -jupyterlab -pip-tools -pre-commit==2.20.0 -nose +black==22.6.0 +detect-secrets==1.3.0 +flake8==4.0.1 +isort[requirements]==5.10.1 +jupyterlab +pip-tools +pre-commit==2.20.0 +nose diff --git a/requirements.dev.txt b/requirements.dev.txt index e05a14b..7a57d34 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=requirements.dev.txt requirements.dev.in # -anyio==3.6.1 +anyio==3.6.2 # via jupyter-server argon2-cffi==21.3.0 # via @@ -13,9 +13,9 @@ argon2-cffi==21.3.0 # notebook argon2-cffi-bindings==21.2.0 # via argon2-cffi -asttokens==2.0.5 +asttokens==2.1.0 # via stack-data -attrs==21.4.0 +attrs==22.1.0 # via jsonschema babel==2.10.3 # via jupyterlab-server @@ -27,21 +27,21 @@ black==22.6.0 # via -r requirements.dev.in bleach==5.0.1 # via nbconvert -build==0.8.0 +build==0.9.0 # via pip-tools -certifi==2022.6.15 +certifi==2022.9.24 # via requests cffi==1.15.1 # via argon2-cffi-bindings cfgv==3.3.1 # via pre-commit -charset-normalizer==2.1.0 +charset-normalizer==2.1.1 # via requests click==8.1.3 # via # black # pip-tools -debugpy==1.6.2 +debugpy==1.6.3 # via ipykernel decorator==5.1.1 # via ipython @@ -49,35 +49,35 @@ defusedxml==0.7.1 # via nbconvert detect-secrets==1.3.0 # via -r requirements.dev.in -distlib==0.3.5 +distlib==0.3.6 # via virtualenv entrypoints==0.4 - # via - # jupyter-client - # nbconvert -executing==0.9.1 + # via jupyter-client +executing==1.2.0 # via stack-data -fastjsonschema==2.16.1 +fastjsonschema==2.16.2 # via nbformat -filelock==3.7.1 +filelock==3.8.0 # via virtualenv flake8==4.0.1 # via -r requirements.dev.in -identify==2.5.2 +identify==2.5.8 # via pre-commit -idna==3.3 +idna==3.4 # via # anyio # requests -importlib-metadata==4.12.0 - # via jupyterlab-server -importlib-resources==5.9.0 +importlib-metadata==5.0.0 + # via + # jupyterlab-server + # nbconvert +importlib-resources==5.10.0 # via jsonschema -ipykernel==6.15.1 +ipykernel==6.16.2 # via # nbclassic # notebook -ipython==8.4.0 +ipython==8.5.0 # via # ipykernel # jupyterlab @@ -97,20 +97,20 @@ jinja2==3.1.2 # nbclassic # nbconvert # notebook -json5==0.9.8 +json5==0.9.10 # via jupyterlab-server -jsonschema==4.7.2 +jsonschema==4.16.0 # via # jupyterlab-server # nbformat -jupyter-client==7.3.4 +jupyter-client==7.4.4 # via # ipykernel # jupyter-server # nbclassic # nbclient # notebook -jupyter-core==4.11.1 +jupyter-core==4.11.2 # via # jupyter-client # jupyter-server @@ -119,49 +119,51 @@ jupyter-core==4.11.1 # nbconvert # nbformat # notebook -jupyter-server==1.18.1 +jupyter-server==1.21.0 # via # jupyterlab # jupyterlab-server # nbclassic # notebook-shim -jupyterlab==3.4.4 +jupyterlab==3.5.0 # via -r requirements.dev.in 
jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.15.0 +jupyterlab-server==2.16.1 # via jupyterlab markupsafe==2.1.1 # via # jinja2 # nbconvert -matplotlib-inline==0.1.3 +matplotlib-inline==0.1.6 # via # ipykernel # ipython mccabe==0.6.1 # via flake8 -mistune==0.8.4 +mistune==2.0.4 # via nbconvert mypy-extensions==0.4.3 # via black -nbclassic==0.4.3 - # via jupyterlab -nbclient==0.6.6 +nbclassic==0.4.5 + # via + # jupyterlab + # notebook +nbclient==0.7.0 # via nbconvert -nbconvert==6.5.0 +nbconvert==7.2.3 # via # jupyter-server # nbclassic # notebook -nbformat==5.4.0 +nbformat==5.7.0 # via # jupyter-server # nbclassic # nbclient # nbconvert # notebook -nest-asyncio==1.5.5 +nest-asyncio==1.5.6 # via # ipykernel # jupyter-client @@ -172,9 +174,9 @@ nodeenv==1.7.0 # via pre-commit nose==1.3.7 # via -r requirements.dev.in -notebook==6.4.12 +notebook==6.5.1 # via jupyterlab -notebook-shim==0.1.0 +notebook-shim==0.2.0 # via nbclassic packaging==21.3 # via @@ -188,30 +190,32 @@ pandocfilters==1.5.0 # via nbconvert parso==0.8.3 # via jedi -pathspec==0.9.0 +pathspec==0.10.1 # via black -pep517==0.12.0 +pep517==0.13.0 # via build pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.8.0 +pip-tools==6.9.0 # via -r requirements.dev.in +pkgutil-resolve-name==1.3.10 + # via jsonschema platformdirs==2.5.2 # via # black # virtualenv pre-commit==2.20.0 # via -r requirements.dev.in -prometheus-client==0.14.1 +prometheus-client==0.15.0 # via # jupyter-server # nbclassic # notebook -prompt-toolkit==3.0.30 +prompt-toolkit==3.0.31 # via ipython -psutil==5.9.1 +psutil==5.9.3 # via ipykernel ptyprocess==0.7.0 # via @@ -225,7 +229,7 @@ pycparser==2.21 # via cffi pyflakes==2.4.0 # via flake8 -pygments==2.12.0 +pygments==2.13.0 # via # ipython # nbconvert @@ -235,13 +239,13 @@ pyrsistent==0.18.1 # via jsonschema python-dateutil==2.8.2 # via jupyter-client -pytz==2022.1 +pytz==2022.5 # via babel pyyaml==6.0 # via # detect-secrets # pre-commit -pyzmq==23.2.0 +pyzmq==24.0.1 # via # ipykernel # jupyter-client @@ -262,18 +266,18 @@ six==1.16.0 # asttokens # bleach # python-dateutil -sniffio==1.2.0 +sniffio==1.3.0 # via anyio soupsieve==2.3.2.post1 # via beautifulsoup4 -stack-data==0.3.0 +stack-data==0.6.0 # via ipython -terminado==0.15.0 +terminado==0.17.0 # via # jupyter-server # nbclassic # notebook -tinycss2==1.1.1 +tinycss2==1.2.1 # via nbconvert toml==0.10.2 # via pre-commit @@ -281,6 +285,7 @@ tomli==2.0.1 # via # black # build + # jupyterlab # pep517 tornado==6.2 # via @@ -291,7 +296,7 @@ tornado==6.2 # nbclassic # notebook # terminado -traitlets==5.3.0 +traitlets==5.5.0 # via # ipykernel # ipython @@ -304,11 +309,11 @@ traitlets==5.3.0 # nbconvert # nbformat # notebook -typing-extensions==4.3.0 +typing-extensions==4.4.0 # via black -urllib3==1.26.11 +urllib3==1.26.12 # via requests -virtualenv==20.16.1 +virtualenv==20.16.6 # via pre-commit wcwidth==0.2.5 # via prompt-toolkit @@ -316,11 +321,11 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.3.3 +websocket-client==1.4.1 # via jupyter-server wheel==0.37.1 # via pip-tools -zipp==3.8.1 +zipp==3.10.0 # via # importlib-metadata # importlib-resources diff --git a/requirements.in b/requirements.in index 296d654..952e14d 100644 --- a/requirements.in +++ b/requirements.in @@ -1 +1,10 @@ -numpy \ No newline at end of file +mariadb +numpy +pandas +plotly +pyspark +pyspark-stubs +seaborn +scikit-learn +sqlalchemy +statsmodels \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 
b000696..5aada87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,83 @@ # # pip-compile --output-file=requirements.txt requirements.in # -numpy==1.23.1 +contourpy==1.0.5 + # via matplotlib +cycler==0.11.0 + # via matplotlib +fonttools==4.38.0 + # via matplotlib +greenlet==1.1.3.post0 + # via sqlalchemy +joblib==1.2.0 + # via scikit-learn +kiwisolver==1.4.4 + # via matplotlib +mariadb==1.1.4 # via -r requirements.in +matplotlib==3.6.0 + # via seaborn +numpy==1.23.4 + # via + # -r requirements.in + # contourpy + # matplotlib + # pandas + # patsy + # scikit-learn + # scipy + # seaborn + # statsmodels +packaging==21.3 + # via + # matplotlib + # statsmodels +pandas==1.5.1 + # via + # -r requirements.in + # seaborn + # statsmodels +patsy==0.5.3 + # via statsmodels +pillow==9.3.0 + # via matplotlib +plotly==5.11.0 + # via -r requirements.in +py4j==0.10.9 + # via pyspark +pyparsing==3.0.9 + # via + # matplotlib + # packaging +pyspark==3.0.3 + # via + # -r requirements.in + # pyspark-stubs +pyspark-stubs==3.0.0.post3 + # via -r requirements.in +python-dateutil==2.8.2 + # via + # matplotlib + # pandas +pytz==2022.5 + # via pandas +scikit-learn==1.1.3 + # via -r requirements.in +scipy==1.9.3 + # via + # scikit-learn + # statsmodels +seaborn==0.12.1 + # via -r requirements.in +six==1.16.0 + # via + # patsy + # python-dateutil +sqlalchemy==1.4.42 + # via -r requirements.in +statsmodels==0.13.2 + # via -r requirements.in +tenacity==8.1.0 + # via plotly +threadpoolctl==3.1.0 + # via scikit-learn
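The heatmap at the end of the second commit hands `px.imshow` the 1-D "Pearsons R" column plus raw Series objects as axis labels, so it cannot produce a predictor-by-predictor grid. A sketch of pivoting the pairwise results into a square matrix first, assuming the `cont_cont_df` columns ("Predictor 1", "Predictor 2", "Pearsons R") built by `type_sep` above:

# Sketch: pivot the pairwise Pearson results into a square matrix before plotting.
# Assumes cont_cont_df comes from type_sep above; the mirrored triangle and the
# unit diagonal are filled in here since type_sep only records each pair once.
corr_matrix = cont_cont_df.pivot(index="Predictor 1", columns="Predictor 2", values="Pearsons R")
corr_matrix = corr_matrix.combine_first(corr_matrix.T)  # mirror the recorded triangle
for col in corr_matrix.columns:
    if col in corr_matrix.index:
        corr_matrix.loc[col, col] = 1.0  # self-correlation on the diagonal

heatmap = px.imshow(
    corr_matrix,
    labels=dict(x="Predictor 2", y="Predictor 1", color="Pearson's r"),
    zmin=-1,
    zmax=1,
)
heatmap.show()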