From 9f0119c41821d0461a12ad1936bcc474960f55f5 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Wed, 12 Oct 2022 23:17:07 -0700 Subject: [PATCH 1/4] Delete bda602_hw3.py --- bda602_hw3.py | 94 --------------------------------------------------- 1 file changed, 94 deletions(-) delete mode 100644 bda602_hw3.py diff --git a/bda602_hw3.py b/bda602_hw3.py deleted file mode 100644 index 2b3399d..0000000 --- a/bda602_hw3.py +++ /dev/null @@ -1,94 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark import keyword_only -from pyspark.ml.param.shared import HasInputCols, HasOutputCol -from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable -from pyspark.ml import Pipeline, Transformer - - -class SplitColumnTransform( - Transformer, - HasInputCols, - HasOutputCol, - DefaultParamsReadable, - DefaultParamsWritable, -): - @keyword_only - def __init__(self, inputCols=None, outputCol=None): - super(SplitColumnTransform, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - return - - @keyword_only - def setParams(self, inputCols=None, outputCol=None): - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _transform(self, dataset): - input_cols = self.getInputCols() - output_col = self.getOutputCol() - - return dataset.show() - - -def main(): - appName = "App" - master = "local[*]" - spark = ( - SparkSession.builder.appName(appName) - .master(master) - .config( - "spark.jars", - "/mnt/c/Users/thoma/scr/PythonProjectTemplate/mariadb-java-client-3.0.8.jar", - ) - .enableHiveSupport() - .getOrCreate() - ) - - sql = ( - """SELECT bc.batter,bc.Hit, bc.atBat,g.game_id, g.local_dateFROM batter_counts bc, - SUM(nb.Hit) AS total_h,SUM(nb.atBat) as total_ab,(SUM(nb.Hit) / SUM(nb.atBat)) AS rolling_avg - JOIN game g - ON g.game_id = bc.game_id - order by bc.batter, bc.game_id""" - - ) - database = "baseball" - user = "" - password = "" - server = "127.0.0.1" - port = 3306 - jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme" - jdbc_driver = "org.mariadb.jdbc.Driver" - - df = ( - spark.read.format("jdbc") - .option("url", jdbc_url) - .option("query", sql) - .option("user", user) - .option("password", password) - .option("trustServerCertificate", True) - .option("driver", jdbc_driver) - .load() - ) - df.show(5) - df.printSchema() - - df.createOrReplaceTempView("rolling_avg") - df2 = spark.sql("""select batter, game_id, SUM(Hit) AS total_h,SUM(nb.atBat) - as total_ab,(SUM(nb.Hit) / SUM(nb.atBat)) AS rolling_avg - where nb.local_date >= 2012-03-20 00:00:00.000 and nb2.local_date < 2012-06-28 22:15:00.000 - GROUP by nb.batter,nb.local_date""" - ) - - - - new_transform = SplitColumnTransform() - pipeline = Pipeline(stages=[new_transform]) - model = pipeline.fit(df2) - model.transform(df2) - - -if __name__ == "__main__": - main() -# From 5108656bf0525af5bc13e5b22d5abe7aecce8483 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Wed, 12 Oct 2022 23:17:50 -0700 Subject: [PATCH 2/4] wednesday_commit --- bda602_hw4.py | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 bda602_hw4.py diff --git a/bda602_hw4.py b/bda602_hw4.py new file mode 100644 index 0000000..7d9b30f --- /dev/null +++ b/bda602_hw4.py @@ -0,0 +1,210 @@ +import pandas as pd +import sys +import numpy +from plotly import express as px +from plotly import figure_factory as ff +from plotly import graph_objects as go +from sklearn.metrics import 
confusion_matrix + +titanic_df = pd.read_csv( + "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") + + +## this is where I will separate the column of an inputted dataset into categorical and continious. +def columnsep(): + + +cat_df = [] +cont_df = [] +for col in titanic_df: + if titanic_df[col].dtypes == "object" or titanic_df[col].dtypes == "bool": + extracted_cat = titanic_df[col] + cat_df = cat_df.append(extracted_cat) + else: + extracted_cont = titanic_df[col] + cont_df = cont_df.append(extracted_cont) + + +def cont_resp_cat_predictor(dataframe, cat_predictor, labels): + # Add histogram data + + # Group data together + + group_labels = dataframe[cat_predictor].values + + # Create distribution plot with custom bin_size + fig_1 = ff.create_distplot(dataframe, group_labels, bin_size=0.2) + fig_1.update_layout( + title="Continuous Response by Categorical Predictor", + xaxis_title="Response", + yaxis_title="Distribution", + ) + fig_1.show() + fig_1.write_html( + file="../../../plots/lecture_6_cont_response_cat_predictor_dist_plot.html", + include_plotlyjs="cdn", + ) + + fig_2 = go.Figure() + for curr_hist, curr_group in zip(dataframe, group_labels): + fig_2.add_trace( + go.Violin( + x=numpy.repeat(curr_group), + y=dataframe, + name=curr_group, + box_visible=True, + meanline_visible=True, + ) + ) + fig_2.update_layout( + title="Continuous Response by Categorical Predictor", + xaxis_title="Groupings", + yaxis_title="Response", + ) + fig_2.show() + + return + + +def cat_resp_cont_predictor(dataframe, cont_predictor): + # Group data together + hist_data = [x1, + + group_labels = ["Response = 0", "Response = 1"] + + # Create distribution plot with custom bin_size + fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2) + fig_1.update_layout( + title="Continuous Predictor by Categorical Response", + xaxis_title="Predictor", + yaxis_title="Distribution", + ) + fig_1.show() + fig_1.write_html( + file="../../../plots/lecture_6_cat_response_cont_predictor_dist_plot.html", + include_plotlyjs="cdn", + ) + + fig_2 = go.Figure() + for curr_hist, curr_group in zip(hist_data, group_labels): + fig_2.add_trace( + go.Violin( + x=numpy.repeat(curr_group, n), + y=curr_hist, + name=curr_group, + box_visible=True, + meanline_visible=True, + ) + ) + fig_2.update_layout( + title="Continuous Predictor by Categorical Response", + xaxis_title="Response", + yaxis_title="Predictor", + ) + fig_2.show() + fig_2.write_html( + file="../../../plots/lecture_6_cat_response_cont_predictor_violin_plot.html", + include_plotlyjs="cdn", + ) + return + + +def cat_response_cat_predictor(dataframe, cat_predictor): + conf_matrix = confusion_matrix(x_2, y_2) + + fig_no_relationship = go.Figure( + data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) + ) + fig_no_relationship.update_layout( + title="Categorical Predictor by Categorical Response (without relationship)", + xaxis_title="Response", + yaxis_title="Predictor", + ) + fig_no_relationship.show() + fig_no_relationship.write_html( + file="../../../plots/lecture_6_cat_response_cat_predictor_heat_map_no_relation.html", + include_plotlyjs="cdn", + ) + + x = numpy.random.randn(n) + y = x + numpy.random.randn(n) + + x_2 = [1 if abs(x_) > 1.5 else 0 for x_ in x] + y_2 = [1 if abs(y_) > 1.5 else 0 for y_ in y] + + conf_matrix = confusion_matrix(x_2, y_2) + + fig_no_relationship = go.Figure( + data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) + ) + fig_no_relationship.update_layout( + title="Categorical Predictor by Categorical 
Response (with relationship)", + xaxis_title="Response", + yaxis_title="Predictor", + ) + fig_no_relationship.show() + fig_no_relationship.write_html( + file="../../../plots/lecture_6_cat_response_cat_predictor_heat_map_yes_relation.html", + include_plotlyjs="cdn", + ) + return + + +def cont_response_cont_predictor(dataframe, cont_predictor): + fig = px.scatter(x=x, y=y, trendline="ols") + fig.update_layout( + title="Continuous Response by Continuous Predictor", + xaxis_title="Predictor", + yaxis_title="Response", + ) + fig.show() + fig.write_html( + file="../../../plots/lecture_6_cont_response_cont_predictor_scatter_plot.html", + include_plotlyjs="cdn", + ) + + return + + +def datasetregression(): + titanic_df = pd.read_csv( + "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") + X = titanic_df.data + y = titanic_df.target + + for idx, column in enumerate(X.T): + feature_name = diabetes.feature_names[idx] + predictor = statsmodels.api.add_constant(column) + linear_regression_model = statsmodels.api.OLS(y, predictor) + linear_regression_model_fitted = linear_regression_model.fit() + print(f"Variable: {feature_name}") + print(linear_regression_model_fitted.summary()) + + # Get the stats + t_value = round(linear_regression_model_fitted.tvalues[1], 6) + p_value = "{:.6e}".format(linear_regression_model_fitted.pvalues[1]) + + # Plot the figure + fig = px.scatter(x=column, y=y, trendline="ols") + fig.update_layout( + title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})", + xaxis_title=f"Variable: {feature_name}", + yaxis_title="y", + ) + fig.show() + fig.write_html( + file=f"../../plots/lecture_6_var_{idx}.html", include_plotlyjs="cdn" + ) + + +def main(): + cont_resp_cat_predictor() + cat_resp_cont_predictor() + cat_response_cat_predictor() + cont_response_cont_predictor() + datasetregression() + return + + +if __name__ == "__main__": + sys.exit(main()) From c3eb6615683bcbe42a985fc58a9e50fdf983e018 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Sat, 15 Oct 2022 00:04:28 -0700 Subject: [PATCH 3/4] hw4_final unless I add another before you download it --- bda602_hw4.py | 222 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 177 insertions(+), 45 deletions(-) diff --git a/bda602_hw4.py b/bda602_hw4.py index 7d9b30f..29fa122 100644 --- a/bda602_hw4.py +++ b/bda602_hw4.py @@ -1,55 +1,60 @@ +from io import StringIO + import pandas as pd import sys import numpy +import statsmodels as statsmodels from plotly import express as px from plotly import figure_factory as ff from plotly import graph_objects as go from sklearn.metrics import confusion_matrix +from sklearn.model_selection import GridSearchCV +from sklearn.tree import DecisionTreeClassifier, export_graphviz +import random -titanic_df = pd.read_csv( - "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") +def column_sep(dataframe): + cat_df = pd.DataFrame() + cont_df = pd.DataFrame() + for col in dataframe: + if dataframe[col].dtypes == "bool" or dataframe[col].dtypes == "object" or len(pd.unique(dataframe[col])) == 2: + extracted_cat = dataframe[col] + cat_df = cat_df.append(extracted_cat) + else: + extracted_cont = dataframe[col] + cont_df = cont_df.append(extracted_cont) -## this is where I will separate the column of an inputted dataset into categorical and continious. 
-def columnsep(): + return cat_df, cont_df -cat_df = [] -cont_df = [] -for col in titanic_df: - if titanic_df[col].dtypes == "object" or titanic_df[col].dtypes == "bool": - extracted_cat = titanic_df[col] - cat_df = cat_df.append(extracted_cat) - else: - extracted_cont = titanic_df[col] - cont_df = cont_df.append(extracted_cont) +def get_column_names(dataframe): + cat_df, cont_df = column_sep(dataframe) + return cat_df.info, cont_df.info -def cont_resp_cat_predictor(dataframe, cat_predictor, labels): - # Add histogram data +def cont_resp_cat_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + hist_data = cat_df.iloc[:, n] + group_labels = cont_df.iloc[:, n].values # Group data together - group_labels = dataframe[cat_predictor].values - # Create distribution plot with custom bin_size - fig_1 = ff.create_distplot(dataframe, group_labels, bin_size=0.2) + fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2) fig_1.update_layout( title="Continuous Response by Categorical Predictor", xaxis_title="Response", yaxis_title="Distribution", ) fig_1.show() - fig_1.write_html( - file="../../../plots/lecture_6_cont_response_cat_predictor_dist_plot.html", - include_plotlyjs="cdn", - ) fig_2 = go.Figure() - for curr_hist, curr_group in zip(dataframe, group_labels): + for curr_hist, curr_group in zip(hist_data, group_labels): fig_2.add_trace( go.Violin( - x=numpy.repeat(curr_group), + x=numpy.repeat(curr_group, n), y=dataframe, name=curr_group, box_visible=True, @@ -63,14 +68,16 @@ def cont_resp_cat_predictor(dataframe, cat_predictor, labels): ) fig_2.show() - return + return fig_1, fig_2 -def cat_resp_cont_predictor(dataframe, cont_predictor): +def cat_resp_cont_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + hist_data = cont_df.iloc[:, n] + group_labels = cat_df.iloc[:, n].values # Group data together - hist_data = [x1, - - group_labels = ["Response = 0", "Response = 1"] # Create distribution plot with custom bin_size fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2) @@ -109,8 +116,15 @@ def cat_resp_cont_predictor(dataframe, cont_predictor): return -def cat_response_cat_predictor(dataframe, cat_predictor): - conf_matrix = confusion_matrix(x_2, y_2) +def cat_response_cat_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + x = cat_df.iloc[:, n].values + y = cont_df.iloc[:, n].values + # Group data together + + conf_matrix = confusion_matrix(x, y) fig_no_relationship = go.Figure( data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) @@ -126,13 +140,7 @@ def cat_response_cat_predictor(dataframe, cat_predictor): include_plotlyjs="cdn", ) - x = numpy.random.randn(n) - y = x + numpy.random.randn(n) - - x_2 = [1 if abs(x_) > 1.5 else 0 for x_ in x] - y_2 = [1 if abs(y_) > 1.5 else 0 for y_ in y] - - conf_matrix = confusion_matrix(x_2, y_2) + conf_matrix = confusion_matrix(x,y) fig_no_relationship = go.Figure( data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) @@ -150,7 +158,14 @@ def cat_response_cat_predictor(dataframe, cat_predictor): return -def cont_response_cont_predictor(dataframe, cont_predictor): +def cont_response_cont_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + x = cat_df.iloc[:, n] + y = cont_df.iloc[:, n].values + # Group data together + fig = px.scatter(x=x, y=y, trendline="ols") 
fig.update_layout( title="Continuous Response by Continuous Predictor", @@ -166,14 +181,18 @@ def cont_response_cont_predictor(dataframe, cont_predictor): return -def datasetregression(): - titanic_df = pd.read_csv( - "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") - X = titanic_df.data - y = titanic_df.target +def datasetregression(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + hist_data = cat_df.iloc[:, n] + group_labels = cont_df.iloc + # Group data together + X = dataframe.data + y = dataframe.target for idx, column in enumerate(X.T): - feature_name = diabetes.feature_names[idx] + feature_name = dataframe.feature_names[idx] predictor = statsmodels.api.add_constant(column) linear_regression_model = statsmodels.api.OLS(y, predictor) linear_regression_model_fitted = linear_regression_model.fit() @@ -197,6 +216,119 @@ def datasetregression(): ) +def plot_decision_tree(decision_tree, feature_names, class_names, file_out): + with StringIO() as dot_data: + export_graphviz( + decision_tree, + feature_names=feature_names, + class_names=class_names, + out_file=dot_data, + filled=True, + ) + graph = pydot.graph_from_dot_data(dot_data.getvalue()) + + +def decision_tree_setup(dataframe): + # Increase pandas print viewport (so we see more on the screen) + pd.set_option("display.max_rows", 60) + pd.set_option("display.max_columns", 500) + pd.set_option("display.width", 1_000) + + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + + column_names = dataframe.columns + + # Drop rows with missing values + dataframe = dataframe.dropna() + + print("Original Dataset") + + # Continuous Features + + X = dataframe[cont_df].values + + # Response + y = dataframe.iloc[:, n].values + + # Decision Tree Classifier + max_tree_depth = 7 + tree_random_state = 0 # Always set a seed + decision_tree = DecisionTreeClassifier( + max_depth=max_tree_depth, random_state=tree_random_state + ) + decision_tree.fit(X, y) + + # Plot the decision tree + plot_decision_tree( + decision_tree=decision_tree, + feature_names=x, + class_names="classification", + file_out="../../plots/lecture_6_iris_tree_full", + ) + + # Find an optimal tree via cross-validation + parameters = { + "max_depth": range(1, max_tree_depth), + "criterion": ["gini", "entropy"], + } + decision_tree_grid_search = GridSearchCV( + DecisionTreeClassifier(random_state=tree_random_state), parameters, n_jobs=4 + ) + decision_tree_grid_search.fit(X=X, y=y) + + cv_results = DataFrame(decision_tree_grid_search.cv_results_["params"]) + cv_results["score"] = decision_tree_grid_search.cv_results_["mean_test_score"] + print_heading("Cross validation results") + print(cv_results) + print_heading("Cross validation results - HTML table") + print(cv_results.to_html()) + + # Plot these cross_val results + gini_results = cv_results.loc[cv_results["criterion"] == "gini"] + entropy_results = cv_results.loc[cv_results["criterion"] == "entropy"] + data = [ + go.Scatter( + x=gini_results["max_depth"].values, + y=gini_results["score"].values, + name="gini", + mode="lines", + ), + go.Scatter( + x=entropy_results["max_depth"].values, + y=entropy_results["score"].values, + name="entropy", + mode="lines", + ), + ] + + layout = go.Layout( + title="Fisher's Iris Cross Validation", + xaxis_title="Tree Depth", + yaxis_title="Score", + ) + + fig = go.Figure(data=data, layout=layout) + fig.show() + fig.write_html( + file="../../plots/lecture_6_iris_cross_val.html", + 
include_plotlyjs="cdn", + ) + + # Get the "best" model + best_tree_model = decision_tree_grid_search.best_estimator_ + + # Plot this "best" decision tree + plot_decision_tree( + decision_tree=best_tree_model, + feature_names=continuous_features, + class_names="classification", + file_out="../../plots/lecture_6_iris_tree_cross_val", + ) + return + + def main(): cont_resp_cat_predictor() cat_resp_cont_predictor() From e6397243ad625a64a20ca28df773cdf1077398ea Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Sat, 15 Oct 2022 00:13:55 -0700 Subject: [PATCH 4/4] new_final_commit unless I have a revelation. --- bda602_hw4.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bda602_hw4.py b/bda602_hw4.py index 29fa122..dd1be04 100644 --- a/bda602_hw4.py +++ b/bda602_hw4.py @@ -263,7 +263,7 @@ def decision_tree_setup(dataframe): # Plot the decision tree plot_decision_tree( decision_tree=decision_tree, - feature_names=x, + feature_names=X, class_names="classification", file_out="../../plots/lecture_6_iris_tree_full", ) @@ -278,11 +278,11 @@ def decision_tree_setup(dataframe): ) decision_tree_grid_search.fit(X=X, y=y) - cv_results = DataFrame(decision_tree_grid_search.cv_results_["params"]) + cv_results = dataframe(decision_tree_grid_search.cv_results_["params"]) cv_results["score"] = decision_tree_grid_search.cv_results_["mean_test_score"] - print_heading("Cross validation results") + print("Cross validation results") print(cv_results) - print_heading("Cross validation results - HTML table") + print("Cross validation results - HTML table") print(cv_results.to_html()) # Plot these cross_val results @@ -328,13 +328,15 @@ def decision_tree_setup(dataframe): ) return - +### column sep separates the cat from the cont columns def main(): cont_resp_cat_predictor() cat_resp_cont_predictor() cat_response_cat_predictor() cont_response_cont_predictor() datasetregression() + get_column_names() + column_sep() return
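
The deleted bda602_hw3.py in PATCH 1/4 defines a SplitColumnTransform whose _transform only calls dataset.show() (which returns None), and its SQL is broken: the first query fuses g.local_dateFROM and mixes aggregates into the table list, and the second has no FROM clause and unquoted timestamp literals. A minimal corrected sketch of the same idea follows; the JDBC credentials are placeholders, the concat_ws output column is an illustrative choice of transform, and the batter_counts/game column names are taken from the patch.

from pyspark import keyword_only
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.param.shared import HasInputCols, HasOutputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws


class SplitColumnTransform(
    Transformer,
    HasInputCols,
    HasOutputCol,
    DefaultParamsReadable,
    DefaultParamsWritable,
):
    """Concatenate the configured input columns into a single string output column."""

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        super().__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        # Return a transformed DataFrame; the deleted version returned dataset.show(), i.e. None.
        return dataset.withColumn(
            self.getOutputCol(),
            concat_ws("_", *[col(c).cast("string") for c in self.getInputCols()]),
        )


def main():
    spark = SparkSession.builder.appName("rolling_batting_average").getOrCreate()

    # Placeholder JDBC settings -- point these at the local MariaDB baseball database.
    base_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://127.0.0.1:3306/baseball?permitMysqlScheme")
        .option(
            "query",
            "SELECT bc.batter, bc.Hit, bc.atBat, g.game_id, g.local_date "
            "FROM batter_counts bc JOIN game g ON g.game_id = bc.game_id",
        )
        .option("user", "user")
        .option("password", "password")
        .option("driver", "org.mariadb.jdbc.Driver")
        .load()
    )
    base_df.createOrReplaceTempView("batter_game")

    # Per-batter batting average over a quoted date window (the FROM clause and
    # GROUP BY were missing in the deleted queries).
    avg_df = spark.sql(
        """
        SELECT batter,
               SUM(Hit) AS total_h,
               SUM(atBat) AS total_ab,
               SUM(Hit) / NULLIF(SUM(atBat), 0) AS batting_avg
        FROM batter_game
        WHERE local_date >= '2012-03-20' AND local_date < '2012-06-28'
        GROUP BY batter
        """
    )
    avg_df.show(5)

    pipeline = Pipeline(
        stages=[
            SplitColumnTransform(inputCols=["batter", "game_id"], outputCol="batter_game_key")
        ]
    )
    pipeline.fit(base_df).transform(base_df).show(5)


if __name__ == "__main__":
    main()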
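
Patches 2-4 split the Titanic columns into categorical and continuous sets, but columnsep() in PATCH 2/4 has an empty body with the loop left at module level, and the later column_sep builds its outputs by appending Series with DataFrame.append, which pandas has deprecated (and removed in 2.x) and which stacks them as rows rather than columns. A sketch that keeps the object/bool/two-unique-values rule from PATCH 3/4 and returns column subsets:

import pandas as pd


def column_sep(dataframe):
    """Split a DataFrame's columns into categorical and continuous subsets."""
    cat_cols, cont_cols = [], []
    for col in dataframe.columns:
        # object/bool dtypes and two-valued columns are treated as categorical,
        # mirroring the rule in the PATCH 3/4 version.
        if (
            dataframe[col].dtype == "object"
            or dataframe[col].dtype == "bool"
            or dataframe[col].nunique() == 2
        ):
            cat_cols.append(col)
        else:
            cont_cols.append(col)
    return dataframe[cat_cols], dataframe[cont_cols]


if __name__ == "__main__":
    titanic_df = pd.read_csv(
        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
    )
    cat_df, cont_df = column_sep(titanic_df)
    print(list(cat_df.columns), list(cont_df.columns))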
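
cat_resp_cont_predictor in PATCH 2/4 is cut off at `hist_data = [x1,`, and the PATCH 3/4 version passes a single Series and raw column values where ff.create_distplot expects a list of value arrays plus a matching list of group names; numpy.repeat is also called without a repeat count. One way this could be written, with the response and predictor passed in by name rather than picked by a random column index (an illustrative signature, not the patches'):

import numpy as np
from plotly import figure_factory as ff
from plotly import graph_objects as go


def cat_resp_cont_predictor(dataframe, response, predictor, bin_size=0.2):
    """Distribution and violin plots of a continuous predictor grouped by a categorical response."""
    group_labels, hist_data = [], []
    for name, grp in dataframe[[response, predictor]].dropna().groupby(response):
        group_labels.append(str(name))
        hist_data.append(grp[predictor].values)

    # create_distplot expects one array of values per group plus one label per group.
    fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=bin_size)
    fig_1.update_layout(
        title="Continuous Predictor by Categorical Response",
        xaxis_title=predictor,
        yaxis_title="Distribution",
    )
    fig_1.show()

    fig_2 = go.Figure()
    for values, label in zip(hist_data, group_labels):
        fig_2.add_trace(
            go.Violin(
                x=np.repeat(label, len(values)),
                y=values,
                name=label,
                box_visible=True,
                meanline_visible=True,
            )
        )
    fig_2.update_layout(
        title="Continuous Predictor by Categorical Response",
        xaxis_title=response,
        yaxis_title=predictor,
    )
    fig_2.show()
    return fig_1, fig_2


# Example: cat_resp_cont_predictor(titanic_df, "survived", "age", bin_size=2)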
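
cat_response_cat_predictor in PATCH 3/4 feeds one categorical and one continuous column to sklearn's confusion_matrix, which expects two label sequences drawn from the same class set. If the goal is simply a heatmap of counts between two categorical columns, pd.crosstab is a safer fit; this sketch swaps it in and is not the patch's approach:

import pandas as pd
from plotly import graph_objects as go


def cat_response_cat_predictor(dataframe, response, predictor):
    """Heatmap of the cross-tabulation of two categorical columns."""
    clean = dataframe[[response, predictor]].dropna()
    counts = pd.crosstab(clean[predictor], clean[response])

    fig = go.Figure(
        data=go.Heatmap(
            z=counts.values,
            x=[str(c) for c in counts.columns],
            y=[str(r) for r in counts.index],
            zmin=0,
            zmax=counts.values.max(),
        )
    )
    fig.update_layout(
        title="Categorical Predictor by Categorical Response",
        xaxis_title=response,
        yaxis_title=predictor,
    )
    fig.show()
    return fig


# Example: cat_response_cat_predictor(titanic_df, "survived", "sex")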
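
datasetregression reads the Titanic CSV but then uses .data, .target, and feature_names, which belong to sklearn Bunch datasets rather than pandas DataFrames, and statsmodels.api is never imported (PATCH 3/4 imports only the bare statsmodels package). A per-predictor OLS sketch against a named response column; OLS on a 0/1 response is kept only to mirror the patch, although logistic regression would usually be the better model:

import statsmodels.api as sm
from plotly import express as px


def dataset_regression(dataframe, response, predictors):
    """Fit a simple OLS of the response on each predictor and report t- and p-values."""
    clean = dataframe[[response] + list(predictors)].dropna()
    y = clean[response].values

    for feature_name in predictors:
        column = clean[feature_name].values
        predictor = sm.add_constant(column)
        fitted = sm.OLS(y, predictor).fit()
        print(f"Variable: {feature_name}")
        print(fitted.summary())

        t_value = round(fitted.tvalues[1], 6)
        p_value = "{:.6e}".format(fitted.pvalues[1])

        fig = px.scatter(x=column, y=y, trendline="ols")
        fig.update_layout(
            title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
            xaxis_title=feature_name,
            yaxis_title=response,
        )
        fig.show()


# Example: dataset_regression(titanic_df, "survived", ["age", "fare"])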
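
decision_tree_setup wraps cv_results_["params"] in DataFrame (PATCH 3/4) and then dataframe (PATCH 4/4) without ever importing pandas' constructor, references names like continuous_features that are never defined, and passes the raw X matrix as feature_names. A self-contained grid-search sketch over the same max_depth/criterion grid; the tree-plotting step is left out, and the column names in the usage line are illustrative:

import pandas as pd
from plotly import graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier


def decision_tree_cross_val(dataframe, response, predictors, max_tree_depth=7):
    """Grid-search a decision tree over depth and criterion, then plot mean CV score by depth."""
    clean = dataframe[[response] + list(predictors)].dropna()
    X = clean[list(predictors)].values
    y = clean[response].values

    parameters = {
        "max_depth": list(range(1, max_tree_depth)),
        "criterion": ["gini", "entropy"],
    }
    grid_search = GridSearchCV(
        DecisionTreeClassifier(random_state=0), parameters, n_jobs=4
    )
    grid_search.fit(X=X, y=y)

    # cv_results_["params"] is a list of dicts, so build the table with pandas.
    cv_results = pd.DataFrame(grid_search.cv_results_["params"])
    cv_results["score"] = grid_search.cv_results_["mean_test_score"]
    print("Cross validation results")
    print(cv_results)

    data = [
        go.Scatter(
            x=cv_results.loc[cv_results["criterion"] == criterion, "max_depth"].values,
            y=cv_results.loc[cv_results["criterion"] == criterion, "score"].values,
            name=criterion,
            mode="lines",
        )
        for criterion in ["gini", "entropy"]
    ]
    fig = go.Figure(
        data=data,
        layout=go.Layout(
            title="Decision Tree Cross Validation",
            xaxis_title="Tree Depth",
            yaxis_title="Score",
        ),
    )
    fig.show()
    return grid_search.best_estimator_


# Example: best_tree = decision_tree_cross_val(titanic_df, "survived", ["age", "fare", "pclass"])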