From 9f0119c41821d0461a12ad1936bcc474960f55f5 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Wed, 12 Oct 2022 23:17:07 -0700 Subject: [PATCH 1/4] Delete bda602_hw3.py --- bda602_hw3.py | 94 --------------------------------------------------- 1 file changed, 94 deletions(-) delete mode 100644 bda602_hw3.py diff --git a/bda602_hw3.py b/bda602_hw3.py deleted file mode 100644 index 2b3399d..0000000 --- a/bda602_hw3.py +++ /dev/null @@ -1,94 +0,0 @@ -from pyspark.sql import SparkSession -from pyspark import keyword_only -from pyspark.ml.param.shared import HasInputCols, HasOutputCol -from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable -from pyspark.ml import Pipeline, Transformer - - -class SplitColumnTransform( - Transformer, - HasInputCols, - HasOutputCol, - DefaultParamsReadable, - DefaultParamsWritable, -): - @keyword_only - def __init__(self, inputCols=None, outputCol=None): - super(SplitColumnTransform, self).__init__() - kwargs = self._input_kwargs - self.setParams(**kwargs) - return - - @keyword_only - def setParams(self, inputCols=None, outputCol=None): - kwargs = self._input_kwargs - return self._set(**kwargs) - - def _transform(self, dataset): - input_cols = self.getInputCols() - output_col = self.getOutputCol() - - return dataset.show() - - -def main(): - appName = "App" - master = "local[*]" - spark = ( - SparkSession.builder.appName(appName) - .master(master) - .config( - "spark.jars", - "/mnt/c/Users/thoma/scr/PythonProjectTemplate/mariadb-java-client-3.0.8.jar", - ) - .enableHiveSupport() - .getOrCreate() - ) - - sql = ( - """SELECT bc.batter,bc.Hit, bc.atBat,g.game_id, g.local_dateFROM batter_counts bc, - SUM(nb.Hit) AS total_h,SUM(nb.atBat) as total_ab,(SUM(nb.Hit) / SUM(nb.atBat)) AS rolling_avg - JOIN game g - ON g.game_id = bc.game_id - order by bc.batter, bc.game_id""" - - ) - database = "baseball" - user = "" - password = "" - server = "127.0.0.1" - port = 3306 - jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme" - jdbc_driver = "org.mariadb.jdbc.Driver" - - df = ( - spark.read.format("jdbc") - .option("url", jdbc_url) - .option("query", sql) - .option("user", user) - .option("password", password) - .option("trustServerCertificate", True) - .option("driver", jdbc_driver) - .load() - ) - df.show(5) - df.printSchema() - - df.createOrReplaceTempView("rolling_avg") - df2 = spark.sql("""select batter, game_id, SUM(Hit) AS total_h,SUM(nb.atBat) - as total_ab,(SUM(nb.Hit) / SUM(nb.atBat)) AS rolling_avg - where nb.local_date >= 2012-03-20 00:00:00.000 and nb2.local_date < 2012-06-28 22:15:00.000 - GROUP by nb.batter,nb.local_date""" - ) - - - - new_transform = SplitColumnTransform() - pipeline = Pipeline(stages=[new_transform]) - model = pipeline.fit(df2) - model.transform(df2) - - -if __name__ == "__main__": - main() -# From 5108656bf0525af5bc13e5b22d5abe7aecce8483 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Wed, 12 Oct 2022 23:17:50 -0700 Subject: [PATCH 2/4] wednesday_commit --- bda602_hw4.py | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 bda602_hw4.py diff --git a/bda602_hw4.py b/bda602_hw4.py new file mode 100644 index 0000000..7d9b30f --- /dev/null +++ b/bda602_hw4.py @@ -0,0 +1,210 @@ +import pandas as pd +import sys +import numpy +from plotly import express as px +from plotly import figure_factory as ff +from plotly import graph_objects as go +from sklearn.metrics import 
confusion_matrix + +titanic_df = pd.read_csv( + "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") + + +## this is where I will separate the column of an inputted dataset into categorical and continious. +def columnsep(): + + +cat_df = [] +cont_df = [] +for col in titanic_df: + if titanic_df[col].dtypes == "object" or titanic_df[col].dtypes == "bool": + extracted_cat = titanic_df[col] + cat_df = cat_df.append(extracted_cat) + else: + extracted_cont = titanic_df[col] + cont_df = cont_df.append(extracted_cont) + + +def cont_resp_cat_predictor(dataframe, cat_predictor, labels): + # Add histogram data + + # Group data together + + group_labels = dataframe[cat_predictor].values + + # Create distribution plot with custom bin_size + fig_1 = ff.create_distplot(dataframe, group_labels, bin_size=0.2) + fig_1.update_layout( + title="Continuous Response by Categorical Predictor", + xaxis_title="Response", + yaxis_title="Distribution", + ) + fig_1.show() + fig_1.write_html( + file="../../../plots/lecture_6_cont_response_cat_predictor_dist_plot.html", + include_plotlyjs="cdn", + ) + + fig_2 = go.Figure() + for curr_hist, curr_group in zip(dataframe, group_labels): + fig_2.add_trace( + go.Violin( + x=numpy.repeat(curr_group), + y=dataframe, + name=curr_group, + box_visible=True, + meanline_visible=True, + ) + ) + fig_2.update_layout( + title="Continuous Response by Categorical Predictor", + xaxis_title="Groupings", + yaxis_title="Response", + ) + fig_2.show() + + return + + +def cat_resp_cont_predictor(dataframe, cont_predictor): + # Group data together + hist_data = [x1, + + group_labels = ["Response = 0", "Response = 1"] + + # Create distribution plot with custom bin_size + fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2) + fig_1.update_layout( + title="Continuous Predictor by Categorical Response", + xaxis_title="Predictor", + yaxis_title="Distribution", + ) + fig_1.show() + fig_1.write_html( + file="../../../plots/lecture_6_cat_response_cont_predictor_dist_plot.html", + include_plotlyjs="cdn", + ) + + fig_2 = go.Figure() + for curr_hist, curr_group in zip(hist_data, group_labels): + fig_2.add_trace( + go.Violin( + x=numpy.repeat(curr_group, n), + y=curr_hist, + name=curr_group, + box_visible=True, + meanline_visible=True, + ) + ) + fig_2.update_layout( + title="Continuous Predictor by Categorical Response", + xaxis_title="Response", + yaxis_title="Predictor", + ) + fig_2.show() + fig_2.write_html( + file="../../../plots/lecture_6_cat_response_cont_predictor_violin_plot.html", + include_plotlyjs="cdn", + ) + return + + +def cat_response_cat_predictor(dataframe, cat_predictor): + conf_matrix = confusion_matrix(x_2, y_2) + + fig_no_relationship = go.Figure( + data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) + ) + fig_no_relationship.update_layout( + title="Categorical Predictor by Categorical Response (without relationship)", + xaxis_title="Response", + yaxis_title="Predictor", + ) + fig_no_relationship.show() + fig_no_relationship.write_html( + file="../../../plots/lecture_6_cat_response_cat_predictor_heat_map_no_relation.html", + include_plotlyjs="cdn", + ) + + x = numpy.random.randn(n) + y = x + numpy.random.randn(n) + + x_2 = [1 if abs(x_) > 1.5 else 0 for x_ in x] + y_2 = [1 if abs(y_) > 1.5 else 0 for y_ in y] + + conf_matrix = confusion_matrix(x_2, y_2) + + fig_no_relationship = go.Figure( + data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) + ) + fig_no_relationship.update_layout( + title="Categorical Predictor by Categorical 
Response (with relationship)", + xaxis_title="Response", + yaxis_title="Predictor", + ) + fig_no_relationship.show() + fig_no_relationship.write_html( + file="../../../plots/lecture_6_cat_response_cat_predictor_heat_map_yes_relation.html", + include_plotlyjs="cdn", + ) + return + + +def cont_response_cont_predictor(dataframe, cont_predictor): + fig = px.scatter(x=x, y=y, trendline="ols") + fig.update_layout( + title="Continuous Response by Continuous Predictor", + xaxis_title="Predictor", + yaxis_title="Response", + ) + fig.show() + fig.write_html( + file="../../../plots/lecture_6_cont_response_cont_predictor_scatter_plot.html", + include_plotlyjs="cdn", + ) + + return + + +def datasetregression(): + titanic_df = pd.read_csv( + "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") + X = titanic_df.data + y = titanic_df.target + + for idx, column in enumerate(X.T): + feature_name = diabetes.feature_names[idx] + predictor = statsmodels.api.add_constant(column) + linear_regression_model = statsmodels.api.OLS(y, predictor) + linear_regression_model_fitted = linear_regression_model.fit() + print(f"Variable: {feature_name}") + print(linear_regression_model_fitted.summary()) + + # Get the stats + t_value = round(linear_regression_model_fitted.tvalues[1], 6) + p_value = "{:.6e}".format(linear_regression_model_fitted.pvalues[1]) + + # Plot the figure + fig = px.scatter(x=column, y=y, trendline="ols") + fig.update_layout( + title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})", + xaxis_title=f"Variable: {feature_name}", + yaxis_title="y", + ) + fig.show() + fig.write_html( + file=f"../../plots/lecture_6_var_{idx}.html", include_plotlyjs="cdn" + ) + + +def main(): + cont_resp_cat_predictor() + cat_resp_cont_predictor() + cat_response_cat_predictor() + cont_response_cont_predictor() + datasetregression() + return + + +if __name__ == "__main__": + sys.exit(main()) From c3eb6615683bcbe42a985fc58a9e50fdf983e018 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Sat, 15 Oct 2022 00:04:28 -0700 Subject: [PATCH 3/4] hw4_final unless I add another before you download it --- bda602_hw4.py | 222 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 177 insertions(+), 45 deletions(-) diff --git a/bda602_hw4.py b/bda602_hw4.py index 7d9b30f..29fa122 100644 --- a/bda602_hw4.py +++ b/bda602_hw4.py @@ -1,55 +1,60 @@ +from io import StringIO + import pandas as pd import sys import numpy +import statsmodels as statsmodels from plotly import express as px from plotly import figure_factory as ff from plotly import graph_objects as go from sklearn.metrics import confusion_matrix +from sklearn.model_selection import GridSearchCV +from sklearn.tree import DecisionTreeClassifier, export_graphviz +import random -titanic_df = pd.read_csv( - "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") +def column_sep(dataframe): + cat_df = pd.DataFrame() + cont_df = pd.DataFrame() + for col in dataframe: + if dataframe[col].dtypes == "bool" or dataframe[col].dtypes == "object" or len(pd.unique(dataframe[col])) == 2: + extracted_cat = dataframe[col] + cat_df = cat_df.append(extracted_cat) + else: + extracted_cont = dataframe[col] + cont_df = cont_df.append(extracted_cont) -## this is where I will separate the column of an inputted dataset into categorical and continious. 
-def columnsep(): + return cat_df, cont_df -cat_df = [] -cont_df = [] -for col in titanic_df: - if titanic_df[col].dtypes == "object" or titanic_df[col].dtypes == "bool": - extracted_cat = titanic_df[col] - cat_df = cat_df.append(extracted_cat) - else: - extracted_cont = titanic_df[col] - cont_df = cont_df.append(extracted_cont) +def get_column_names(dataframe): + cat_df, cont_df = column_sep(dataframe) + return cat_df.info, cont_df.info -def cont_resp_cat_predictor(dataframe, cat_predictor, labels): - # Add histogram data +def cont_resp_cat_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + hist_data = cat_df.iloc[:, n] + group_labels = cont_df.iloc[:, n].values # Group data together - group_labels = dataframe[cat_predictor].values - # Create distribution plot with custom bin_size - fig_1 = ff.create_distplot(dataframe, group_labels, bin_size=0.2) + fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2) fig_1.update_layout( title="Continuous Response by Categorical Predictor", xaxis_title="Response", yaxis_title="Distribution", ) fig_1.show() - fig_1.write_html( - file="../../../plots/lecture_6_cont_response_cat_predictor_dist_plot.html", - include_plotlyjs="cdn", - ) fig_2 = go.Figure() - for curr_hist, curr_group in zip(dataframe, group_labels): + for curr_hist, curr_group in zip(hist_data, group_labels): fig_2.add_trace( go.Violin( - x=numpy.repeat(curr_group), + x=numpy.repeat(curr_group, n), y=dataframe, name=curr_group, box_visible=True, @@ -63,14 +68,16 @@ def cont_resp_cat_predictor(dataframe, cat_predictor, labels): ) fig_2.show() - return + return fig_1, fig_2 -def cat_resp_cont_predictor(dataframe, cont_predictor): +def cat_resp_cont_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + hist_data = cont_df.iloc[:, n] + group_labels = cat_df.iloc[:, n].values # Group data together - hist_data = [x1, - - group_labels = ["Response = 0", "Response = 1"] # Create distribution plot with custom bin_size fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2) @@ -109,8 +116,15 @@ def cat_resp_cont_predictor(dataframe, cont_predictor): return -def cat_response_cat_predictor(dataframe, cat_predictor): - conf_matrix = confusion_matrix(x_2, y_2) +def cat_response_cat_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + x = cat_df.iloc[:, n].values + y = cont_df.iloc[:, n].values + # Group data together + + conf_matrix = confusion_matrix(x, y) fig_no_relationship = go.Figure( data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) @@ -126,13 +140,7 @@ def cat_response_cat_predictor(dataframe, cat_predictor): include_plotlyjs="cdn", ) - x = numpy.random.randn(n) - y = x + numpy.random.randn(n) - - x_2 = [1 if abs(x_) > 1.5 else 0 for x_ in x] - y_2 = [1 if abs(y_) > 1.5 else 0 for y_ in y] - - conf_matrix = confusion_matrix(x_2, y_2) + conf_matrix = confusion_matrix(x,y) fig_no_relationship = go.Figure( data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) @@ -150,7 +158,14 @@ def cat_response_cat_predictor(dataframe, cat_predictor): return -def cont_response_cont_predictor(dataframe, cont_predictor): +def cont_response_cont_predictor(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + x = cat_df.iloc[:, n] + y = cont_df.iloc[:, n].values + # Group data together + fig = px.scatter(x=x, y=y, trendline="ols") 
fig.update_layout( title="Continuous Response by Continuous Predictor", @@ -166,14 +181,18 @@ def cont_response_cont_predictor(dataframe, cont_predictor): return -def datasetregression(): - titanic_df = pd.read_csv( - "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") - X = titanic_df.data - y = titanic_df.target +def datasetregression(dataframe): + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + hist_data = cat_df.iloc[:, n] + group_labels = cont_df.iloc + # Group data together + X = dataframe.data + y = dataframe.target for idx, column in enumerate(X.T): - feature_name = diabetes.feature_names[idx] + feature_name = dataframe.feature_names[idx] predictor = statsmodels.api.add_constant(column) linear_regression_model = statsmodels.api.OLS(y, predictor) linear_regression_model_fitted = linear_regression_model.fit() @@ -197,6 +216,119 @@ def datasetregression(): ) +def plot_decision_tree(decision_tree, feature_names, class_names, file_out): + with StringIO() as dot_data: + export_graphviz( + decision_tree, + feature_names=feature_names, + class_names=class_names, + out_file=dot_data, + filled=True, + ) + graph = pydot.graph_from_dot_data(dot_data.getvalue()) + + +def decision_tree_setup(dataframe): + # Increase pandas print viewport (so we see more on the screen) + pd.set_option("display.max_rows", 60) + pd.set_option("display.max_columns", 500) + pd.set_option("display.width", 1_000) + + cat_df, cont_df = column_sep(dataframe) + n = random.randint(0, 8) + # Add histogram data + + column_names = dataframe.columns + + # Drop rows with missing values + dataframe = dataframe.dropna() + + print("Original Dataset") + + # Continuous Features + + X = dataframe[cont_df].values + + # Response + y = dataframe.iloc[:, n].values + + # Decision Tree Classifier + max_tree_depth = 7 + tree_random_state = 0 # Always set a seed + decision_tree = DecisionTreeClassifier( + max_depth=max_tree_depth, random_state=tree_random_state + ) + decision_tree.fit(X, y) + + # Plot the decision tree + plot_decision_tree( + decision_tree=decision_tree, + feature_names=x, + class_names="classification", + file_out="../../plots/lecture_6_iris_tree_full", + ) + + # Find an optimal tree via cross-validation + parameters = { + "max_depth": range(1, max_tree_depth), + "criterion": ["gini", "entropy"], + } + decision_tree_grid_search = GridSearchCV( + DecisionTreeClassifier(random_state=tree_random_state), parameters, n_jobs=4 + ) + decision_tree_grid_search.fit(X=X, y=y) + + cv_results = DataFrame(decision_tree_grid_search.cv_results_["params"]) + cv_results["score"] = decision_tree_grid_search.cv_results_["mean_test_score"] + print_heading("Cross validation results") + print(cv_results) + print_heading("Cross validation results - HTML table") + print(cv_results.to_html()) + + # Plot these cross_val results + gini_results = cv_results.loc[cv_results["criterion"] == "gini"] + entropy_results = cv_results.loc[cv_results["criterion"] == "entropy"] + data = [ + go.Scatter( + x=gini_results["max_depth"].values, + y=gini_results["score"].values, + name="gini", + mode="lines", + ), + go.Scatter( + x=entropy_results["max_depth"].values, + y=entropy_results["score"].values, + name="entropy", + mode="lines", + ), + ] + + layout = go.Layout( + title="Fisher's Iris Cross Validation", + xaxis_title="Tree Depth", + yaxis_title="Score", + ) + + fig = go.Figure(data=data, layout=layout) + fig.show() + fig.write_html( + file="../../plots/lecture_6_iris_cross_val.html", + 
include_plotlyjs="cdn", + ) + + # Get the "best" model + best_tree_model = decision_tree_grid_search.best_estimator_ + + # Plot this "best" decision tree + plot_decision_tree( + decision_tree=best_tree_model, + feature_names=continuous_features, + class_names="classification", + file_out="../../plots/lecture_6_iris_tree_cross_val", + ) + return + + def main(): cont_resp_cat_predictor() cat_resp_cont_predictor() From e6397243ad625a64a20ca28df773cdf1077398ea Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Sat, 15 Oct 2022 00:13:55 -0700 Subject: [PATCH 4/4] new_final_commit unless I have a revelation. --- bda602_hw4.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bda602_hw4.py b/bda602_hw4.py index 29fa122..dd1be04 100644 --- a/bda602_hw4.py +++ b/bda602_hw4.py @@ -263,7 +263,7 @@ def decision_tree_setup(dataframe): # Plot the decision tree plot_decision_tree( decision_tree=decision_tree, - feature_names=x, + feature_names=X, class_names="classification", file_out="../../plots/lecture_6_iris_tree_full", ) @@ -278,11 +278,11 @@ def decision_tree_setup(dataframe): ) decision_tree_grid_search.fit(X=X, y=y) - cv_results = DataFrame(decision_tree_grid_search.cv_results_["params"]) + cv_results = dataframe(decision_tree_grid_search.cv_results_["params"]) cv_results["score"] = decision_tree_grid_search.cv_results_["mean_test_score"] - print_heading("Cross validation results") + print("Cross validation results") print(cv_results) - print_heading("Cross validation results - HTML table") + print("Cross validation results - HTML table") print(cv_results.to_html()) # Plot these cross_val results @@ -328,13 +328,15 @@ def decision_tree_setup(dataframe): ) return - +### column sep separates the cat from the cont columns def main(): cont_resp_cat_predictor() cat_resp_cont_predictor() cat_response_cat_predictor() cont_response_cont_predictor() datasetregression() + get_column_names() + column_sep() return
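
The deleted bda602_hw3.py in PATCH 1/4 defines a SplitColumnTransform whose _transform only calls dataset.show() (which returns None), and its SQL is broken: the first query fuses g.local_dateFROM and mixes aggregates into the table list, and the second has no FROM clause and unquoted timestamp literals. A minimal corrected sketch of the same idea follows; the JDBC credentials are placeholders, the concat_ws output column is an illustrative choice of transform, and the batter_counts/game column names are taken from the patch.

from pyspark import keyword_only
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.param.shared import HasInputCols, HasOutputCol
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws


class SplitColumnTransform(
    Transformer,
    HasInputCols,
    HasOutputCol,
    DefaultParamsReadable,
    DefaultParamsWritable,
):
    """Concatenate the configured input columns into a single string output column."""

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        super().__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        # Return a transformed DataFrame; the deleted version returned dataset.show(), i.e. None.
        return dataset.withColumn(
            self.getOutputCol(),
            concat_ws("_", *[col(c).cast("string") for c in self.getInputCols()]),
        )


def main():
    spark = SparkSession.builder.appName("rolling_batting_average").getOrCreate()

    # Placeholder JDBC settings -- point these at the local MariaDB baseball database.
    base_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://127.0.0.1:3306/baseball?permitMysqlScheme")
        .option(
            "query",
            "SELECT bc.batter, bc.Hit, bc.atBat, g.game_id, g.local_date "
            "FROM batter_counts bc JOIN game g ON g.game_id = bc.game_id",
        )
        .option("user", "user")
        .option("password", "password")
        .option("driver", "org.mariadb.jdbc.Driver")
        .load()
    )
    base_df.createOrReplaceTempView("batter_game")

    # Per-batter batting average over a quoted date window (the FROM clause and
    # GROUP BY were missing in the deleted queries).
    avg_df = spark.sql(
        """
        SELECT batter,
               SUM(Hit) AS total_h,
               SUM(atBat) AS total_ab,
               SUM(Hit) / NULLIF(SUM(atBat), 0) AS batting_avg
        FROM batter_game
        WHERE local_date >= '2012-03-20' AND local_date < '2012-06-28'
        GROUP BY batter
        """
    )
    avg_df.show(5)

    pipeline = Pipeline(
        stages=[
            SplitColumnTransform(inputCols=["batter", "game_id"], outputCol="batter_game_key")
        ]
    )
    pipeline.fit(base_df).transform(base_df).show(5)


if __name__ == "__main__":
    main()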
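
Patches 2-4 split the Titanic columns into categorical and continuous sets, but columnsep() in PATCH 2/4 has an empty body with the loop left at module level, and the later column_sep builds its outputs by appending Series with DataFrame.append, which pandas has deprecated (and removed in 2.x) and which stacks them as rows rather than columns. A sketch that keeps the object/bool/two-unique-values rule from PATCH 3/4 and returns column subsets:

import pandas as pd


def column_sep(dataframe):
    """Split a DataFrame's columns into categorical and continuous subsets."""
    cat_cols, cont_cols = [], []
    for col in dataframe.columns:
        # object/bool dtypes and two-valued columns are treated as categorical,
        # mirroring the rule in the PATCH 3/4 version.
        if (
            dataframe[col].dtype == "object"
            or dataframe[col].dtype == "bool"
            or dataframe[col].nunique() == 2
        ):
            cat_cols.append(col)
        else:
            cont_cols.append(col)
    return dataframe[cat_cols], dataframe[cont_cols]


if __name__ == "__main__":
    titanic_df = pd.read_csv(
        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
    )
    cat_df, cont_df = column_sep(titanic_df)
    print(list(cat_df.columns), list(cont_df.columns))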
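
cat_resp_cont_predictor in PATCH 2/4 is cut off at `hist_data = [x1,`, and the PATCH 3/4 version passes a single Series and raw column values where ff.create_distplot expects a list of value arrays plus a matching list of group names; numpy.repeat is also called without a repeat count. One way this could be written, with the response and predictor passed in by name rather than picked by a random column index (an illustrative signature, not the patches'):

import numpy as np
from plotly import figure_factory as ff
from plotly import graph_objects as go


def cat_resp_cont_predictor(dataframe, response, predictor, bin_size=0.2):
    """Distribution and violin plots of a continuous predictor grouped by a categorical response."""
    group_labels, hist_data = [], []
    for name, grp in dataframe[[response, predictor]].dropna().groupby(response):
        group_labels.append(str(name))
        hist_data.append(grp[predictor].values)

    # create_distplot expects one array of values per group plus one label per group.
    fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=bin_size)
    fig_1.update_layout(
        title="Continuous Predictor by Categorical Response",
        xaxis_title=predictor,
        yaxis_title="Distribution",
    )
    fig_1.show()

    fig_2 = go.Figure()
    for values, label in zip(hist_data, group_labels):
        fig_2.add_trace(
            go.Violin(
                x=np.repeat(label, len(values)),
                y=values,
                name=label,
                box_visible=True,
                meanline_visible=True,
            )
        )
    fig_2.update_layout(
        title="Continuous Predictor by Categorical Response",
        xaxis_title=response,
        yaxis_title=predictor,
    )
    fig_2.show()
    return fig_1, fig_2


# Example: cat_resp_cont_predictor(titanic_df, "survived", "age", bin_size=2)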
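
cat_response_cat_predictor in PATCH 3/4 feeds one categorical and one continuous column to sklearn's confusion_matrix, which expects two label sequences drawn from the same class set. If the goal is simply a heatmap of counts between two categorical columns, pd.crosstab is a safer fit; this sketch swaps it in and is not the patch's approach:

import pandas as pd
from plotly import graph_objects as go


def cat_response_cat_predictor(dataframe, response, predictor):
    """Heatmap of the cross-tabulation of two categorical columns."""
    clean = dataframe[[response, predictor]].dropna()
    counts = pd.crosstab(clean[predictor], clean[response])

    fig = go.Figure(
        data=go.Heatmap(
            z=counts.values,
            x=[str(c) for c in counts.columns],
            y=[str(r) for r in counts.index],
            zmin=0,
            zmax=counts.values.max(),
        )
    )
    fig.update_layout(
        title="Categorical Predictor by Categorical Response",
        xaxis_title=response,
        yaxis_title=predictor,
    )
    fig.show()
    return fig


# Example: cat_response_cat_predictor(titanic_df, "survived", "sex")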
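
datasetregression reads the Titanic CSV but then uses .data, .target, and feature_names, which belong to sklearn Bunch datasets rather than pandas DataFrames, and statsmodels.api is never imported (PATCH 3/4 imports only the bare statsmodels package). A per-predictor OLS sketch against a named response column; OLS on a 0/1 response is kept only to mirror the patch, although logistic regression would usually be the better model:

import statsmodels.api as sm
from plotly import express as px


def dataset_regression(dataframe, response, predictors):
    """Fit a simple OLS of the response on each predictor and report t- and p-values."""
    clean = dataframe[[response] + list(predictors)].dropna()
    y = clean[response].values

    for feature_name in predictors:
        column = clean[feature_name].values
        predictor = sm.add_constant(column)
        fitted = sm.OLS(y, predictor).fit()
        print(f"Variable: {feature_name}")
        print(fitted.summary())

        t_value = round(fitted.tvalues[1], 6)
        p_value = "{:.6e}".format(fitted.pvalues[1])

        fig = px.scatter(x=column, y=y, trendline="ols")
        fig.update_layout(
            title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
            xaxis_title=feature_name,
            yaxis_title=response,
        )
        fig.show()


# Example: dataset_regression(titanic_df, "survived", ["age", "fare"])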
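
decision_tree_setup wraps cv_results_["params"] in DataFrame (PATCH 3/4) and then dataframe (PATCH 4/4) without ever importing pandas' constructor, references names like continuous_features that are never defined, and passes the raw X matrix as feature_names. A self-contained grid-search sketch over the same max_depth/criterion grid; the tree-plotting step is left out, and the column names in the usage line are illustrative:

import pandas as pd
from plotly import graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier


def decision_tree_cross_val(dataframe, response, predictors, max_tree_depth=7):
    """Grid-search a decision tree over depth and criterion, then plot mean CV score by depth."""
    clean = dataframe[[response] + list(predictors)].dropna()
    X = clean[list(predictors)].values
    y = clean[response].values

    parameters = {
        "max_depth": list(range(1, max_tree_depth)),
        "criterion": ["gini", "entropy"],
    }
    grid_search = GridSearchCV(
        DecisionTreeClassifier(random_state=0), parameters, n_jobs=4
    )
    grid_search.fit(X=X, y=y)

    # cv_results_["params"] is a list of dicts, so build the table with pandas.
    cv_results = pd.DataFrame(grid_search.cv_results_["params"])
    cv_results["score"] = grid_search.cv_results_["mean_test_score"]
    print("Cross validation results")
    print(cv_results)

    data = [
        go.Scatter(
            x=cv_results.loc[cv_results["criterion"] == criterion, "max_depth"].values,
            y=cv_results.loc[cv_results["criterion"] == criterion, "score"].values,
            name=criterion,
            mode="lines",
        )
        for criterion in ["gini", "entropy"]
    ]
    fig = go.Figure(
        data=data,
        layout=go.Layout(
            title="Decision Tree Cross Validation",
            xaxis_title="Tree Depth",
            yaxis_title="Score",
        ),
    )
    fig.show()
    return grid_search.best_estimator_


# Example: best_tree = decision_tree_cross_val(titanic_df, "survived", ["age", "fare", "pclass"])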