diff --git a/Assignment4.py b/Assignment4.py
new file mode 100644
index 0000000..472cac4
--- /dev/null
+++ b/Assignment4.py
@@ -0,0 +1,377 @@
+import pandas as pd
+import plotly.express as px
+from matplotlib import pyplot as plt
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import OneHotEncoder
+from plotly import graph_objects as go
+from plotly import figure_factory as ff
+from sklearn.metrics import confusion_matrix
+from itertools import combinations
+import numpy
+import statistics
+import statsmodels.api as sm
+
+
+# Insert your dataset here
+df2 = pd.read_csv(
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz"
+)
+print(df2)
+
+# Subset the dataset for testing, to avoid long run times
+df = df2.head(100)
+
+# The rename below is only needed for the Titanic dataset, where a column name clashes with a Python keyword
+# df.rename(columns={"class": "v_class"}, inplace=True)
+# print(list(df))
+
+# Here we define the response and predictors
+responses = ["traffic_volume"]
+predictors = [
+    "holiday",
+    "temp",
+    "rain_1h",
+    "snow_1h",
+    "clouds_all",
+    "weather_main",
+    "weather_description",
+    "date_time",
+]
+
+print(responses)
+print(predictors)
+
+# Handle missing values up front.
+# Missing values in numeric columns are replaced with the mean; not exact, but better than dropping rows.
+# The threshold of 5 is explained further down: a numeric column with 5 or fewer unique values is treated
+# as categorical, and categorical columns are filled with their most frequent value instead.
+for col in df.columns:
+    if (df[col].dtypes == "float" or df[col].dtypes == "int") and df[col].nunique() > 5:
+        df[col].fillna(df[col].mean(), inplace=True)
+    else:
+        df = df.apply(lambda col: col.fillna(col.value_counts().index[0]))
+
+# Determine whether the response is boolean (categorical) or continuous
+category_labels = {}
+response_type = ''
+for i in responses:
+    if df[i].nunique() == 2:
+        df[i] = df[i].astype("bool")
+        df.replace({False: 0, True: 1}, inplace=True)
+        category_labels = {idx: value for idx, value in enumerate(df[i].unique())}
+        response_type = 'categorical'
+    else:
+        response_type = 'continuous'
+
+print("Response type is:", response_type)
+
+
+# Create a dictionary of predictor types.
+# This heuristic will not be accurate for every variable: a categorical variable can be numeric,
+# like cancer stages (1-4), or even float, like software versions. It is about as accurate as we can get
+# without inspecting the data first. A numeric column with at most 5 unique values is also treated as
+# categorical; the threshold of 5 is an arbitrary choice.
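+
+# A minimal sketch of that heuristic as a reusable helper (the function name and the
+# max_categories threshold are illustrative assumptions, not part of the assignment flow):
+def variable_type(series, max_categories=5):
+    # Numeric columns with many distinct values are treated as continuous,
+    # everything else as categorical.
+    if pd.api.types.is_numeric_dtype(series) and series.nunique() > max_categories:
+        return "continuous"
+    return "categorical"
+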
+
+predictors_type = {"continuous": [], "categorical": []}
+continuous = df.select_dtypes(include=["float", "int"])
+
+for i in predictors:
+    if i in list(continuous) and df[i].nunique() > 5:
+        predictors_type["continuous"].append(i)
+    else:
+        predictors_type["categorical"].append(i)
+print(predictors_type)
+
+
+data_cat = df.select_dtypes("object")
+
+# One-hot encode the categorical variables
+onehotencoder = OneHotEncoder(handle_unknown="ignore")
+# Encode the categorical frame as-is (one row per observation) so the encoded
+# columns line up with the numeric columns when concatenated.
+encoder = onehotencoder.fit_transform(data_cat).toarray()
+dfOneHot = pd.DataFrame(encoder, index=df.index)
+data = pd.concat([df.select_dtypes(exclude=["object"]), dfOneHot], axis=1)
+# print(data)
+
+# Creating train and test sets
+for i in responses:
+    x = data.drop(i, axis=1)
+    y = data[i]
+print(x)
+print(y)
+
+x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
+
+# The dictionary above is kept for interpretability, but plain lists are easier
+# to work with in the plotting examples below.
+predictors_con = []
+predictors_cat = []
+for i in predictors:
+    if i in list(continuous):
+        predictors_con.append(i)
+    else:
+        predictors_cat.append(i)
+
+# Plotting the variables
+
+def cont_resp_cat_predictor():
+
+    n = 200
+
+    # Add histogram data and group data together
+    for i in predictors:
+        if i in predictors_cat and len(predictors_cat) > 2:
+            hist_data = list(combinations(predictors_cat, 3))
+        elif i in predictors_cat and 0 < len(predictors_cat) < 3:
+            hist_data = list(combinations(predictors_cat, len(predictors_cat)))
+
+    for j in df.columns:
+        if j in responses and response_type == 'continuous':
+            group_labels = sorted(df[i].unique())
+
+    # Create distribution plot with custom bin_size
+    fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2)
+    fig_1.update_layout(
+        title="Continuous Response by Categorical Predictor",
+        xaxis_title="Response",
+        yaxis_title="Distribution",
+    )
+    fig_1.show()
+
+    fig_2 = go.Figure()
+    for curr_hist, curr_group in zip(hist_data, group_labels):
+        fig_2.add_trace(
+            go.Violin(
+                x=numpy.repeat(curr_group, n),
+                y=curr_hist,
+                name=curr_group,
+                box_visible=True,
+                meanline_visible=True,
+            )
+        )
+    fig_2.update_layout(
+        title="Continuous Response by Categorical Predictor",
+        xaxis_title="Groupings",
+        yaxis_title="Response",
+    )
+    fig_2.show()
+
+    return
+
+
+def cat_resp_cont_predictor():
+
+    n = 200
+    # Add histogram data
+    hist_data = []
+    for i in predictors:
+        if i in predictors_con and len(predictors_con) > 2:
+            hist_data = list(combinations(predictors_con, 3))
+        elif i in predictors_con and 0 < len(predictors_con) < 3:
+            hist_data = list(combinations(predictors_con, len(predictors_con)))
+
+    for i in df.columns:
+        if i in responses and response_type == 'categorical':
+            group_labels = sorted(df[i].unique())
+
+    # Create distribution plot with custom bin_size
+    fig_1 = ff.create_distplot(hist_data, group_labels, bin_size=0.2)
+    fig_1.update_layout(
+        title="Continuous Predictor by Categorical Response",
+        xaxis_title="Predictor",
+        yaxis_title="Distribution",
+    )
+    fig_1.show()
+
+    fig_2 = go.Figure()
+    for curr_hist, curr_group in zip(hist_data, group_labels):
+        fig_2.add_trace(
+            go.Violin(
+                x=numpy.repeat(curr_group, n),
+                y=curr_hist,
+                name=curr_group,
+                box_visible=True,
+                meanline_visible=True,
+            )
+        )
+    fig_2.update_layout(
+        title="Continuous Predictor by Categorical Response",
+        xaxis_title="Response",
+        yaxis_title="Predictor",
+    )
+    fig_2.show()
+
+    return
+
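+
+# Illustrative sketch (an assumption, not called anywhere below): ff.create_distplot
+# expects one list of numeric values per group, e.g. the continuous response split by
+# the levels of a categorical predictor such as "weather_main":
+# hist_data_example = [
+#     df.loc[df["weather_main"] == level, "traffic_volume"].tolist()
+#     for level in sorted(df["weather_main"].unique())
+# ]
+# group_labels_example = sorted(df["weather_main"].unique())
+# fig_example = ff.create_distplot(hist_data_example, group_labels_example, bin_size=0.2)
+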
+
+def cat_response_cat_predictor():
+
+    x = []
+    for i in predictors:
+        if i in predictors_cat and len(predictors_cat) > 2:
+            x = list(combinations(predictors_cat, 3))
+        elif i in predictors_cat and 0 < len(predictors_cat) < 3:
+            x = list(combinations(predictors_cat, len(predictors_cat)))
+
+    for i in df.columns:
+        if i in responses and response_type == 'categorical':
+            y = sorted(df[i].unique())
+
+    x_2 = [1 if abs(x_) > 0.5 else 0 for x_ in x]
+    y_2 = [1 if abs(y_) > 0.5 else 0 for y_ in y]
+
+    conf_matrix = confusion_matrix(x_2, y_2)
+
+    fig_no_relationship = go.Figure(
+        data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max())
+    )
+    fig_no_relationship.update_layout(
+        title="Categorical Predictor by Categorical Response (without relationship)",
+        xaxis_title="Response",
+        yaxis_title="Predictor",
+    )
+    fig_no_relationship.show()
+
+    x_2 = [1 if abs(x_) > 1.5 else 0 for x_ in x]
+    y_2 = [1 if abs(y_) > 1.5 else 0 for y_ in y]
+
+    conf_matrix = confusion_matrix(x_2, y_2)
+
+    fig_no_relationship = go.Figure(
+        data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max())
+    )
+    fig_no_relationship.update_layout(
+        title="Categorical Predictor by Categorical Response (with relationship)",
+        xaxis_title="Response",
+        yaxis_title="Predictor",
+    )
+    fig_no_relationship.show()
+
+    return
+
+
+def cont_response_cont_predictor():
+    x = []
+    for i in predictors:
+        if i in predictors_con and len(predictors_con) > 2:
+            x = list(combinations(predictors_con, 3))
+        elif i in predictors_con and 0 < len(predictors_con) < 3:
+            x = list(combinations(predictors_con, len(predictors_con)))
+
+    for i in df.columns:
+        if i in responses and response_type == 'categorical':
+            y = sorted(df[i].unique())
+
+    fig = px.scatter(x=x, y=y, trendline="ols")
+    fig.update_layout(
+        title="Continuous Response by Continuous Predictor",
+        xaxis_title="Predictor",
+        yaxis_title="Response",
+    )
+    fig.show()
+
+    return
+
+
+# Creating models based on the response category
+for i in responses:
+    if response_type == 'categorical':
+        model = LogisticRegression()
+        model_fitted = model.fit(x_train, y_train)
+        predictor_reg = model_fitted.predict(x_test)
+    else:
+        feature_name = i
+        predictor = sm.add_constant(x)
+        model = sm.OLS(y, predictor)
+        model_fitted = model.fit()
+        predictor_ols = model_fitted.predict()
+        print(f"Variable: {feature_name}")
+        print(model_fitted.summary())
+
+        # T-value and P-value
+        t_value = round(model_fitted.tvalues[1], 6)
+        p_value = "{:.6e}".format(model_fitted.pvalues[1])
+        print("P-value is:", p_value)
+        print("T-value is:", t_value)
+
+        # # Plot
+        # fig = px.scatter(x=[i].values, y=y, trendline="ols")
+        # fig.update_layout(
+        #     title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
+        #     xaxis_title=f"Variable: {feature_name}",
+        #     yaxis_title="y",
+        # )
+        # fig.show()
+
+    # Random forest feature importance
+    rf = RandomForestRegressor(n_estimators=100)
+    rf.fit(x_train, y_train)
+    sorted_idx = rf.feature_importances_.argsort()
+    plt.barh(x.columns[sorted_idx].astype(str), rf.feature_importances_[sorted_idx])
+    plt.xlabel("Random Forest Feature Importance")
+    plt.show()
+
+    # Bin number based on Scott's rule
+    pred_stdev = statistics.stdev(predictor_ols)
+    pred_mean = statistics.mean(predictor_ols)
+    bin_width = 3.49 * pred_stdev * (len(predictor_ols)) ** (-(1 / 3))
+    # Scott's rule gives the bin width; the bin count is the prediction range divided by that width
+    bin_number = round((max(predictor_ols) - min(predictor_ols)) / bin_width)
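+    # Worked example of the rule above (purely illustrative numbers, not from this dataset):
+    # with a prediction standard deviation of about 1000 and n = 100 observations,
+    # bin_width = 3.49 * 1000 * 100 ** (-1 / 3) is roughly 752, and the bin count is
+    # the prediction range divided by ~752, rounded to the nearest integer.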
+
+    # Difference with the mean of response
+    num = round(len(df) / bin_number)
+    list_df = [predictor_ols[i:i + num] for i in range(0, predictor_ols.shape[0], num)]
+
+    means = []
+    lower_bin = []
+    upper_bin = []
+    bin_center = []
+    bin_count = []
+    # Collect the edges, counts and means of each bin
+    for j in range(len(list_df)):
+        lower_bin.append(list_df[j][0])
+        upper_bin.append(list_df[j][-1])
+        bin_count.append(len(list_df[j]))
+        chunk_mean = statistics.mean(list_df[j])
+        means.append(chunk_mean)
+    print(means)
+    print(lower_bin)
+    print(upper_bin)
+    print(bin_count)
+
+    i_list = list(range(len(list_df)))
+
+    # Squared difference between each bin mean and the overall mean of the predictions
+    pop_mean = [(m - pred_mean) ** 2 for m in means]
+    # Unweighted contribution of each bin
+    unweighted = [p / bin_number for p in pop_mean]
+    print(unweighted)
+
+    unweighted_table = pd.DataFrame(
+        {'i': i_list,
+         'lower bin': lower_bin,
+         'upper bin': upper_bin,
+         'bin count': bin_count,
+         'bin means': means,
+         'population mean': pop_mean,
+         'MeanSquaredDiff': unweighted
+         })
+
+    print(unweighted_table)
diff --git a/README.md b/README.md
index 3cd99cc..c31fa59 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@
 - Setup a python 3.x venv (usually in `.venv`)
 - You can run `./scripts/create-venv.sh` to generate one
 - `pip3 install --upgrade pip`
-- Install pip-tools `pip3 install pip-tools`
-- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade`
+- Install pip-tools `pip3 install pip-tools`
+- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade`
 - Update requirements: `pip-compile --output-file=requirements.txt requirements.in --upgrade`
 - Install dev requirements `pip3 install -r requirements.dev.txt`
 - Install requirements `pip3 install -r requirements.txt`
diff --git a/mariadb-java-client-3.0.8.jar b/mariadb-java-client-3.0.8.jar
new file mode 100644
index 0000000..0c21e78
Binary files /dev/null and b/mariadb-java-client-3.0.8.jar differ
diff --git a/requirements.dev.txt b/requirements.dev.txt
index e05a14b..3ad9ca3 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,11 +1,15 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile --output-file=requirements.dev.txt requirements.dev.in
 #
 anyio==3.6.1
     # via jupyter-server
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
     # via
     #   jupyter-server
@@ -13,9 +17,9 @@ argon2-cffi==21.3.0
     #   nbclassic
     #   notebook
 argon2-cffi-bindings==21.2.0
     # via argon2-cffi
-asttokens==2.0.5
+asttokens==2.0.8
    # via stack-data
-attrs==21.4.0
+attrs==22.1.0
    # via jsonschema
 babel==2.10.3
    # via jupyterlab-server
@@ -29,19 +33,19 @@ bleach==5.0.1
    # via nbconvert
 build==0.8.0
    # via pip-tools
-certifi==2022.6.15
+certifi==2022.9.24
    # via requests
 cffi==1.15.1
    # via argon2-cffi-bindings
 cfgv==3.3.1
    # via pre-commit
-charset-normalizer==2.1.0
+charset-normalizer==2.1.1
    # via requests
 click==8.1.3
    # via
    #   black
    #   pip-tools
-debugpy==1.6.2
+debugpy==1.6.3
    # via ipykernel
 decorator==5.1.1
    # via ipython
@@ -49,35 +53,29 @@ defusedxml==0.7.1
    # via nbconvert
 detect-secrets==1.3.0
    # via -r requirements.dev.in
-distlib==0.3.5
+distlib==0.3.6
    # via virtualenv
 entrypoints==0.4
-    # via
-    #   jupyter-client
-    #   nbconvert
-executing==0.9.1
+    # via jupyter-client
+executing==1.1.1
    # via stack-data
-fastjsonschema==2.16.1
+fastjsonschema==2.16.2
    # via nbformat
-filelock==3.7.1
+filelock==3.8.0
    # via virtualenv
 flake8==4.0.1
    # via -r requirements.dev.in
-identify==2.5.2
+identify==2.5.6
    # via pre-commit
-idna==3.3
+idna==3.4
    # via
    #   anyio
    #   requests
-importlib-metadata==4.12.0
-    # via jupyterlab-server
-importlib-resources==5.9.0
-    # via jsonschema
-ipykernel==6.15.1
+ipykernel==6.16.0
    # via
    #   nbclassic
    #   notebook
-ipython==8.4.0
+ipython==8.5.0
    # via
    #   ipykernel
    #   jupyterlab
@@ -97,13 +95,13 @@ jinja2==3.1.2
    #   nbclassic
    #   nbconvert
    #   notebook
-json5==0.9.8
+json5==0.9.10
    # via jupyterlab-server
-jsonschema==4.7.2
+jsonschema==4.16.0
    # via
    #   jupyterlab-server
    #   nbformat
-jupyter-client==7.3.4
+jupyter-client==7.4.2
    # via
    #   ipykernel
    #   jupyter-server
@@ -119,49 +117,49 @@ jupyter-core==4.11.1
    #   nbconvert
    #   nbformat
    #   notebook
-jupyter-server==1.18.1
+jupyter-server==1.21.0
    # via
    #   jupyterlab
    #   jupyterlab-server
    #   nbclassic
    #   notebook-shim
-jupyterlab==3.4.4
+jupyterlab==3.4.8
    # via -r requirements.dev.in
 jupyterlab-pygments==0.2.2
    # via nbconvert
-jupyterlab-server==2.15.0
+jupyterlab-server==2.15.2
    # via jupyterlab
 markupsafe==2.1.1
    # via
    #   jinja2
    #   nbconvert
-matplotlib-inline==0.1.3
+matplotlib-inline==0.1.6
    # via
    #   ipykernel
    #   ipython
 mccabe==0.6.1
    # via flake8
-mistune==0.8.4
+mistune==2.0.4
    # via nbconvert
 mypy-extensions==0.4.3
    # via black
-nbclassic==0.4.3
+nbclassic==0.4.6
    # via jupyterlab
-nbclient==0.6.6
+nbclient==0.7.0
    # via nbconvert
-nbconvert==6.5.0
+nbconvert==7.2.1
    # via
    #   jupyter-server
    #   nbclassic
    #   notebook
-nbformat==5.4.0
+nbformat==5.7.0
    # via
    #   jupyter-server
    #   nbclassic
    #   nbclient
    #   nbconvert
    #   notebook
-nest-asyncio==1.5.5
+nest-asyncio==1.5.6
    # via
    #   ipykernel
    #   jupyter-client
@@ -188,15 +186,15 @@ pandocfilters==1.5.0
    # via nbconvert
 parso==0.8.3
    # via jedi
-pathspec==0.9.0
+pathspec==0.10.1
    # via black
-pep517==0.12.0
+pep517==0.13.0
    # via build
 pexpect==4.8.0
    # via ipython
 pickleshare==0.7.5
    # via ipython
-pip-tools==6.8.0
+pip-tools==6.9.0
    # via -r requirements.dev.in
 platformdirs==2.5.2
    # via
@@ -209,9 +207,9 @@ prometheus-client==0.14.1
    #   jupyter-server
    #   nbclassic
    #   notebook
-prompt-toolkit==3.0.30
+prompt-toolkit==3.0.31
    # via ipython
-psutil==5.9.1
+psutil==5.9.2
    # via ipykernel
 ptyprocess==0.7.0
    # via
@@ -225,7 +223,7 @@ pycparser==2.21
    # via cffi
 pyflakes==2.4.0
    # via flake8
-pygments==2.12.0
+pygments==2.13.0
    # via
    #   ipython
    #   nbconvert
@@ -235,13 +233,13 @@ pyrsistent==0.18.1
    # via jsonschema
 python-dateutil==2.8.2
    # via jupyter-client
-pytz==2022.1
+pytz==2022.4
    # via babel
 pyyaml==6.0
    # via
    #   detect-secrets
    #   pre-commit
-pyzmq==23.2.0
+pyzmq==24.0.1
    # via
    #   ipykernel
    #   jupyter-client
@@ -262,13 +260,13 @@ six==1.16.0
    #   asttokens
    #   bleach
    #   python-dateutil
-sniffio==1.2.0
+sniffio==1.3.0
    # via anyio
 soupsieve==2.3.2.post1
    # via beautifulsoup4
-stack-data==0.3.0
+stack-data==0.5.1
    # via ipython
-terminado==0.15.0
+terminado==0.16.0
    # via
    #   jupyter-server
    #   nbclassic
@@ -281,6 +279,7 @@ tomli==2.0.1
    # via
    #   black
    #   build
+    #   jupyterlab
    #   pep517
 tornado==6.2
    # via
@@ -291,7 +290,7 @@ tornado==6.2
    #   nbclassic
    #   notebook
    #   terminado
-traitlets==5.3.0
+traitlets==5.4.0
    # via
    #   ipykernel
    #   ipython
@@ -304,11 +303,9 @@ traitlets==5.3.0
    #   nbconvert
    #   nbformat
    #   notebook
-typing-extensions==4.3.0
-    # via black
-urllib3==1.26.11
+urllib3==1.26.12
    # via requests
-virtualenv==20.16.1
+virtualenv==20.16.5
    # via pre-commit
 wcwidth==0.2.5
    # via prompt-toolkit
@@ -316,14 +313,10 @@ webencodings==0.5.1
    # via
    #   bleach
    #   tinycss2
-websocket-client==1.3.3
+websocket-client==1.4.1
    # via jupyter-server
 wheel==0.37.1
    # via pip-tools
-zipp==3.8.1
-    # via
-    #   importlib-metadata
-    #   importlib-resources

 # The following packages are considered to be unsafe in a requirements file:
 # pip
diff --git a/requirements.in b/requirements.in
index 296d654..9fcd45c 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1 +1,11 @@
-numpy
\ No newline at end of file
+fastapi
+flake8==5.0.4
+isort[requirements]
+numpy
+pandas
+pip-tools
+pre-commit
+plotly
+scikit-learn
+statsmodels
+matplotlib
diff --git a/requirements.txt b/requirements.txt
index b000696..51c0cbe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,135 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile --output-file=requirements.txt requirements.in
 #
-numpy==1.23.1
+anyio==3.6.1
+    # via starlette
+build==0.8.0
+    # via pip-tools
+cfgv==3.3.1
+    # via pre-commit
+click==8.1.3
+    # via pip-tools
+contourpy==1.0.5
+    # via matplotlib
+cycler==0.11.0
+    # via matplotlib
+distlib==0.3.6
+    # via virtualenv
+fastapi==0.85.0
    # via -r requirements.in
+filelock==3.8.0
+    # via virtualenv
+flake8==5.0.4
+    # via -r requirements.in
+fonttools==4.37.4
+    # via matplotlib
+identify==2.5.6
+    # via pre-commit
+idna==3.4
+    # via anyio
+isort[requirements]==5.10.1
+    # via -r requirements.in
+joblib==1.2.0
+    # via scikit-learn
+kiwisolver==1.4.4
+    # via matplotlib
+matplotlib==3.6.1
+    # via -r requirements.in
+mccabe==0.7.0
+    # via flake8
+nodeenv==1.7.0
+    # via pre-commit
+numpy==1.23.4
+    # via
+    #   -r requirements.in
+    #   contourpy
+    #   matplotlib
+    #   pandas
+    #   patsy
+    #   scikit-learn
+    #   scipy
+    #   statsmodels
+packaging==21.3
+    # via
+    #   build
+    #   matplotlib
+    #   statsmodels
+pandas==1.5.0
+    # via
+    #   -r requirements.in
+    #   statsmodels
+patsy==0.5.3
+    # via statsmodels
+pep517==0.13.0
+    # via build
+pillow==9.2.0
+    # via matplotlib
+pip-tools==6.9.0
+    # via -r requirements.in
+platformdirs==2.5.2
+    # via virtualenv
+plotly==5.10.0
+    # via -r requirements.in
+pre-commit==2.20.0
+    # via -r requirements.in
+px==0.1.0
+    # via -r requirements.in
+pycodestyle==2.9.1
+    # via flake8
+pydantic==1.10.2
+    # via fastapi
+pyflakes==2.5.0
+    # via flake8
+pyparsing==3.0.9
+    # via
+    #   matplotlib
+    #   packaging
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   pandas
+pytz==2022.4
+    # via pandas
+pyyaml==6.0
+    # via pre-commit
+scikit-learn==1.1.2
+    # via -r requirements.in
+scipy==1.9.2
+    # via
+    #   scikit-learn
+    #   statsmodels
+six==1.16.0
+    # via
+    #   patsy
+    #   python-dateutil
+sniffio==1.3.0
+    # via anyio
+starlette==0.20.4
+    # via fastapi
+statsmodels==0.13.2
+    # via -r requirements.in
+tenacity==8.1.0
+    # via plotly
+threadpoolctl==3.1.0
+    # via scikit-learn
+toml==0.10.2
+    # via pre-commit
+tomli==2.0.1
+    # via
+    #   build
+    #   pep517
+typing-extensions==4.4.0
+    # via pydantic
+virtualenv==20.16.5
+    # via pre-commit
+websocket-client==1.4.1
+    # via px
+wheel==0.37.1
+    # via pip-tools
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools