diff --git a/Midterm_finalEdition.py b/Midterm_finalEdition.py
new file mode 100644
index 0000000..06f3857
--- /dev/null
+++ b/Midterm_finalEdition.py
@@ -0,0 +1,619 @@
+import os
+import sys
+import scipy.stats
+import statsmodels.api as sm
+import bisect
+from collections import defaultdict
+import math
+import numpy as np
+import statistics
+import itertools
+import pandas as pd
+import plotly.express as px
+import statsmodels
+import statsmodels.api as sm
+from plotly import graph_objects as go
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import OneHotEncoder
+import scipy.stats
+import warnings
+import numpy
+import pandas
+from scipy import stats
+
+
+# *************** Defining all necessary functions ******************
+
+
+def onehotencoder(df, df_cat):
+    """One-hot encode `df_cat` and append the encoded columns to the
+    non-object columns of `df`.
+
+    Returns a new DataFrame truncated to len(df) rows.
+    NOTE(review): the local variable shadows the function name, and
+    `values.reshape(-1, 1)` flattens multi-column input into a single
+    column -- presumably `df_cat` is expected to be one-dimensional;
+    confirm at call sites that pass a whole DataFrame.
+    """
+    onehotencoder = OneHotEncoder(handle_unknown="ignore")
+    encoder = onehotencoder.fit_transform(df_cat.values.reshape(-1, 1)).toarray()
+    dfOneHot = pd.DataFrame(encoder)
+    # Keep numeric columns, drop the raw object columns, append encodings.
+    data = pd.concat([df.select_dtypes(exclude=["object"]), dfOneHot], axis=1)
+    data = data.head(len(df))
+    return data
+
+
+def boundaries(X):
+    """Return bin edges for the values in X, starting at min(X).
+
+    NOTE(review): the bin width follows Scott's normal reference rule
+    (3.49 * stdev * n^(-1/3)) while the number of bins follows the
+    square-root rule; the two are independent choices, so the last edge
+    need not reach max(X) -- confirm this is intended.
+    """
+    stdev = statistics.stdev(X)
+    bin_width = 3.49 * (stdev) * (len(X)) ** (-(1 / 3))
+    bin_number = round(math.sqrt(len(X)))
+    sorted_pred = sorted(X)
+
+    boundaries = []
+    # Edges are evenly spaced from the minimum value.
+    for i in range(0, bin_number):
+        boundaries.append(sorted_pred[0] + bin_width * i)
+    return boundaries
+
+
+def MeanSquaredDiff(X):
+    """Squared difference between a bin mean and the population mean,
+    divided by the population size.
+
+    NOTE(review): the loop overwrites `MeanSquaredDiff` on every pass, so
+    only the LAST processed bin contributes to the return value (nothing
+    is accumulated), and `range(0, len(list_df) - 1)` skips the final
+    bin entirely.  `msf.sum()` only works because the elements of X are
+    numpy scalars (e.g. model.predict() output) -- it fails on plain
+    Python floats.  Confirm all of this is intended.
+    """
+    bn = boundaries(X)
+    pop_mean = statistics.mean(X)
+    dic = defaultdict(list)
+    total_population = len(X)
+
+    # Group each value into its bin index via binary search on the edges.
+    for x in X:
+        ind = bisect.bisect_right(bn, x)
+        dic[ind].append(x)
+    list_df = list(dic.values())
+
+    for j in range(0, len(list_df) - 1):
+        chunk_mean = statistics.mean(list_df[j])
+        msf = (chunk_mean - pop_mean) ** 2
+        MeanSquaredDiff = msf.sum() / total_population
+    return MeanSquaredDiff
+
+
+def bin_average(X):
+    """Return the mean of each populated bin of X (bins from boundaries()).
+
+    NOTE(review): `range(0, len(list_df) - 1)` omits the last bin, so the
+    returned list has one fewer entry than the number of populated bins.
+    """
+    bn = boundaries(X)
+    dic = defaultdict(list)
+    # Group each value into its bin index via binary search on the edges.
+    for x in X:
+        ind = bisect.bisect_right(bn, x)
+        dic[ind].append(x)
+    list_df = list(dic.values())
+
+    mean = []
+    for j in range(0, len(list_df) - 1):
+        chunk_mean = statistics.mean(list_df[j])
+        mean.append(chunk_mean)
+    return mean
+
+
+def WeightedMeanSquaredDiff(X):
+    """Population-weighted squared difference between a bin mean and the
+    population mean.
+
+    NOTE(review): as in MeanSquaredDiff, the loop variable is overwritten
+    each pass, so only the LAST processed bin is returned, and the final
+    bin is skipped by `range(0, len(list_df) - 1)`.  `msf.sum()` relies
+    on X holding numpy scalars.  Confirm intended.
+    """
+    bn = boundaries(X)
+    pop_mean = statistics.mean(X)
+    dic = defaultdict(list)
+    total_population = len(X)
+
+    # Group each value into its bin index via binary search on the edges.
+    for x in X:
+        ind = bisect.bisect_right(bn, x)
+        dic[ind].append(x)
+    list_df = list(dic.values())
+
+    for j in range(0, len(list_df) - 1):
+        chunk_mean = statistics.mean(list_df[j])
+        # Weight = share of the population that falls in this bin.
+        PopulationProportion = len(list_df[j]) / total_population
+        msf = (chunk_mean - pop_mean) ** 2
+        weightedMeanSquaredDiff = (PopulationProportion * msf.sum()) / total_population
+    return weightedMeanSquaredDiff
+
+
+def cat_correlation(x, y, bias_correction=True, tschuprow=False):
+    """Correlation between two categorical samples.
+
+    Returns Cramer's V by default, or Tschuprow's T when `tschuprow=True`.
+    With `bias_correction=True`, the bias-corrected phi^2 and corrected
+    row/column counts are used, and Yates' continuity correction is
+    disabled for 2x2 tables.  On any failure the exception is printed, a
+    RuntimeWarning is issued, and numpy.nan is returned.
+    """
+    corr_coeff = numpy.nan
+    try:
+        x, y = fill_na(x), fill_na(y)
+        crosstab_matrix = pandas.crosstab(x, y)
+        n_observations = crosstab_matrix.sum().sum()
+
+        yates_correct = True
+        if bias_correction:
+            # Yates' correction would bias the 2x2 corrected statistic.
+            if crosstab_matrix.shape == (2, 2):
+                yates_correct = False
+
+        chi2, _, _, _ = stats.chi2_contingency(
+            crosstab_matrix, correction=yates_correct
+        )
+        phi2 = chi2 / n_observations
+
+        # r and c are number of categories of x and y
+        r, c = crosstab_matrix.shape
+        if bias_correction:
+            phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1))
+            r_corrected = r - ((r - 1) ** 2) / (n_observations - 1)
+            c_corrected = c - ((c - 1) ** 2) / (n_observations - 1)
+            if tschuprow:
+                corr_coeff = numpy.sqrt(
+                    phi2_corrected / numpy.sqrt((r_corrected - 1) * (c_corrected - 1))
+                )
+                return corr_coeff
+            corr_coeff = numpy.sqrt(
+                phi2_corrected / min((r_corrected - 1), (c_corrected - 1))
+            )
+            return corr_coeff
+        if tschuprow:
+            corr_coeff = numpy.sqrt(phi2 / numpy.sqrt((r - 1) * (c - 1)))
+            return corr_coeff
+        corr_coeff = numpy.sqrt(phi2 / min((r - 1), (c - 1)))
+        return corr_coeff
+    except Exception as ex:
+        print(ex)
+        if tschuprow:
+            warnings.warn("Error calculating Tschuprow's T", RuntimeWarning)
+        else:
+            warnings.warn("Error calculating Cramer's V", RuntimeWarning)
+        return corr_coeff
+
+
+def cat_cont_correlation(categories, values):
+    """Correlation ratio (eta) between a categorical and a continuous sample.
+
+    eta = sqrt( sum_i n_i * (ybar_i - ybar)^2 / sum_j (y_j - ybar)^2 ),
+    where groups i come from factorizing `categories`.
+    NOTE(review): only the numerator is checked for zero; if all `values`
+    are identical the denominator is 0 and this divides by zero.  `values`
+    must support numpy fancy indexing (e.g. a Series with a default
+    RangeIndex) -- confirm at call sites.
+    """
+    f_cat, _ = pandas.factorize(categories)
+    cat_num = numpy.max(f_cat) + 1
+    y_avg_array = numpy.zeros(cat_num)
+    n_array = numpy.zeros(cat_num)
+    # Per-category counts and means.
+    for i in range(0, cat_num):
+        cat_measures = values[numpy.argwhere(f_cat == i).flatten()]
+        n_array[i] = len(cat_measures)
+        y_avg_array[i] = numpy.average(cat_measures)
+    y_total_avg = numpy.sum(numpy.multiply(y_avg_array, n_array)) / numpy.sum(n_array)
+    numerator = numpy.sum(
+        numpy.multiply(
+            n_array, numpy.power(numpy.subtract(y_avg_array, y_total_avg), 2)
+        )
+    )
+    denominator = numpy.sum(numpy.power(numpy.subtract(values, y_total_avg), 2))
+    if numerator == 0:
+        eta = 0.0
+    else:
+        eta = numpy.sqrt(numerator / denominator)
+    return eta
+
+
+def fill_na(data):
+    """Replace missing values with 0.
+
+    A pandas Series is filled via fillna; any other iterable is mapped
+    into a numpy array with None replaced by 0.
+    """
+    if isinstance(data, pd.Series):
+        return data.fillna(0)
+    else:
+        return numpy.array([value if value is not None else 0 for value in data])
+
+
+def variable_cat_plot(x, y, path=None):
+    """Heatmap of the confusion matrix of two categorical series.
+
+    Written to `path` as standalone HTML when given, else shown interactively.
+    NOTE(review): confusion_matrix expects x and y to share a label set;
+    for two arbitrary predictors pandas.crosstab may be intended -- confirm.
+    """
+    conf_matrix = confusion_matrix(x, y)
+
+    fig_no_relationship = go.Figure(
+        data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max())
+    )
+    fig_no_relationship.update_layout(
+        title="Categorical Predictor by Categorical Response (without relationship)",
+        xaxis_title=x.name,
+        yaxis_title=y.name,
+    )
+    if path is not None:
+        fig_no_relationship.write_html(path, include_plotlyjs="cdn")
+    else:
+        fig_no_relationship.show()
+    return
+
+
+def variable_con_plot(x, y, path=None):
+    """Scatter plot with an OLS trendline for two continuous series.
+
+    Written to `path` as HTML when given, else shown interactively.
+    """
+    fig = px.scatter(x=x, y=y, trendline="ols")
+    fig.update_layout(
+        title="Two Continuous Predictors",
+        xaxis_title=x.name,
+        yaxis_title=y.name,
+    )
+    if path is not None:
+        fig.write_html(path)
+    else:
+        fig.show()
+    return
+
+
+def hist(x_label, y_label, path=None):
+    """Histogram (with rug marginal) of y_label grouped/colored by x_label.
+
+    Written to `path` as HTML when given, else shown interactively.
+    NOTE(review): px.data.tips() is loaded as the data_frame but the
+    passed Series override x/y/color, so the tips dataset appears unused
+    -- confirm this is intentional.
+    """
+    df = px.data.tips()
+    fig = px.histogram(df, x=x_label, y=y_label, color=x_label, marginal="rug",
+    )
+    fig.update_layout(
+        xaxis_title=x_label.name,
+        yaxis_title=y_label.name,
+        legend_title=x_label.name,
+        font=dict(
+            family="Courier New, monospace",
+            size=18,
+            color="RebeccaPurple"
+        )
+    )
+    if path is not None:
+        fig.write_html(path)
+    else:
+        fig.show()
+    return
+
+
+def violin(x_label, y_label, path=None):
+    """Violin plot (with box and all points) of y_label grouped by x_label.
+
+    Written to `path` as HTML when given, else shown interactively.
+    NOTE(review): px.data.tips() is loaded as the data_frame but the
+    passed Series override x/y/color, so the tips dataset appears unused
+    -- confirm this is intentional.
+    """
+    df = px.data.tips()
+    fig = px.violin(df, x=x_label, y=y_label, color=x_label, box=True, points="all",
+    )
+    fig.update_layout(
+        xaxis_title=x_label.name,
+        yaxis_title=y_label.name,
+        legend_title=x_label.name,
+        font=dict(
+            family="Courier New, monospace",
+            size=18,
+            color="RebeccaPurple"
+        )
+    )
+    if path is not None:
+        fig.write_html(path)
+    else:
+        fig.show()
+    return
+
+def dataframe_to_html(df, hyperlink_columns, out_path):
+ def make_hyperlink(path):
+ f_url = os.path.basename(path)
+ return u'{}'.format(path, f_url)
+
+ # This css class helps to make our table look stylish
+ css_style = ''
+ # Set formatter for the hyperlink columns
+ formatters = {}
+ for hyperlink_column in hyperlink_columns:
+ formatters[hyperlink_column] = make_hyperlink
+ # Generate HTML table from dataframe
+ html_output = df.to_html(classes="rendered_html", formatters=formatters, justify="center", escape=False)
+
+ # Write HTML output into file along with a link to our css style
+ with open(out_path, "w") as out_file:
+ # Writing data to a file
+ out_file.write(css_style)
+ out_file.write(html_output)
+
+
+def init_directories():
+ try:
+ os.mkdir("concat_plot")
+ os.mkdir("cat_plot")
+ os.mkdir("con_plot")
+ os.mkdir("catcon_plot")
+ except:
+ pass
+
+
+def main():
+    """End-to-end analysis of the Metro Interstate Traffic Volume dataset:
+    load, type predictors, impute, one-hot encode, fit a baseline model,
+    build pairwise correlation tables with plots, and brute-force
+    mean-of-response tables.
+    """
+    # table_final is assigned inside the correlation loop below; global
+    # exposes it at module level after main() returns.
+    global table_final
+    init_directories()
+
+    # *************** Reading Dataset ******************
+    # Explanation 1: For the sake of running time, I'm doing all the analysis on the first 100 rows of dataset.
+    # You can run the whole pipeline by using df_full directly and removing the df_full.head(100) line below.
+
+    # Explanation 2: I deleted all columns with only 1 unique values, since they cannot contribute to the model.
+
+    df_full = pd.read_csv(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz"
+    )
+
+    # NOTE(review): df is a slice of df_full; the in-place drop/fillna
+    # below may trigger SettingWithCopyWarning -- consider .copy().
+    df = df_full.head(100)
+
+    # Constant columns carry no information for any model.
+    for col in df.columns:
+        if len(df[col].unique()) == 1:
+            df.drop(col, inplace=True, axis=1)
+
+    print(df.to_string())
+
+    # *************** Identifying Response and Predictors and Their Type ******************
+
+    responses = ["traffic_volume"]
+    predictors = ["temp", "clouds_all", "weather_main", "weather_description"]
+
+    response_type = ""
+
+    # A two-valued response is treated as categorical (boolean -> 0/1).
+    for i in responses:
+        if df[i].nunique() == 2:
+            df[i] = df[i].astype("bool")
+            df.replace({False: 0, True: 1}, inplace=True)
+            response_type = "categorical"
+        else:
+            response_type = "continuous"
+
+    # Numeric columns with more than 5 distinct values count as continuous.
+    predictors_type = {"continuous": [], "categorical": []}
+    continuous = df.select_dtypes(include=["float", "int"])
+    for i in predictors:
+        if i in list(continuous) and df[i].nunique() > 5:
+            predictors_type["continuous"].append(i)
+        else:
+            predictors_type["categorical"].append(i)
+
+    print("Response variable is:", *responses)
+    print("Response type is:", response_type)
+
+    print("Predictor variables are:", predictors)
+    print("Predictors types:", predictors_type)
+
+    # dividing dataframes to categorical and continuous
+
+    for key, value in predictors_type.items():
+        if key == "continuous":
+            df_continuous = df[value]
+        else:
+            df_categorical = df[value]
+
+    print(df_continuous)
+    print(df_categorical)
+
+    # creating list for continuous and categorical variables for iteration purposes
+
+    predictors_con = []
+    predictors_cat = []
+    for i in predictors:
+        if i in df_continuous:
+            predictors_con.append(i)
+        else:
+            predictors_cat.append(i)
+
+    # *************** Handling Null Values ******************
+
+    # NOTE(review): due to operator precedence this condition parses as
+    # float or (int and nunique > 5); confirm that is the intent.
+    for col in df.columns:
+        if (
+            df[col].dtypes == "float"
+            or df[col].dtypes == "int"
+            and df[col].nunique() > 5
+        ):
+            df[col].fillna((df[col].mean()), inplace=True)
+        else:
+            # Non-numeric columns: impute with the mode (most frequent value).
+            df = df.apply(lambda col: col.fillna(col.value_counts().index[0]))
+
+    # *************** One Hot Encoder ******************
+
+    data_cat = df.select_dtypes("object")
+    data = onehotencoder(df, data_cat)
+
+    # *************** Test and Train Datasets ******************
+
+    for i in responses:
+        x = data.drop(i, axis=1)
+        y = data[i]
+
+    # No random_state: the split (and anything downstream) is nondeterministic.
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
+
+    for i in responses:
+        if response_type == "categorical":
+            logr = LogisticRegression()
+            logr_fitted = logr.fit(x_train, y_train)
+            logr_predict = logr_fitted.predict(x_test)
+            print(logr_predict)
+            # NOTE(review): sklearn estimators have no .summary(); this line
+            # presumably raises AttributeError on the categorical path -- confirm.
+            print(logr_fitted.summary())
+        else:
+            feature_name = i
+            ols_predict = statsmodels.api.add_constant(x)
+            ols = statsmodels.api.OLS(y, ols_predict)
+            ols_fitted = ols.fit()
+            predictor_ols = ols_fitted.predict()
+            print(predictor_ols)
+            print(f"Variable: {feature_name}")
+            print(ols_fitted.summary())
+
+    # *************** Correlation Tables for all 3 possibilities ******************
+
+    # 1. creating permutations:
+
+    combo = set(itertools.combinations(predictors, 2))
+
+    # 2. creating the tables:
+
+    table_con = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Pearson Correlation",
+            "Absolute Value of Correlation",
+            "Linear Regression Plot",
+        ]
+    )
+
+    table_cat = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Cramers V",
+            "Absolute Value of Correlation",
+            "heatmap"
+        ]
+    )
+
+    table_catcon = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Correlation ratio",
+            "Absolute Value of Correlation",
+            "Violin Plot",
+            "Histogram Plot"
+        ]
+    )
+
+    table_concat = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Correlation ratio",
+            "Absolute Value of Correlation",
+            "Violin Plot",
+            "Histogram Plot"
+        ]
+    )
+
+    # 3. Fill in the Correlation tables and draw plots
+    # Explanation: First it calculate correlation and add it to the proper table, then it creates plots.
+
+    for index, tup in enumerate(combo):
+        if tup[0] in predictors_con and tup[1] in predictors_con:
+            x_label = df[tup[0]]
+            y_label = df[tup[1]]
+            # .statistic on pearsonr's result requires scipy >= 1.9 -- TODO
+            # confirm the pinned scipy version.
+            pearson = scipy.stats.pearsonr(x_label, y_label).statistic
+            path = "{}/{}_{}_{}".format("con_plot", tup[0], tup[1], "con_plot.html")
+            variable_con_plot(x_label, y_label, path)
+            new = [x_label.name, y_label.name, pearson, np.abs(pearson), path]
+            table_con.loc[len(table_con)] = new
+        elif tup[0] in predictors_cat and tup[1] in predictors_cat:
+            x_label = df[tup[0]]
+            y_label = df[tup[1]]
+            # NOTE(review): fill_na returns a new object; this discards the
+            # result, so it has no effect here -- confirm intended.
+            fill_na(df)
+            correlation = cat_correlation(x_label, y_label)
+            path = "{}/{}_{}_{}".format("cat_plot", tup[0], tup[1], "cat_plot.html")
+            variable_cat_plot(x_label, y_label, path)
+            new = [x_label.name, y_label.name, correlation, np.abs(correlation), path]
+            table_cat.loc[len(table_cat)] = new
+        elif tup[0] in predictors_cat and tup[1] in predictors_con:
+            x_label = df[tup[0]]
+            y_label = df[tup[1]]
+            correlation = cat_cont_correlation(x_label, y_label)
+            path1 = "{}/{}_{}_{}".format("catcon_plot", tup[0], tup[1], "catcon_plot1.html")
+            path2 = "{}/{}_{}_{}".format("catcon_plot", tup[0], tup[1], "catcon_plot2.html")
+            hist(x_label, y_label, path1)
+            violin(x_label, y_label, path2)
+            new = [x_label.name, y_label.name, correlation, np.abs(correlation), path1, path2]
+            table_catcon.loc[len(table_catcon)] = new
+        elif tup[0] in predictors_con and tup[1] in predictors_cat:
+            # Swap so the categorical column is always passed first.
+            x_label = df[tup[1]]
+            y_label = df[tup[0]]
+            correlation = cat_cont_correlation(x_label, y_label)
+            path1 = "{}/{}_{}_{}".format("concat_plot", tup[0], tup[1], "concat_plot1.html")
+            path2 = "{}/{}_{}_{}".format("concat_plot", tup[0], tup[1], "concat_plot2.html")
+            violin(x_label, y_label, path2)
+            hist(x_label, y_label, path1)
+            new = [x_label.name, y_label.name, correlation, np.abs(correlation), path1, path2]
+            table_concat.loc[len(table_concat)] = new
+    table_final = pd.concat([table_concat, table_catcon])
+
+    dataframe_to_html(table_cat, ["heatmap"], "tablecat.html")
+    dataframe_to_html(table_con, ["Linear Regression Plot"], "tablecon.html")
+    dataframe_to_html(table_final, ["Violin Plot", "Histogram Plot"], "tableboth.html")
+    # *************** Brute Force for all 3 possibilities ******************
+
+    # 1. creating the tables:
+
+    brute_force_con = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    brute_force_cat = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    brute_force_both1 = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    brute_force_both2 = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    # 2. Fill in the Brute Force tables
+
+    # NOTE(review): axis=1 is not an sm.OLS argument -- confirm the
+    # installed statsmodels version tolerates/ignores it.
+    for i in responses:
+        if response_type == "continuous":
+            for index, tup in enumerate(combo):
+                if tup[0] in predictors_con and tup[1] in predictors_con:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = data[[x, y]]
+                    model = sm.OLS(df[i], dataset, axis=1).fit()
+                    pred = model.predict()
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    # corr = model.rsquared ** .5
+                    con_new = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_con.loc[len(brute_force_con)] = con_new
+                elif tup[0] in predictors_cat and tup[1] in predictors_cat:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = df[[x, y]]
+                    dt = onehotencoder(dataset, dataset)
+                    model = sm.OLS(df[i], dt, axis=1).fit()
+                    pred = model.predict()
+                    # corr = model.rsquared ** .5
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    cat_new = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_cat.loc[len(brute_force_cat)] = cat_new
+                elif tup[0] in predictors_cat and tup[1] in predictors_con:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = df[[x, y]]
+                    dt = onehotencoder(dataset, df[x])
+                    model = sm.OLS(df[i], dt, axis=1).fit()
+                    pred = model.predict()
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    both1 = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_both1.loc[len(brute_force_both1)] = both1
+                elif tup[0] in predictors_con and tup[1] in predictors_cat:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = df[[x, y]]
+                    dt = onehotencoder(dataset, df[y])
+                    model = sm.OLS(df[i], dt, axis=1).fit()
+                    pred = model.predict()
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+
+                    both2 = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_both2.loc[len(brute_force_both2)] = both2
+    brute_force_both = pd.concat([brute_force_both1, brute_force_both2])
+
+    # *************** heatmap for all 3 possibilities ******************
+
+    # sns.heatmap(df_continuous)
+    # plt.show()
+
+    # df_cat = df_categorical.apply(lambda x: pd.factorize(x)[0]).corr(method="pearson", min_periods=1)
+    # sns.heatmap(df_cat, annot=True)
+    # plt.show()
+
+    # df_all = df.apply(lambda x: pd.factorize(x)[0]).corr(method="pearson", min_periods=1)
+    # sns.heatmap(df_all, annot=True)
+    # plt.show()
+
+    # dataframe_to_html(brute_force_con, ["Residual Plot"], "brutetablcon.html")
+    # *************** Printing Tables for Each Possibility ******************
+
+    # # 1. both categorical
+    print(table_cat.sort_values(["Cramers V"], ascending=[False]).to_string())
+    print(brute_force_cat.to_string())
+
+    # # 2. both continuous
+    print(table_con.sort_values(["Pearson Correlation"], ascending=[False]).to_string())
+    print(brute_force_con.to_string())
+
+    # # 3. categorical and continuous
+    print(table_final.sort_values(["Correlation ratio"], ascending=[False]).to_string())
+    print(brute_force_both.to_string())
+
+
+    return
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/README.md b/README.md
index 3cd99cc..c31fa59 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@
- Setup a python 3.x venv (usually in `.venv`)
- You can run `./scripts/create-venv.sh` to generate one
- `pip3 install --upgrade pip`
-- Install pip-tools `pip3 install pip-tools`
-- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade`
+- Install pip-tools `pip3 install pip-tools`
+- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade`
- Update requirements: `pip-compile --output-file=requirements.txt requirements.in --upgrade`
- Install dev requirements `pip3 install -r requirements.dev.txt`
- Install requirements `pip3 install -r requirements.txt`
diff --git a/mariadb-java-client-3.0.8.jar b/mariadb-java-client-3.0.8.jar
new file mode 100644
index 0000000..0c21e78
Binary files /dev/null and b/mariadb-java-client-3.0.8.jar differ
diff --git a/midterm.py b/midterm.py
new file mode 100644
index 0000000..1fd77c2
--- /dev/null
+++ b/midterm.py
@@ -0,0 +1,441 @@
+import bisect
+import itertools
+import math
+import statistics
+import sys
+import warnings
+from collections import defaultdict
+
+import numpy
+import numpy as np
+import pandas
+import pandas as pd
+import scipy.stats
+import statsmodels
+import statsmodels.api as sm
+from scipy import stats
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import OneHotEncoder
+
+# *************** Defining all necessary functions ******************
+
+
+def onehotencoder(df, df_cat):
+    """One-hot encode `df_cat` and append the encoded columns to the
+    non-object columns of `df`.
+
+    Returns a new DataFrame truncated to len(df) rows.
+    NOTE(review): the local variable shadows the function name, and
+    `values.reshape(-1, 1)` flattens multi-column input into a single
+    column -- presumably `df_cat` is expected to be one-dimensional;
+    confirm at call sites that pass a whole DataFrame.
+    """
+    onehotencoder = OneHotEncoder(handle_unknown="ignore")
+    encoder = onehotencoder.fit_transform(df_cat.values.reshape(-1, 1)).toarray()
+    dfOneHot = pd.DataFrame(encoder)
+    # Keep numeric columns, drop the raw object columns, append encodings.
+    data = pd.concat([df.select_dtypes(exclude=["object"]), dfOneHot], axis=1)
+    data = data.head(len(df))
+    return data
+
+
+def boundaries(X):
+    """Return bin edges for the values in X, starting at min(X).
+
+    NOTE(review): the bin width follows Scott's normal reference rule
+    (3.49 * stdev * n^(-1/3)) while the number of bins follows the
+    square-root rule; the two are independent choices, so the last edge
+    need not reach max(X) -- confirm this is intended.
+    """
+    stdev = statistics.stdev(X)
+    bin_width = 3.49 * (stdev) * (len(X)) ** (-(1 / 3))
+    bin_number = round(math.sqrt(len(X)))
+    sorted_pred = sorted(X)
+
+    boundaries = []
+    # Edges are evenly spaced from the minimum value.
+    for i in range(0, bin_number):
+        boundaries.append(sorted_pred[0] + bin_width * i)
+    return boundaries
+
+
+def MeanSquaredDiff(X):
+    """Squared difference between a bin mean and the population mean,
+    divided by the population size.
+
+    NOTE(review): the loop overwrites `MeanSquaredDiff` on every pass, so
+    only the LAST processed bin contributes to the return value (nothing
+    is accumulated), and `range(0, len(list_df) - 1)` skips the final
+    bin entirely.  `msf.sum()` only works because the elements of X are
+    numpy scalars (e.g. model.predict() output) -- it fails on plain
+    Python floats.  Confirm all of this is intended.
+    """
+    bn = boundaries(X)
+    pop_mean = statistics.mean(X)
+    dic = defaultdict(list)
+    total_population = len(X)
+
+    # Group each value into its bin index via binary search on the edges.
+    for x in X:
+        ind = bisect.bisect_right(bn, x)
+        dic[ind].append(x)
+    list_df = list(dic.values())
+
+    for j in range(0, len(list_df) - 1):
+        chunk_mean = statistics.mean(list_df[j])
+        msf = (chunk_mean - pop_mean) ** 2
+        MeanSquaredDiff = msf.sum() / total_population
+    return MeanSquaredDiff
+
+
+def WeightedMeanSquaredDiff(X):
+    """Population-weighted squared difference between a bin mean and the
+    population mean.
+
+    NOTE(review): as in MeanSquaredDiff, the loop variable is overwritten
+    each pass, so only the LAST processed bin is returned, and the final
+    bin is skipped by `range(0, len(list_df) - 1)`.  `msf.sum()` relies
+    on X holding numpy scalars.  Confirm intended.
+    """
+    bn = boundaries(X)
+    pop_mean = statistics.mean(X)
+    dic = defaultdict(list)
+    total_population = len(X)
+
+    # Group each value into its bin index via binary search on the edges.
+    for x in X:
+        ind = bisect.bisect_right(bn, x)
+        dic[ind].append(x)
+    list_df = list(dic.values())
+
+    for j in range(0, len(list_df) - 1):
+        chunk_mean = statistics.mean(list_df[j])
+        # Weight = share of the population that falls in this bin.
+        PopulationProportion = len(list_df[j]) / total_population
+        msf = (chunk_mean - pop_mean) ** 2
+        weightedMeanSquaredDiff = (PopulationProportion * msf.sum()) / total_population
+    return weightedMeanSquaredDiff
+
+
+def cat_correlation(x, y, bias_correction=True, tschuprow=False):
+    """Correlation between two categorical samples.
+
+    Returns Cramer's V by default, or Tschuprow's T when `tschuprow=True`.
+    With `bias_correction=True`, the bias-corrected phi^2 and corrected
+    row/column counts are used, and Yates' continuity correction is
+    disabled for 2x2 tables.  On any failure the exception is printed, a
+    RuntimeWarning is issued, and numpy.nan is returned.
+    """
+    corr_coeff = numpy.nan
+    try:
+        x, y = fill_na(x), fill_na(y)
+        crosstab_matrix = pandas.crosstab(x, y)
+        n_observations = crosstab_matrix.sum().sum()
+
+        yates_correct = True
+        if bias_correction:
+            # Yates' correction would bias the 2x2 corrected statistic.
+            if crosstab_matrix.shape == (2, 2):
+                yates_correct = False
+
+        chi2, _, _, _ = stats.chi2_contingency(
+            crosstab_matrix, correction=yates_correct
+        )
+        phi2 = chi2 / n_observations
+
+        # r and c are number of categories of x and y
+        r, c = crosstab_matrix.shape
+        if bias_correction:
+            phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1))
+            r_corrected = r - ((r - 1) ** 2) / (n_observations - 1)
+            c_corrected = c - ((c - 1) ** 2) / (n_observations - 1)
+            if tschuprow:
+                corr_coeff = numpy.sqrt(
+                    phi2_corrected / numpy.sqrt((r_corrected - 1) * (c_corrected - 1))
+                )
+                return corr_coeff
+            corr_coeff = numpy.sqrt(
+                phi2_corrected / min((r_corrected - 1), (c_corrected - 1))
+            )
+            return corr_coeff
+        if tschuprow:
+            corr_coeff = numpy.sqrt(phi2 / numpy.sqrt((r - 1) * (c - 1)))
+            return corr_coeff
+        corr_coeff = numpy.sqrt(phi2 / min((r - 1), (c - 1)))
+        return corr_coeff
+    except Exception as ex:
+        print(ex)
+        if tschuprow:
+            warnings.warn("Error calculating Tschuprow's T", RuntimeWarning)
+        else:
+            warnings.warn("Error calculating Cramer's V", RuntimeWarning)
+        return corr_coeff
+
+
+def cat_cont_correlation(categories, values):
+    """Correlation ratio (eta) between a categorical and a continuous sample.
+
+    eta = sqrt( sum_i n_i * (ybar_i - ybar)^2 / sum_j (y_j - ybar)^2 ),
+    where groups i come from factorizing `categories`.
+    NOTE(review): only the numerator is checked for zero; if all `values`
+    are identical the denominator is 0 and this divides by zero.  `values`
+    must support numpy fancy indexing (e.g. a Series with a default
+    RangeIndex) -- confirm at call sites.
+    """
+    f_cat, _ = pandas.factorize(categories)
+    cat_num = numpy.max(f_cat) + 1
+    y_avg_array = numpy.zeros(cat_num)
+    n_array = numpy.zeros(cat_num)
+    # Per-category counts and means.
+    for i in range(0, cat_num):
+        cat_measures = values[numpy.argwhere(f_cat == i).flatten()]
+        n_array[i] = len(cat_measures)
+        y_avg_array[i] = numpy.average(cat_measures)
+    y_total_avg = numpy.sum(numpy.multiply(y_avg_array, n_array)) / numpy.sum(n_array)
+    numerator = numpy.sum(
+        numpy.multiply(
+            n_array, numpy.power(numpy.subtract(y_avg_array, y_total_avg), 2)
+        )
+    )
+    denominator = numpy.sum(numpy.power(numpy.subtract(values, y_total_avg), 2))
+    if numerator == 0:
+        eta = 0.0
+    else:
+        eta = numpy.sqrt(numerator / denominator)
+    return eta
+
+
+def fill_na(data):
+    """Replace missing values with 0.
+
+    A pandas Series is filled via fillna; any other iterable is mapped
+    into a numpy array with None replaced by 0.
+    """
+    if isinstance(data, pd.Series):
+        return data.fillna(0)
+    else:
+        return numpy.array([value if value is not None else 0 for value in data])
+
+
+def main():
+    """End-to-end analysis of the Metro Interstate Traffic Volume dataset
+    (no-plots variant): load, type predictors, impute, one-hot encode,
+    fit a baseline model, build pairwise correlation tables, and
+    brute-force mean-of-response tables.
+    """
+    # *************** Reading Dataset ******************
+
+    # Explanation 1: For the sake of running time, I'm doing all the analysis on the first 100 rows of dataset.
+    # You can run the whole pipeline by using df_full directly and removing the df_full.head(100) line below.
+
+    # Explanation 2: I deleted all columns with only 1 unique values, since they cannot contribute to the model.
+
+    df_full = pd.read_csv(
+        "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz"
+    )
+
+    # NOTE(review): df is a slice of df_full; the in-place drop/fillna
+    # below may trigger SettingWithCopyWarning -- consider .copy().
+    df = df_full.head(100)
+
+    # Constant columns carry no information for any model.
+    for col in df.columns:
+        if len(df[col].unique()) == 1:
+            df.drop(col, inplace=True, axis=1)
+
+    print(df.to_string())
+
+    # *************** Identifying Response and Predictors and Their Type ******************
+
+    responses = ["traffic_volume"]
+    predictors = ["temp", "clouds_all", "weather_main", "weather_description"]
+
+    response_type = ""
+
+    # A two-valued response is treated as categorical (boolean -> 0/1).
+    for i in responses:
+        if df[i].nunique() == 2:
+            df[i] = df[i].astype("bool")
+            df.replace({False: 0, True: 1}, inplace=True)
+            response_type = "categorical"
+        else:
+            response_type = "continuous"
+
+    # Numeric columns with more than 5 distinct values count as continuous.
+    predictors_type = {"continuous": [], "categorical": []}
+    continuous = df.select_dtypes(include=["float", "int"])
+    for i in predictors:
+        if i in list(continuous) and df[i].nunique() > 5:
+            predictors_type["continuous"].append(i)
+        else:
+            predictors_type["categorical"].append(i)
+
+    print("Response variable is:", *responses)
+    print("Response type is:", response_type)
+
+    print("Predictor variables are:", predictors)
+    print("Predictors types:", predictors_type)
+
+    # dividing dataframes to categorical and continuous
+
+    for key, value in predictors_type.items():
+        if key == "continuous":
+            df_continuous = df[value]
+        else:
+            df_categorical = df[value]
+
+    print(df_continuous)
+    print(df_categorical)
+
+    # creating list for continuous and categorical variables for iteration purposes
+
+    predictors_con = []
+    predictors_cat = []
+    for i in predictors:
+        if i in df_continuous:
+            predictors_con.append(i)
+        else:
+            predictors_cat.append(i)
+
+    # *************** Handling Null Values ******************
+
+    # NOTE(review): due to operator precedence this condition parses as
+    # float or (int and nunique > 5); confirm that is the intent.
+    for col in df.columns:
+        if (
+            df[col].dtypes == "float"
+            or df[col].dtypes == "int"
+            and df[col].nunique() > 5
+        ):
+            df[col].fillna((df[col].mean()), inplace=True)
+        else:
+            # Non-numeric columns: impute with the mode (most frequent value).
+            df = df.apply(lambda col: col.fillna(col.value_counts().index[0]))
+
+    # *************** One Hot Encoder ******************
+
+    data_cat = df.select_dtypes("object")
+    data = onehotencoder(df, data_cat)
+
+    # *************** Test and Train Datasets ******************
+
+    for i in responses:
+        x = data.drop(i, axis=1)
+        y = data[i]
+
+    # No random_state: the split (and anything downstream) is nondeterministic.
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
+
+    for i in responses:
+        if response_type == "categorical":
+            logr = LogisticRegression()
+            logr_fitted = logr.fit(x_train, y_train)
+            logr_predict = logr_fitted.predict(x_test)
+            print(logr_predict)
+            # NOTE(review): sklearn estimators have no .summary(); this line
+            # presumably raises AttributeError on the categorical path -- confirm.
+            print(logr_fitted.summary())
+        else:
+            feature_name = i
+            ols_predict = statsmodels.api.add_constant(x)
+            ols = statsmodels.api.OLS(y, ols_predict)
+            ols_fitted = ols.fit()
+            predictor_ols = ols_fitted.predict()
+            print(predictor_ols)
+            print(f"Variable: {feature_name}")
+            print(ols_fitted.summary())
+
+    # *************** Correlation Tables for all 3 possibilities ******************
+
+    # 1. creating permutations:
+
+    combo = set(itertools.combinations(predictors, 2))
+
+    # 2. creating the tables:
+
+    table_con = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Pearson Correlation",
+            "Absolute Value of Correlation",
+        ]
+    )
+
+    table_cat = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Cramers V",
+            "Absolute Value of Correlation",
+        ]
+    )
+
+    table_catcon = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Correlation ratio",
+            "Absolute Value of Correlation",
+        ]
+    )
+
+    table_concat = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Correlation ratio",
+            "Absolute Value of Correlation",
+        ]
+    )
+
+    # 3. Fill in the Correlation tables
+
+    for index, tup in enumerate(combo):
+        if tup[0] in predictors_con and tup[1] in predictors_con:
+            x_label = df[tup[0]]
+            y_label = df[tup[1]]
+            # .statistic on pearsonr's result requires scipy >= 1.9 -- TODO
+            # confirm the pinned scipy version.
+            pearson = scipy.stats.pearsonr(x_label, y_label).statistic
+            new = [x_label.name, y_label.name, pearson, np.abs(pearson)]
+            table_con.loc[len(table_con)] = new
+        elif tup[0] in predictors_cat and tup[1] in predictors_cat:
+            x_label = df[tup[0]]
+            y_label = df[tup[1]]
+            # NOTE(review): fill_na returns a new object; this discards the
+            # result, so it has no effect here -- confirm intended.
+            fill_na(df)
+            correlation = cat_correlation(x_label, y_label)
+            new = [x_label.name, y_label.name, correlation, np.abs(correlation)]
+            table_cat.loc[len(table_cat)] = new
+        elif tup[0] in predictors_cat and tup[1] in predictors_con:
+            x_label = df[tup[0]]
+            y_label = df[tup[1]]
+            correlation = cat_cont_correlation(x_label, y_label)
+            new = [x_label.name, y_label.name, correlation, np.abs(correlation)]
+            table_catcon.loc[len(table_catcon)] = new
+        elif tup[0] in predictors_con and tup[1] in predictors_cat:
+            # Swap so the categorical column is always passed first.
+            x_label = df[tup[1]]
+            y_label = df[tup[0]]
+            correlation = cat_cont_correlation(x_label, y_label)
+            new = [x_label.name, y_label.name, correlation, np.abs(correlation)]
+            table_concat.loc[len(table_concat)] = new
+    table_final = pd.concat([table_concat, table_catcon])
+
+    # *************** Brute Force for all 3 possibilities ******************
+
+    # 1. creating the tables:
+
+    brute_force_con = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    brute_force_cat = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    brute_force_both = pd.DataFrame(
+        columns=[
+            "predictor 1",
+            "predictor 2",
+            "Difference of Mean Response",
+            "Weighted Difference of Mean Response",
+        ]
+    )
+
+    # 2. Fill in the Brute Force tables
+
+    # NOTE(review): axis=1 is not an sm.OLS argument -- confirm the
+    # installed statsmodels version tolerates/ignores it.
+    for i in responses:
+        if response_type == "continuous":
+            for index, tup in enumerate(combo):
+                if tup[0] in predictors_con and tup[1] in predictors_con:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = data[[x, y]]
+                    model = sm.OLS(df[i], dataset, axis=1).fit()
+                    pred = model.predict()
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    con_new = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_con.loc[len(brute_force_con)] = con_new
+                if tup[0] in predictors_cat and tup[1] in predictors_cat:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = df[[x, y]]
+                    dt = onehotencoder(dataset, dataset)
+                    model = sm.OLS(df[i], dt, axis=1).fit()
+                    pred = model.predict()
+                    # corr = model.rsquared ** .5
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    cat_new = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_cat.loc[len(brute_force_cat)] = cat_new
+                if tup[0] in predictors_cat and tup[1] in predictors_con:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = df[[x, y]]
+                    dt = onehotencoder(dataset, df[x])
+                    model = sm.OLS(df[i], dt, axis=1).fit()
+                    pred = model.predict()
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    both = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_both.loc[len(brute_force_both)] = both
+                if tup[0] in predictors_con and tup[1] in predictors_cat:
+                    x = tup[0]
+                    y = tup[1]
+                    dataset = df[[x, y]]
+                    dt = onehotencoder(dataset, df[y])
+                    model = sm.OLS(df[i], dt, axis=1).fit()
+                    pred = model.predict()
+                    meansquareddiff = MeanSquaredDiff(pred)
+                    weightedmeansquareddiff = WeightedMeanSquaredDiff(pred)
+                    both = [x, y, meansquareddiff, weightedmeansquareddiff]
+                    brute_force_both.loc[len(brute_force_both)] = both
+
+    # *************** Printing Tables for Each Possibility ******************
+
+    # 1. both categorical
+    print(table_cat.sort_values(["Cramers V"], ascending=[False]).to_string())
+    print(brute_force_cat.to_string())
+
+    # 2. both continuous
+    print(table_con.sort_values(["Pearson Correlation"], ascending=[False]).to_string())
+    print(brute_force_con.to_string())
+
+    # 2. categorical and continuous
+    print(table_final.sort_values(["Correlation ratio"], ascending=[False]).to_string())
+    print(brute_force_both.to_string())
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/requirements.dev.txt b/requirements.dev.txt
index e05a14b..90830c3 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,11 +1,15 @@
#
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
# To update, run:
#
# pip-compile --output-file=requirements.dev.txt requirements.dev.in
#
-anyio==3.6.1
+anyio==3.6.2
# via jupyter-server
+appnope==0.1.3
+ # via
+ # ipykernel
+ # ipython
argon2-cffi==21.3.0
# via
# jupyter-server
@@ -13,9 +17,9 @@ argon2-cffi==21.3.0
# notebook
argon2-cffi-bindings==21.2.0
# via argon2-cffi
-asttokens==2.0.5
+asttokens==2.0.8
# via stack-data
-attrs==21.4.0
+attrs==22.1.0
# via jsonschema
babel==2.10.3
# via jupyterlab-server
@@ -29,19 +33,19 @@ bleach==5.0.1
# via nbconvert
build==0.8.0
# via pip-tools
-certifi==2022.6.15
+certifi==2022.9.24
# via requests
cffi==1.15.1
# via argon2-cffi-bindings
cfgv==3.3.1
# via pre-commit
-charset-normalizer==2.1.0
+charset-normalizer==2.1.1
# via requests
click==8.1.3
# via
# black
# pip-tools
-debugpy==1.6.2
+debugpy==1.6.3
# via ipykernel
decorator==5.1.1
# via ipython
@@ -49,35 +53,29 @@ defusedxml==0.7.1
# via nbconvert
detect-secrets==1.3.0
# via -r requirements.dev.in
-distlib==0.3.5
+distlib==0.3.6
# via virtualenv
entrypoints==0.4
- # via
- # jupyter-client
- # nbconvert
-executing==0.9.1
+ # via jupyter-client
+executing==1.1.1
# via stack-data
-fastjsonschema==2.16.1
+fastjsonschema==2.16.2
# via nbformat
-filelock==3.7.1
+filelock==3.8.0
# via virtualenv
flake8==4.0.1
# via -r requirements.dev.in
-identify==2.5.2
+identify==2.5.6
# via pre-commit
-idna==3.3
+idna==3.4
# via
# anyio
# requests
-importlib-metadata==4.12.0
- # via jupyterlab-server
-importlib-resources==5.9.0
- # via jsonschema
-ipykernel==6.15.1
+ipykernel==6.16.1
# via
# nbclassic
# notebook
-ipython==8.4.0
+ipython==8.5.0
# via
# ipykernel
# jupyterlab
@@ -97,20 +95,20 @@ jinja2==3.1.2
# nbclassic
# nbconvert
# notebook
-json5==0.9.8
+json5==0.9.10
# via jupyterlab-server
-jsonschema==4.7.2
+jsonschema==4.16.0
# via
# jupyterlab-server
# nbformat
-jupyter-client==7.3.4
+jupyter-client==7.4.3
# via
# ipykernel
# jupyter-server
# nbclassic
# nbclient
# notebook
-jupyter-core==4.11.1
+jupyter-core==4.11.2
# via
# jupyter-client
# jupyter-server
@@ -119,49 +117,51 @@ jupyter-core==4.11.1
# nbconvert
# nbformat
# notebook
-jupyter-server==1.18.1
+jupyter-server==1.21.0
# via
# jupyterlab
# jupyterlab-server
# nbclassic
# notebook-shim
-jupyterlab==3.4.4
+jupyterlab==3.4.8
# via -r requirements.dev.in
jupyterlab-pygments==0.2.2
# via nbconvert
-jupyterlab-server==2.15.0
+jupyterlab-server==2.16.1
# via jupyterlab
markupsafe==2.1.1
# via
# jinja2
# nbconvert
-matplotlib-inline==0.1.3
+matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mccabe==0.6.1
# via flake8
-mistune==0.8.4
+mistune==2.0.4
# via nbconvert
mypy-extensions==0.4.3
# via black
-nbclassic==0.4.3
- # via jupyterlab
-nbclient==0.6.6
+nbclassic==0.4.5
+ # via
+ # jupyterlab
+ # notebook
+nbclient==0.7.0
# via nbconvert
-nbconvert==6.5.0
+nbconvert==7.2.2
# via
# jupyter-server
# nbclassic
# notebook
-nbformat==5.4.0
+nbformat==5.7.0
# via
# jupyter-server
# nbclassic
# nbclient
# nbconvert
# notebook
-nest-asyncio==1.5.5
+nest-asyncio==1.5.6
# via
# ipykernel
# jupyter-client
@@ -172,9 +172,9 @@ nodeenv==1.7.0
# via pre-commit
nose==1.3.7
# via -r requirements.dev.in
-notebook==6.4.12
+notebook==6.5.1
# via jupyterlab
-notebook-shim==0.1.0
+notebook-shim==0.2.0
# via nbclassic
packaging==21.3
# via
@@ -188,15 +188,15 @@ pandocfilters==1.5.0
# via nbconvert
parso==0.8.3
# via jedi
-pathspec==0.9.0
+pathspec==0.10.1
# via black
-pep517==0.12.0
+pep517==0.13.0
# via build
pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
-pip-tools==6.8.0
+pip-tools==6.9.0
# via -r requirements.dev.in
platformdirs==2.5.2
# via
@@ -204,14 +204,14 @@ platformdirs==2.5.2
# virtualenv
pre-commit==2.20.0
# via -r requirements.dev.in
-prometheus-client==0.14.1
+prometheus-client==0.15.0
# via
# jupyter-server
# nbclassic
# notebook
-prompt-toolkit==3.0.30
+prompt-toolkit==3.0.31
# via ipython
-psutil==5.9.1
+psutil==5.9.3
# via ipykernel
ptyprocess==0.7.0
# via
@@ -225,7 +225,7 @@ pycparser==2.21
# via cffi
pyflakes==2.4.0
# via flake8
-pygments==2.12.0
+pygments==2.13.0
# via
# ipython
# nbconvert
@@ -235,13 +235,13 @@ pyrsistent==0.18.1
# via jsonschema
python-dateutil==2.8.2
# via jupyter-client
-pytz==2022.1
+pytz==2022.5
# via babel
pyyaml==6.0
# via
# detect-secrets
# pre-commit
-pyzmq==23.2.0
+pyzmq==24.0.1
# via
# ipykernel
# jupyter-client
@@ -262,18 +262,18 @@ six==1.16.0
# asttokens
# bleach
# python-dateutil
-sniffio==1.2.0
+sniffio==1.3.0
# via anyio
soupsieve==2.3.2.post1
# via beautifulsoup4
-stack-data==0.3.0
+stack-data==0.5.1
# via ipython
-terminado==0.15.0
+terminado==0.16.0
# via
# jupyter-server
# nbclassic
# notebook
-tinycss2==1.1.1
+tinycss2==1.2.1
# via nbconvert
toml==0.10.2
# via pre-commit
@@ -281,6 +281,7 @@ tomli==2.0.1
# via
# black
# build
+ # jupyterlab
# pep517
tornado==6.2
# via
@@ -291,7 +292,7 @@ tornado==6.2
# nbclassic
# notebook
# terminado
-traitlets==5.3.0
+traitlets==5.5.0
# via
# ipykernel
# ipython
@@ -304,11 +305,9 @@ traitlets==5.3.0
# nbconvert
# nbformat
# notebook
-typing-extensions==4.3.0
- # via black
-urllib3==1.26.11
+urllib3==1.26.12
# via requests
-virtualenv==20.16.1
+virtualenv==20.16.5
# via pre-commit
wcwidth==0.2.5
# via prompt-toolkit
@@ -316,14 +315,10 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
-websocket-client==1.3.3
+websocket-client==1.4.1
# via jupyter-server
wheel==0.37.1
# via pip-tools
-zipp==3.8.1
- # via
- # importlib-metadata
- # importlib-resources
# The following packages are considered to be unsafe in a requirements file:
# pip
diff --git a/requirements.in b/requirements.in
index 296d654..762e489 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1 +1,13 @@
-numpy
\ No newline at end of file
+fastapi
+flake8==5.0.4
+isort[requirements]
+numpy
+pandas
+pip-tools
+pre-commit
+plotly
+scikit-learn
+statsmodels
+matplotlib
+matplotlib
+dython
diff --git a/requirements.txt b/requirements.txt
index b000696..498d8d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,154 @@
#
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
# To update, run:
#
# pip-compile --output-file=requirements.txt requirements.in
#
-numpy==1.23.1
+anyio==3.6.2
+ # via starlette
+build==0.8.0
+ # via pip-tools
+cfgv==3.3.1
+ # via pre-commit
+click==8.1.3
+ # via pip-tools
+contourpy==1.0.5
+ # via matplotlib
+cycler==0.11.0
+ # via matplotlib
+distlib==0.3.6
+ # via virtualenv
+dython==0.7.2
# via -r requirements.in
+fastapi==0.85.1
+ # via -r requirements.in
+filelock==3.8.0
+ # via virtualenv
+flake8==5.0.4
+ # via -r requirements.in
+fonttools==4.38.0
+ # via matplotlib
+identify==2.5.6
+ # via pre-commit
+idna==3.4
+ # via anyio
+isort[requirements]==5.10.1
+ # via -r requirements.in
+joblib==1.2.0
+ # via
+ # scikit-learn
+ # scikit-plot
+kiwisolver==1.4.4
+ # via matplotlib
+matplotlib==3.6.0
+ # via
+ # -r requirements.in
+ # dython
+ # scikit-plot
+ # seaborn
+mccabe==0.7.0
+ # via flake8
+nodeenv==1.7.0
+ # via pre-commit
+numpy==1.23.4
+ # via
+ # -r requirements.in
+ # contourpy
+ # dython
+ # matplotlib
+ # pandas
+ # patsy
+ # scikit-learn
+ # scipy
+ # seaborn
+ # statsmodels
+packaging==21.3
+ # via
+ # build
+ # matplotlib
+ # statsmodels
+pandas==1.5.1
+ # via
+ # -r requirements.in
+ # dython
+ # seaborn
+ # statsmodels
+patsy==0.5.3
+ # via statsmodels
+pep517==0.13.0
+ # via build
+pillow==9.2.0
+ # via matplotlib
+pip-tools==6.9.0
+ # via -r requirements.in
+platformdirs==2.5.2
+ # via virtualenv
+plotly==5.10.0
+ # via -r requirements.in
+pre-commit==2.20.0
+ # via -r requirements.in
+psutil==5.9.3
+ # via dython
+pycodestyle==2.9.1
+ # via flake8
+pydantic==1.10.2
+ # via fastapi
+pyflakes==2.5.0
+ # via flake8
+pyparsing==3.0.9
+ # via
+ # matplotlib
+ # packaging
+python-dateutil==2.8.2
+ # via
+ # matplotlib
+ # pandas
+pytz==2022.5
+ # via pandas
+pyyaml==6.0
+ # via pre-commit
+scikit-learn==1.1.2
+ # via
+ # -r requirements.in
+ # dython
+ # scikit-plot
+scikit-plot==0.3.7
+ # via dython
+scipy==1.9.3
+ # via
+ # dython
+ # scikit-learn
+ # scikit-plot
+ # statsmodels
+seaborn==0.12.1
+ # via dython
+six==1.16.0
+ # via
+ # patsy
+ # python-dateutil
+sniffio==1.3.0
+ # via anyio
+starlette==0.20.4
+ # via fastapi
+statsmodels==0.13.2
+ # via -r requirements.in
+tenacity==8.1.0
+ # via plotly
+threadpoolctl==3.1.0
+ # via scikit-learn
+toml==0.10.2
+ # via pre-commit
+tomli==2.0.1
+ # via
+ # build
+ # pep517
+typing-extensions==4.4.0
+ # via pydantic
+virtualenv==20.16.5
+ # via pre-commit
+wheel==0.37.1
+ # via pip-tools
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools