diff --git a/Midterm_finalEdition.py b/Midterm_finalEdition.py new file mode 100644 index 0000000..06f3857 --- /dev/null +++ b/Midterm_finalEdition.py @@ -0,0 +1,619 @@ +import os +import sys +import scipy.stats +import statsmodels.api as sm +import bisect +from collections import defaultdict +import math +import numpy as np +import statistics +import itertools +import pandas as pd +import plotly.express as px +import statsmodels +import statsmodels.api as sm +from plotly import graph_objects as go +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import confusion_matrix +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import OneHotEncoder +import scipy.stats +import warnings +import numpy +import pandas +from scipy import stats + + +# *************** Defining all necessary functions ****************** + + +def onehotencoder(df, df_cat): + onehotencoder = OneHotEncoder(handle_unknown="ignore") + encoder = onehotencoder.fit_transform(df_cat.values.reshape(-1, 1)).toarray() + dfOneHot = pd.DataFrame(encoder) + data = pd.concat([df.select_dtypes(exclude=["object"]), dfOneHot], axis=1) + data = data.head(len(df)) + return data + + +def boundaries(X): + stdev = statistics.stdev(X) + bin_width = 3.49 * (stdev) * (len(X)) ** (-(1 / 3)) + bin_number = round(math.sqrt(len(X))) + sorted_pred = sorted(X) + + boundaries = [] + for i in range(0, bin_number): + boundaries.append(sorted_pred[0] + bin_width * i) + return boundaries + + +def MeanSquaredDiff(X): + bn = boundaries(X) + pop_mean = statistics.mean(X) + dic = defaultdict(list) + total_population = len(X) + + for x in X: + ind = bisect.bisect_right(bn, x) + dic[ind].append(x) + list_df = list(dic.values()) + + for j in range(0, len(list_df) - 1): + chunk_mean = statistics.mean(list_df[j]) + msf = (chunk_mean - pop_mean) ** 2 + MeanSquaredDiff = msf.sum() / total_population + return MeanSquaredDiff + + +def bin_average(X): + bn = boundaries(X) + dic = 
defaultdict(list) + for x in X: + ind = bisect.bisect_right(bn, x) + dic[ind].append(x) + list_df = list(dic.values()) + + mean = [] + for j in range(0, len(list_df) - 1): + chunk_mean = statistics.mean(list_df[j]) + mean.append(chunk_mean) + return mean + + +def WeightedMeanSquaredDiff(X): + bn = boundaries(X) + pop_mean = statistics.mean(X) + dic = defaultdict(list) + total_population = len(X) + + for x in X: + ind = bisect.bisect_right(bn, x) + dic[ind].append(x) + list_df = list(dic.values()) + + for j in range(0, len(list_df) - 1): + chunk_mean = statistics.mean(list_df[j]) + PopulationProportion = len(list_df[j]) / total_population + msf = (chunk_mean - pop_mean) ** 2 + weightedMeanSquaredDiff = (PopulationProportion * msf.sum()) / total_population + return weightedMeanSquaredDiff + + +def cat_correlation(x, y, bias_correction=True, tschuprow=False): + corr_coeff = numpy.nan + try: + x, y = fill_na(x), fill_na(y) + crosstab_matrix = pandas.crosstab(x, y) + n_observations = crosstab_matrix.sum().sum() + + yates_correct = True + if bias_correction: + if crosstab_matrix.shape == (2, 2): + yates_correct = False + + chi2, _, _, _ = stats.chi2_contingency( + crosstab_matrix, correction=yates_correct + ) + phi2 = chi2 / n_observations + + # r and c are number of categories of x and y + r, c = crosstab_matrix.shape + if bias_correction: + phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1)) + r_corrected = r - ((r - 1) ** 2) / (n_observations - 1) + c_corrected = c - ((c - 1) ** 2) / (n_observations - 1) + if tschuprow: + corr_coeff = numpy.sqrt( + phi2_corrected / numpy.sqrt((r_corrected - 1) * (c_corrected - 1)) + ) + return corr_coeff + corr_coeff = numpy.sqrt( + phi2_corrected / min((r_corrected - 1), (c_corrected - 1)) + ) + return corr_coeff + if tschuprow: + corr_coeff = numpy.sqrt(phi2 / numpy.sqrt((r - 1) * (c - 1))) + return corr_coeff + corr_coeff = numpy.sqrt(phi2 / min((r - 1), (c - 1))) + return corr_coeff + except Exception as ex: 
+ print(ex) + if tschuprow: + warnings.warn("Error calculating Tschuprow's T", RuntimeWarning) + else: + warnings.warn("Error calculating Cramer's V", RuntimeWarning) + return corr_coeff + + +def cat_cont_correlation(categories, values): + f_cat, _ = pandas.factorize(categories) + cat_num = numpy.max(f_cat) + 1 + y_avg_array = numpy.zeros(cat_num) + n_array = numpy.zeros(cat_num) + for i in range(0, cat_num): + cat_measures = values[numpy.argwhere(f_cat == i).flatten()] + n_array[i] = len(cat_measures) + y_avg_array[i] = numpy.average(cat_measures) + y_total_avg = numpy.sum(numpy.multiply(y_avg_array, n_array)) / numpy.sum(n_array) + numerator = numpy.sum( + numpy.multiply( + n_array, numpy.power(numpy.subtract(y_avg_array, y_total_avg), 2) + ) + ) + denominator = numpy.sum(numpy.power(numpy.subtract(values, y_total_avg), 2)) + if numerator == 0: + eta = 0.0 + else: + eta = numpy.sqrt(numerator / denominator) + return eta + + +def fill_na(data): + if isinstance(data, pd.Series): + return data.fillna(0) + else: + return numpy.array([value if value is not None else 0 for value in data]) + + +def variable_cat_plot(x, y, path=None): + conf_matrix = confusion_matrix(x, y) + + fig_no_relationship = go.Figure( + data=go.Heatmap(z=conf_matrix, zmin=0, zmax=conf_matrix.max()) + ) + fig_no_relationship.update_layout( + title="Categorical Predictor by Categorical Response (without relationship)", + xaxis_title=x.name, + yaxis_title=y.name, + ) + if path is not None: + fig_no_relationship.write_html(path, include_plotlyjs="cdn") + else: + fig_no_relationship.show() + return + + +def variable_con_plot(x, y, path=None): + fig = px.scatter(x=x, y=y, trendline="ols") + fig.update_layout( + title="Two Continuous Predictors", + xaxis_title=x.name, + yaxis_title=y.name, + ) + if path is not None: + fig.write_html(path) + else: + fig.show() + return + + +def hist(x_label, y_label, path=None): + df = px.data.tips() + fig = px.histogram(df, x=x_label, y=y_label, color=x_label, 
marginal="rug", + ) + fig.update_layout( + xaxis_title=x_label.name, + yaxis_title=y_label.name, + legend_title=x_label.name, + font=dict( + family="Courier New, monospace", + size=18, + color="RebeccaPurple" + ) + ) + if path is not None: + fig.write_html(path) + else: + fig.show() + return + + +def violin(x_label, y_label, path=None): + df = px.data.tips() + fig = px.violin(df, x=x_label, y=y_label, color=x_label, box=True, points="all", + ) + fig.update_layout( + xaxis_title=x_label.name, + yaxis_title=y_label.name, + legend_title=x_label.name, + font=dict( + family="Courier New, monospace", + size=18, + color="RebeccaPurple" + ) + ) + if path is not None: + fig.write_html(path) + else: + fig.show() + return + +def dataframe_to_html(df, hyperlink_columns, out_path): + def make_hyperlink(path): + f_url = os.path.basename(path) + return u'{}'.format(path, f_url) + + # This css class helps to make our table look stylish + css_style = '' + # Set formatter for the hyperlink columns + formatters = {} + for hyperlink_column in hyperlink_columns: + formatters[hyperlink_column] = make_hyperlink + # Generate HTML table from dataframe + html_output = df.to_html(classes="rendered_html", formatters=formatters, justify="center", escape=False) + + # Write HTML output into file along with a link to our css style + with open(out_path, "w") as out_file: + # Writing data to a file + out_file.write(css_style) + out_file.write(html_output) + + +def init_directories(): + try: + os.mkdir("concat_plot") + os.mkdir("cat_plot") + os.mkdir("con_plot") + os.mkdir("catcon_plot") + except: + pass + + +def main(): + global table_final + init_directories() + + # *************** Reading Dataset ****************** + # Explanation 1: For the sake of running time, I'm doing all the analysis on the first 100 rows of dataset. + # You can run whole code with changing df_full to df and removing line 13. 
+ + # Explanation 2: I deleted all columns with only 1 unique values, since they cannot contribute to the model. + + df_full = pd.read_csv( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz" + ) + + df = df_full.head(100) + + for col in df.columns: + if len(df[col].unique()) == 1: + df.drop(col, inplace=True, axis=1) + + print(df.to_string()) + + # *************** Identifying Response and Predictors and Their Type ****************** + + responses = ["traffic_volume"] + predictors = ["temp", "clouds_all", "weather_main", "weather_description"] + + response_type = "" + + for i in responses: + if df[i].nunique() == 2: + df[i] = df[i].astype("bool") + df.replace({False: 0, True: 1}, inplace=True) + response_type = "categorical" + else: + response_type = "continuous" + + predictors_type = {"continuous": [], "categorical": []} + continuous = df.select_dtypes(include=["float", "int"]) + for i in predictors: + if i in list(continuous) and df[i].nunique() > 5: + predictors_type["continuous"].append(i) + else: + predictors_type["categorical"].append(i) + + print("Response variable is:", *responses) + print("Response type is:", response_type) + + print("Predictor variables are:", predictors) + print("Predictors types:", predictors_type) + + # dividing dataframes to categorical and continuous + + for key, value in predictors_type.items(): + if key == "continuous": + df_continuous = df[value] + else: + df_categorical = df[value] + + print(df_continuous) + print(df_categorical) + + # creating list for continuous and categorical variables for iteration purposes + + predictors_con = [] + predictors_cat = [] + for i in predictors: + if i in df_continuous: + predictors_con.append(i) + else: + predictors_cat.append(i) + + # *************** Handling Null Values ****************** + + for col in df.columns: + if ( + df[col].dtypes == "float" + or df[col].dtypes == "int" + and df[col].nunique() > 5 + ): + 
df[col].fillna((df[col].mean()), inplace=True) + else: + df = df.apply(lambda col: col.fillna(col.value_counts().index[0])) + + # *************** One Hot Encoder ****************** + + data_cat = df.select_dtypes("object") + data = onehotencoder(df, data_cat) + + # *************** Test and Train Datasets ****************** + + for i in responses: + x = data.drop(i, axis=1) + y = data[i] + + x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20) + + for i in responses: + if response_type == "categorical": + logr = LogisticRegression() + logr_fitted = logr.fit(x_train, y_train) + logr_predict = logr_fitted.predict(x_test) + print(logr_predict) + print(logr_fitted.summary()) + else: + feature_name = i + ols_predict = statsmodels.api.add_constant(x) + ols = statsmodels.api.OLS(y, ols_predict) + ols_fitted = ols.fit() + predictor_ols = ols_fitted.predict() + print(predictor_ols) + print(f"Variable: {feature_name}") + print(ols_fitted.summary()) + + # *************** Correlation Tables for all 3 possibilities ****************** + + # 1. creating permutations: + + combo = set(itertools.combinations(predictors, 2)) + + # 2. creating the tables: + + table_con = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Pearson Correlation", + "Absolute Value of Correlation", + "Linear Regression Plot", + ] + ) + + table_cat = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Cramers V", + "Absolute Value of Correlation", + "heatmap" + ] + ) + + table_catcon = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Correlation ratio", + "Absolute Value of Correlation", + "Violin Plot", + "Histogram Plot" + ] + ) + + table_concat = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Correlation ratio", + "Absolute Value of Correlation", + "Violin Plot", + "Histogram Plot" + ] + ) + + # 3. 
Fill in the Correlation tables and draw plots + # Explanation: First it calculate correlation and add it to the proper table, then it creates plots. + + for index, tup in enumerate(combo): + if tup[0] in predictors_con and tup[1] in predictors_con: + x_label = df[tup[0]] + y_label = df[tup[1]] + pearson = scipy.stats.pearsonr(x_label, y_label).statistic + path = "{}/{}_{}_{}".format("con_plot", tup[0], tup[1], "con_plot.html") + variable_con_plot(x_label, y_label, path) + new = [x_label.name, y_label.name, pearson, np.abs(pearson), path] + table_con.loc[len(table_con)] = new + elif tup[0] in predictors_cat and tup[1] in predictors_cat: + x_label = df[tup[0]] + y_label = df[tup[1]] + fill_na(df) + correlation = cat_correlation(x_label, y_label) + path = "{}/{}_{}_{}".format("cat_plot", tup[0], tup[1], "cat_plot.html") + variable_cat_plot(x_label, y_label, path) + new = [x_label.name, y_label.name, correlation, np.abs(correlation), path] + table_cat.loc[len(table_cat)] = new + elif tup[0] in predictors_cat and tup[1] in predictors_con: + x_label = df[tup[0]] + y_label = df[tup[1]] + correlation = cat_cont_correlation(x_label, y_label) + path1 = "{}/{}_{}_{}".format("catcon_plot", tup[0], tup[1], "catcon_plot1.html") + path2 = "{}/{}_{}_{}".format("catcon_plot", tup[0], tup[1], "catcon_plot2.html") + hist(x_label, y_label, path1) + violin(x_label, y_label, path2) + new = [x_label.name, y_label.name, correlation, np.abs(correlation), path1, path2] + table_catcon.loc[len(table_catcon)] = new + elif tup[0] in predictors_con and tup[1] in predictors_cat: + x_label = df[tup[1]] + y_label = df[tup[0]] + correlation = cat_cont_correlation(x_label, y_label) + path1 = "{}/{}_{}_{}".format("concat_plot", tup[0], tup[1], "concat_plot1.html") + path2 = "{}/{}_{}_{}".format("concat_plot", tup[0], tup[1], "concat_plot2.html") + violin(x_label, y_label, path2) + hist(x_label, y_label, path1) + new = [x_label.name, y_label.name, correlation, np.abs(correlation), path1, path2] + 
table_concat.loc[len(table_concat)] = new + table_final = pd.concat([table_concat, table_catcon]) + + dataframe_to_html(table_cat, ["heatmap"], "tablecat.html") + dataframe_to_html(table_con, ["Linear Regression Plot"], "tablecon.html") + dataframe_to_html(table_final, ["Violin Plot", "Histogram Plot"], "tableboth.html") + # *************** Brute Force for all 3 possibilities ****************** + + # 1. creating the tables: + + brute_force_con = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + brute_force_cat = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + brute_force_both1 = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + brute_force_both2 = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + # 2. 
Fill in the Brute Force tables + + for i in responses: + if response_type == "continuous": + for index, tup in enumerate(combo): + if tup[0] in predictors_con and tup[1] in predictors_con: + x = tup[0] + y = tup[1] + dataset = data[[x, y]] + model = sm.OLS(df[i], dataset, axis=1).fit() + pred = model.predict() + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + # corr = model.rsquared ** .5 + con_new = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_con.loc[len(brute_force_con)] = con_new + elif tup[0] in predictors_cat and tup[1] in predictors_cat: + x = tup[0] + y = tup[1] + dataset = df[[x, y]] + dt = onehotencoder(dataset, dataset) + model = sm.OLS(df[i], dt, axis=1).fit() + pred = model.predict() + # corr = model.rsquared ** .5 + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + cat_new = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_cat.loc[len(brute_force_cat)] = cat_new + elif tup[0] in predictors_cat and tup[1] in predictors_con: + x = tup[0] + y = tup[1] + dataset = df[[x, y]] + dt = onehotencoder(dataset, df[x]) + model = sm.OLS(df[i], dt, axis=1).fit() + pred = model.predict() + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + both1 = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_both1.loc[len(brute_force_both1)] = both1 + elif tup[0] in predictors_con and tup[1] in predictors_cat: + x = tup[0] + y = tup[1] + dataset = df[[x, y]] + dt = onehotencoder(dataset, df[y]) + model = sm.OLS(df[i], dt, axis=1).fit() + pred = model.predict() + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + + both2 = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_both2.loc[len(brute_force_both2)] = both2 + brute_force_both = pd.concat([brute_force_both1, brute_force_both2]) + + # *************** heatmap for all 3 
possibilities ****************** + + # sns.heatmap(df_continuous) + # plt.show() + + # df_cat = df_categorical.apply(lambda x: pd.factorize(x)[0]).corr(method="pearson", min_periods=1) + # sns.heatmap(df_cat, annot=True) + # plt.show() + + # df_all = df.apply(lambda x: pd.factorize(x)[0]).corr(method="pearson", min_periods=1) + # sns.heatmap(df_all, annot=True) + # plt.show() + + # dataframe_to_html(brute_force_con, ["Residual Plot"], "brutetablcon.html") + # *************** Printing Tables for Each Possibility ****************** + + # # 1. both categorical + print(table_cat.sort_values(["Cramers V"], ascending=[False]).to_string()) + print(brute_force_cat.to_string()) + + # # 2. both continuous + print(table_con.sort_values(["Pearson Correlation"], ascending=[False]).to_string()) + print(brute_force_con.to_string()) + + # # 3. categorical and continuous + print(table_final.sort_values(["Correlation ratio"], ascending=[False]).to_string()) + print(brute_force_both.to_string()) + + + return + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/README.md b/README.md index 3cd99cc..c31fa59 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ - Setup a python 3.x venv (usually in `.venv`) - You can run `./scripts/create-venv.sh` to generate one - `pip3 install --upgrade pip` -- Install pip-tools `pip3 install pip-tools` -- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade` +- Install pip-tools `pip3 install pip-tools` +- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade` - Update requirements: `pip-compile --output-file=requirements.txt requirements.in --upgrade` - Install dev requirements `pip3 install -r requirements.dev.txt` - Install requirements `pip3 install -r requirements.txt` diff --git a/mariadb-java-client-3.0.8.jar b/mariadb-java-client-3.0.8.jar new file mode 100644 index 0000000..0c21e78 Binary files /dev/null and b/mariadb-java-client-3.0.8.jar differ diff
--git a/midterm.py b/midterm.py new file mode 100644 index 0000000..1fd77c2 --- /dev/null +++ b/midterm.py @@ -0,0 +1,441 @@ +import bisect +import itertools +import math +import statistics +import sys +import warnings +from collections import defaultdict + +import numpy +import numpy as np +import pandas +import pandas as pd +import scipy.stats +import statsmodels +import statsmodels.api as sm +from scipy import stats +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import OneHotEncoder + +# *************** Defining all necessary functions ****************** + + +def onehotencoder(df, df_cat): + onehotencoder = OneHotEncoder(handle_unknown="ignore") + encoder = onehotencoder.fit_transform(df_cat.values.reshape(-1, 1)).toarray() + dfOneHot = pd.DataFrame(encoder) + data = pd.concat([df.select_dtypes(exclude=["object"]), dfOneHot], axis=1) + data = data.head(len(df)) + return data + + +def boundaries(X): + stdev = statistics.stdev(X) + bin_width = 3.49 * (stdev) * (len(X)) ** (-(1 / 3)) + bin_number = round(math.sqrt(len(X))) + sorted_pred = sorted(X) + + boundaries = [] + for i in range(0, bin_number): + boundaries.append(sorted_pred[0] + bin_width * i) + return boundaries + + +def MeanSquaredDiff(X): + bn = boundaries(X) + pop_mean = statistics.mean(X) + dic = defaultdict(list) + total_population = len(X) + + for x in X: + ind = bisect.bisect_right(bn, x) + dic[ind].append(x) + list_df = list(dic.values()) + + for j in range(0, len(list_df) - 1): + chunk_mean = statistics.mean(list_df[j]) + msf = (chunk_mean - pop_mean) ** 2 + MeanSquaredDiff = msf.sum() / total_population + return MeanSquaredDiff + + +def WeightedMeanSquaredDiff(X): + bn = boundaries(X) + pop_mean = statistics.mean(X) + dic = defaultdict(list) + total_population = len(X) + + for x in X: + ind = bisect.bisect_right(bn, x) + dic[ind].append(x) + list_df = list(dic.values()) + + for j in range(0, len(list_df) - 1): 
+ chunk_mean = statistics.mean(list_df[j]) + PopulationProportion = len(list_df[j]) / total_population + msf = (chunk_mean - pop_mean) ** 2 + weightedMeanSquaredDiff = (PopulationProportion * msf.sum()) / total_population + return weightedMeanSquaredDiff + + +def cat_correlation(x, y, bias_correction=True, tschuprow=False): + corr_coeff = numpy.nan + try: + x, y = fill_na(x), fill_na(y) + crosstab_matrix = pandas.crosstab(x, y) + n_observations = crosstab_matrix.sum().sum() + + yates_correct = True + if bias_correction: + if crosstab_matrix.shape == (2, 2): + yates_correct = False + + chi2, _, _, _ = stats.chi2_contingency( + crosstab_matrix, correction=yates_correct + ) + phi2 = chi2 / n_observations + + # r and c are number of categories of x and y + r, c = crosstab_matrix.shape + if bias_correction: + phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1)) + r_corrected = r - ((r - 1) ** 2) / (n_observations - 1) + c_corrected = c - ((c - 1) ** 2) / (n_observations - 1) + if tschuprow: + corr_coeff = numpy.sqrt( + phi2_corrected / numpy.sqrt((r_corrected - 1) * (c_corrected - 1)) + ) + return corr_coeff + corr_coeff = numpy.sqrt( + phi2_corrected / min((r_corrected - 1), (c_corrected - 1)) + ) + return corr_coeff + if tschuprow: + corr_coeff = numpy.sqrt(phi2 / numpy.sqrt((r - 1) * (c - 1))) + return corr_coeff + corr_coeff = numpy.sqrt(phi2 / min((r - 1), (c - 1))) + return corr_coeff + except Exception as ex: + print(ex) + if tschuprow: + warnings.warn("Error calculating Tschuprow's T", RuntimeWarning) + else: + warnings.warn("Error calculating Cramer's V", RuntimeWarning) + return corr_coeff + + +def cat_cont_correlation(categories, values): + f_cat, _ = pandas.factorize(categories) + cat_num = numpy.max(f_cat) + 1 + y_avg_array = numpy.zeros(cat_num) + n_array = numpy.zeros(cat_num) + for i in range(0, cat_num): + cat_measures = values[numpy.argwhere(f_cat == i).flatten()] + n_array[i] = len(cat_measures) + y_avg_array[i] = 
numpy.average(cat_measures) + y_total_avg = numpy.sum(numpy.multiply(y_avg_array, n_array)) / numpy.sum(n_array) + numerator = numpy.sum( + numpy.multiply( + n_array, numpy.power(numpy.subtract(y_avg_array, y_total_avg), 2) + ) + ) + denominator = numpy.sum(numpy.power(numpy.subtract(values, y_total_avg), 2)) + if numerator == 0: + eta = 0.0 + else: + eta = numpy.sqrt(numerator / denominator) + return eta + + +def fill_na(data): + if isinstance(data, pd.Series): + return data.fillna(0) + else: + return numpy.array([value if value is not None else 0 for value in data]) + + +def main(): + # *************** Reading Dataset ****************** + + # Explanation 1: For the sake of running time, I'm doing all the analysis on the first 100 rows of dataset. + # You can run whole code with changing df_full to df and removing line 13. + + # Explanation 2: I deleted all columns with only 1 unique values, since they cannot contribute to the model. + + df_full = pd.read_csv( + "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz" + ) + + df = df_full.head(100) + + for col in df.columns: + if len(df[col].unique()) == 1: + df.drop(col, inplace=True, axis=1) + + print(df.to_string()) + + # *************** Identifying Response and Predictors and Their Type ****************** + + responses = ["traffic_volume"] + predictors = ["temp", "clouds_all", "weather_main", "weather_description"] + + response_type = "" + + for i in responses: + if df[i].nunique() == 2: + df[i] = df[i].astype("bool") + df.replace({False: 0, True: 1}, inplace=True) + response_type = "categorical" + else: + response_type = "continuous" + + predictors_type = {"continuous": [], "categorical": []} + continuous = df.select_dtypes(include=["float", "int"]) + for i in predictors: + if i in list(continuous) and df[i].nunique() > 5: + predictors_type["continuous"].append(i) + else: + predictors_type["categorical"].append(i) + + print("Response variable is:", *responses) + 
print("Response type is:", response_type) + + print("Predictor variables are:", predictors) + print("Predictors types:", predictors_type) + + # dividing dataframes to categorical and continuous + + for key, value in predictors_type.items(): + if key == "continuous": + df_continuous = df[value] + else: + df_categorical = df[value] + + print(df_continuous) + print(df_categorical) + + # creating list for continuous and categorical variables for iteration purposes + + predictors_con = [] + predictors_cat = [] + for i in predictors: + if i in df_continuous: + predictors_con.append(i) + else: + predictors_cat.append(i) + + # *************** Handling Null Values ****************** + + for col in df.columns: + if ( + df[col].dtypes == "float" + or df[col].dtypes == "int" + and df[col].nunique() > 5 + ): + df[col].fillna((df[col].mean()), inplace=True) + else: + df = df.apply(lambda col: col.fillna(col.value_counts().index[0])) + + # *************** One Hot Encoder ****************** + + data_cat = df.select_dtypes("object") + data = onehotencoder(df, data_cat) + + # *************** Test and Train Datasets ****************** + + for i in responses: + x = data.drop(i, axis=1) + y = data[i] + + x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20) + + for i in responses: + if response_type == "categorical": + logr = LogisticRegression() + logr_fitted = logr.fit(x_train, y_train) + logr_predict = logr_fitted.predict(x_test) + print(logr_predict) + print(logr_fitted.summary()) + else: + feature_name = i + ols_predict = statsmodels.api.add_constant(x) + ols = statsmodels.api.OLS(y, ols_predict) + ols_fitted = ols.fit() + predictor_ols = ols_fitted.predict() + print(predictor_ols) + print(f"Variable: {feature_name}") + print(ols_fitted.summary()) + + # *************** Correlation Tables for all 3 possibilities ****************** + + # 1. creating permutations: + + combo = set(itertools.combinations(predictors, 2)) + + # 2. 
creating the tables: + + table_con = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Pearson Correlation", + "Absolute Value of Correlation", + ] + ) + + table_cat = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Cramers V", + "Absolute Value of Correlation", + ] + ) + + table_catcon = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Correlation ratio", + "Absolute Value of Correlation", + ] + ) + + table_concat = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Correlation ratio", + "Absolute Value of Correlation", + ] + ) + + # 3. Fill in the Correlation tables + + for index, tup in enumerate(combo): + if tup[0] in predictors_con and tup[1] in predictors_con: + x_label = df[tup[0]] + y_label = df[tup[1]] + pearson = scipy.stats.pearsonr(x_label, y_label).statistic + new = [x_label.name, y_label.name, pearson, np.abs(pearson)] + table_con.loc[len(table_con)] = new + elif tup[0] in predictors_cat and tup[1] in predictors_cat: + x_label = df[tup[0]] + y_label = df[tup[1]] + fill_na(df) + correlation = cat_correlation(x_label, y_label) + new = [x_label.name, y_label.name, correlation, np.abs(correlation)] + table_cat.loc[len(table_cat)] = new + elif tup[0] in predictors_cat and tup[1] in predictors_con: + x_label = df[tup[0]] + y_label = df[tup[1]] + correlation = cat_cont_correlation(x_label, y_label) + new = [x_label.name, y_label.name, correlation, np.abs(correlation)] + table_catcon.loc[len(table_catcon)] = new + elif tup[0] in predictors_con and tup[1] in predictors_cat: + x_label = df[tup[1]] + y_label = df[tup[0]] + correlation = cat_cont_correlation(x_label, y_label) + new = [x_label.name, y_label.name, correlation, np.abs(correlation)] + table_concat.loc[len(table_concat)] = new + table_final = pd.concat([table_concat, table_catcon]) + + # *************** Brute Force for all 3 possibilities ****************** + + # 1. 
creating the tables: + + brute_force_con = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + brute_force_cat = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + brute_force_both = pd.DataFrame( + columns=[ + "predictor 1", + "predictor 2", + "Difference of Mean Response", + "Weighted Difference of Mean Response", + ] + ) + + # 2. Fill in the Brute Force tables + + for i in responses: + if response_type == "continuous": + for index, tup in enumerate(combo): + if tup[0] in predictors_con and tup[1] in predictors_con: + x = tup[0] + y = tup[1] + dataset = data[[x, y]] + model = sm.OLS(df[i], dataset, axis=1).fit() + pred = model.predict() + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + con_new = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_con.loc[len(brute_force_con)] = con_new + if tup[0] in predictors_cat and tup[1] in predictors_cat: + x = tup[0] + y = tup[1] + dataset = df[[x, y]] + dt = onehotencoder(dataset, dataset) + model = sm.OLS(df[i], dt, axis=1).fit() + pred = model.predict() + # corr = model.rsquared ** .5 + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + cat_new = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_cat.loc[len(brute_force_cat)] = cat_new + if tup[0] in predictors_cat and tup[1] in predictors_con: + x = tup[0] + y = tup[1] + dataset = df[[x, y]] + dt = onehotencoder(dataset, df[x]) + model = sm.OLS(df[i], dt, axis=1).fit() + pred = model.predict() + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + both = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_both.loc[len(brute_force_both)] = both + if tup[0] in predictors_con and tup[1] in predictors_cat: 
+ x = tup[0] + y = tup[1] + dataset = df[[x, y]] + dt = onehotencoder(dataset, df[y]) + model = sm.OLS(df[i], dt, axis=1).fit() + pred = model.predict() + meansquareddiff = MeanSquaredDiff(pred) + weightedmeansquareddiff = WeightedMeanSquaredDiff(pred) + both = [x, y, meansquareddiff, weightedmeansquareddiff] + brute_force_both.loc[len(brute_force_both)] = both + + # *************** Printing Tables for Each Possibility ****************** + + # 1. both categorical + print(table_cat.sort_values(["Cramers V"], ascending=[False]).to_string()) + print(brute_force_cat.to_string()) + + # 2. both continuous + print(table_con.sort_values(["Pearson Correlation"], ascending=[False]).to_string()) + print(brute_force_con.to_string()) + + # 2. categorical and continuous + print(table_final.sort_values(["Correlation ratio"], ascending=[False]).to_string()) + print(brute_force_both.to_string()) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/requirements.dev.txt b/requirements.dev.txt index e05a14b..90830c3 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,11 +1,15 @@ # -# This file is autogenerated by pip-compile with python 3.8 +# This file is autogenerated by pip-compile with python 3.10 # To update, run: # # pip-compile --output-file=requirements.dev.txt requirements.dev.in # -anyio==3.6.1 +anyio==3.6.2 # via jupyter-server +appnope==0.1.3 + # via + # ipykernel + # ipython argon2-cffi==21.3.0 # via # jupyter-server @@ -13,9 +17,9 @@ argon2-cffi==21.3.0 # notebook argon2-cffi-bindings==21.2.0 # via argon2-cffi -asttokens==2.0.5 +asttokens==2.0.8 # via stack-data -attrs==21.4.0 +attrs==22.1.0 # via jsonschema babel==2.10.3 # via jupyterlab-server @@ -29,19 +33,19 @@ bleach==5.0.1 # via nbconvert build==0.8.0 # via pip-tools -certifi==2022.6.15 +certifi==2022.9.24 # via requests cffi==1.15.1 # via argon2-cffi-bindings cfgv==3.3.1 # via pre-commit -charset-normalizer==2.1.0 +charset-normalizer==2.1.1 # via requests click==8.1.3 # via # black # 
pip-tools -debugpy==1.6.2 +debugpy==1.6.3 # via ipykernel decorator==5.1.1 # via ipython @@ -49,35 +53,29 @@ defusedxml==0.7.1 # via nbconvert detect-secrets==1.3.0 # via -r requirements.dev.in -distlib==0.3.5 +distlib==0.3.6 # via virtualenv entrypoints==0.4 - # via - # jupyter-client - # nbconvert -executing==0.9.1 + # via jupyter-client +executing==1.1.1 # via stack-data -fastjsonschema==2.16.1 +fastjsonschema==2.16.2 # via nbformat -filelock==3.7.1 +filelock==3.8.0 # via virtualenv flake8==4.0.1 # via -r requirements.dev.in -identify==2.5.2 +identify==2.5.6 # via pre-commit -idna==3.3 +idna==3.4 # via # anyio # requests -importlib-metadata==4.12.0 - # via jupyterlab-server -importlib-resources==5.9.0 - # via jsonschema -ipykernel==6.15.1 +ipykernel==6.16.1 # via # nbclassic # notebook -ipython==8.4.0 +ipython==8.5.0 # via # ipykernel # jupyterlab @@ -97,20 +95,20 @@ jinja2==3.1.2 # nbclassic # nbconvert # notebook -json5==0.9.8 +json5==0.9.10 # via jupyterlab-server -jsonschema==4.7.2 +jsonschema==4.16.0 # via # jupyterlab-server # nbformat -jupyter-client==7.3.4 +jupyter-client==7.4.3 # via # ipykernel # jupyter-server # nbclassic # nbclient # notebook -jupyter-core==4.11.1 +jupyter-core==4.11.2 # via # jupyter-client # jupyter-server @@ -119,49 +117,51 @@ jupyter-core==4.11.1 # nbconvert # nbformat # notebook -jupyter-server==1.18.1 +jupyter-server==1.21.0 # via # jupyterlab # jupyterlab-server # nbclassic # notebook-shim -jupyterlab==3.4.4 +jupyterlab==3.4.8 # via -r requirements.dev.in jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.15.0 +jupyterlab-server==2.16.1 # via jupyterlab markupsafe==2.1.1 # via # jinja2 # nbconvert -matplotlib-inline==0.1.3 +matplotlib-inline==0.1.6 # via # ipykernel # ipython mccabe==0.6.1 # via flake8 -mistune==0.8.4 +mistune==2.0.4 # via nbconvert mypy-extensions==0.4.3 # via black -nbclassic==0.4.3 - # via jupyterlab -nbclient==0.6.6 +nbclassic==0.4.5 + # via + # jupyterlab + # notebook +nbclient==0.7.0 # via 
nbconvert -nbconvert==6.5.0 +nbconvert==7.2.2 # via # jupyter-server # nbclassic # notebook -nbformat==5.4.0 +nbformat==5.7.0 # via # jupyter-server # nbclassic # nbclient # nbconvert # notebook -nest-asyncio==1.5.5 +nest-asyncio==1.5.6 # via # ipykernel # jupyter-client @@ -172,9 +172,9 @@ nodeenv==1.7.0 # via pre-commit nose==1.3.7 # via -r requirements.dev.in -notebook==6.4.12 +notebook==6.5.1 # via jupyterlab -notebook-shim==0.1.0 +notebook-shim==0.2.0 # via nbclassic packaging==21.3 # via @@ -188,15 +188,15 @@ pandocfilters==1.5.0 # via nbconvert parso==0.8.3 # via jedi -pathspec==0.9.0 +pathspec==0.10.1 # via black -pep517==0.12.0 +pep517==0.13.0 # via build pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.8.0 +pip-tools==6.9.0 # via -r requirements.dev.in platformdirs==2.5.2 # via @@ -204,14 +204,14 @@ platformdirs==2.5.2 # virtualenv pre-commit==2.20.0 # via -r requirements.dev.in -prometheus-client==0.14.1 +prometheus-client==0.15.0 # via # jupyter-server # nbclassic # notebook -prompt-toolkit==3.0.30 +prompt-toolkit==3.0.31 # via ipython -psutil==5.9.1 +psutil==5.9.3 # via ipykernel ptyprocess==0.7.0 # via @@ -225,7 +225,7 @@ pycparser==2.21 # via cffi pyflakes==2.4.0 # via flake8 -pygments==2.12.0 +pygments==2.13.0 # via # ipython # nbconvert @@ -235,13 +235,13 @@ pyrsistent==0.18.1 # via jsonschema python-dateutil==2.8.2 # via jupyter-client -pytz==2022.1 +pytz==2022.5 # via babel pyyaml==6.0 # via # detect-secrets # pre-commit -pyzmq==23.2.0 +pyzmq==24.0.1 # via # ipykernel # jupyter-client @@ -262,18 +262,18 @@ six==1.16.0 # asttokens # bleach # python-dateutil -sniffio==1.2.0 +sniffio==1.3.0 # via anyio soupsieve==2.3.2.post1 # via beautifulsoup4 -stack-data==0.3.0 +stack-data==0.5.1 # via ipython -terminado==0.15.0 +terminado==0.16.0 # via # jupyter-server # nbclassic # notebook -tinycss2==1.1.1 +tinycss2==1.2.1 # via nbconvert toml==0.10.2 # via pre-commit @@ -281,6 +281,7 @@ tomli==2.0.1 # via # black # build + # 
jupyterlab # pep517 tornado==6.2 # via @@ -291,7 +292,7 @@ tornado==6.2 # nbclassic # notebook # terminado -traitlets==5.3.0 +traitlets==5.5.0 # via # ipykernel # ipython @@ -304,11 +305,9 @@ traitlets==5.3.0 # nbconvert # nbformat # notebook -typing-extensions==4.3.0 - # via black -urllib3==1.26.11 +urllib3==1.26.12 # via requests -virtualenv==20.16.1 +virtualenv==20.16.5 # via pre-commit wcwidth==0.2.5 # via prompt-toolkit @@ -316,14 +315,10 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.3.3 +websocket-client==1.4.1 # via jupyter-server wheel==0.37.1 # via pip-tools -zipp==3.8.1 - # via - # importlib-metadata - # importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements.in b/requirements.in index 296d654..762e489 100644 --- a/requirements.in +++ b/requirements.in @@ -1 +1,13 @@ -numpy \ No newline at end of file +fastapi +flake8==5.0.4 +isort[requirements] +numpy +pandas +pip-tools +pre-commit +plotly +scikit-learn +statsmodels +matplotlib +matplotlib +dython diff --git a/requirements.txt b/requirements.txt index b000696..498d8d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,154 @@ # -# This file is autogenerated by pip-compile with python 3.8 +# This file is autogenerated by pip-compile with python 3.10 # To update, run: # # pip-compile --output-file=requirements.txt requirements.in # -numpy==1.23.1 +anyio==3.6.2 + # via starlette +build==0.8.0 + # via pip-tools +cfgv==3.3.1 + # via pre-commit +click==8.1.3 + # via pip-tools +contourpy==1.0.5 + # via matplotlib +cycler==0.11.0 + # via matplotlib +distlib==0.3.6 + # via virtualenv +dython==0.7.2 # via -r requirements.in +fastapi==0.85.1 + # via -r requirements.in +filelock==3.8.0 + # via virtualenv +flake8==5.0.4 + # via -r requirements.in +fonttools==4.38.0 + # via matplotlib +identify==2.5.6 + # via pre-commit +idna==3.4 + # via anyio +isort[requirements]==5.10.1 + # via -r requirements.in 
+joblib==1.2.0 + # via + # scikit-learn + # scikit-plot +kiwisolver==1.4.4 + # via matplotlib +matplotlib==3.6.0 + # via + # -r requirements.in + # dython + # scikit-plot + # seaborn +mccabe==0.7.0 + # via flake8 +nodeenv==1.7.0 + # via pre-commit +numpy==1.23.4 + # via + # -r requirements.in + # contourpy + # dython + # matplotlib + # pandas + # patsy + # scikit-learn + # scipy + # seaborn + # statsmodels +packaging==21.3 + # via + # build + # matplotlib + # statsmodels +pandas==1.5.1 + # via + # -r requirements.in + # dython + # seaborn + # statsmodels +patsy==0.5.3 + # via statsmodels +pep517==0.13.0 + # via build +pillow==9.2.0 + # via matplotlib +pip-tools==6.9.0 + # via -r requirements.in +platformdirs==2.5.2 + # via virtualenv +plotly==5.10.0 + # via -r requirements.in +pre-commit==2.20.0 + # via -r requirements.in +psutil==5.9.3 + # via dython +pycodestyle==2.9.1 + # via flake8 +pydantic==1.10.2 + # via fastapi +pyflakes==2.5.0 + # via flake8 +pyparsing==3.0.9 + # via + # matplotlib + # packaging +python-dateutil==2.8.2 + # via + # matplotlib + # pandas +pytz==2022.5 + # via pandas +pyyaml==6.0 + # via pre-commit +scikit-learn==1.1.2 + # via + # -r requirements.in + # dython + # scikit-plot +scikit-plot==0.3.7 + # via dython +scipy==1.9.3 + # via + # dython + # scikit-learn + # scikit-plot + # statsmodels +seaborn==0.12.1 + # via dython +six==1.16.0 + # via + # patsy + # python-dateutil +sniffio==1.3.0 + # via anyio +starlette==0.20.4 + # via fastapi +statsmodels==0.13.2 + # via -r requirements.in +tenacity==8.1.0 + # via plotly +threadpoolctl==3.1.0 + # via scikit-learn +toml==0.10.2 + # via pre-commit +tomli==2.0.1 + # via + # build + # pep517 +typing-extensions==4.4.0 + # via pydantic +virtualenv==20.16.5 + # via pre-commit +wheel==0.37.1 + # via pip-tools + +# The following packages are considered to be unsafe in a requirements file: +# pip +# setuptools