From 4b3eee09882217c3053cdcedd05e04b6b16e6cdc Mon Sep 17 00:00:00 2001
From: TDerig23 <89554971+TDerig23@users.noreply.github.com>
Date: Wed, 26 Oct 2022 23:26:34 -0700
Subject: [PATCH 1/2] midterm_wednesday_upload

---
 bda602_midterm.py | 197 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 bda602_midterm.py

diff --git a/bda602_midterm.py b/bda602_midterm.py
new file mode 100644
index 0000000..68d2240
--- /dev/null
+++ b/bda602_midterm.py
@@ -0,0 +1,197 @@
+import random
+from typing import List
+from scipy import stats
+import pandas as pd
+import seaborn
+from sklearn import datasets
+import sys
+
+import numpy
+from plotly import express as px
+from plotly import figure_factory as ff
+from plotly import graph_objects as go
+from sklearn.metrics import confusion_matrix
+
+titanic_df = pd.read_csv(
+    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
+
+
+TITANIC_PREDICTORS = [
+    "pclass",
+    "sex",
+    "age",
+    "sibsp",
+    "embarked",
+    "parch",
+    "fare",
+    "who",
+    "adult_male",
+    "deck",
+    "embark_town",
+    "alone",
+    "class",
+]
+
+dataframe = pd.DataFrame()
+
+
+def get_test_data_set(data_set_name: str = None) -> (pd.DataFrame, List[str], str):
+    """Function to load a few test data sets
+
+    :param:
+    data_set_name : string, optional
+        Data set to load
+
+    :return:
+    data_set : :class:`pandas.DataFrame`
+        Tabular data, possibly with some preprocessing applied.
+    predictors : list[str]
+        List of predictor variables
+    response : str
+        Response variable
+    """
+    seaborn_data_sets = ["mpg", "tips", "titanic", "titanic_2"]
+    sklearn_data_sets = ["boston", "diabetes", "breast_cancer"]
+    all_data_sets = seaborn_data_sets + sklearn_data_sets
+
+    if data_set_name is None:
+        data_set_name = random.choice(all_data_sets)
+    else:
+        if data_set_name not in all_data_sets:
+            raise Exception(f"Data set choice not valid: {data_set_name}")
+
+    if data_set_name in seaborn_data_sets:
+        if data_set_name == "mpg":
+            data_set = seaborn.load_dataset(name="mpg").dropna().reset_index()
+            predictors = [
+                "cylinders",
+                "displacement",
+                "horsepower",
+                "weight",
+                "acceleration",
+                "origin",
+                "name",
+            ]
+            response = "mpg"
+        elif data_set_name == "tips":
+            data_set = seaborn.load_dataset(name="tips").dropna().reset_index()
+            predictors = [
+                "total_bill",
+                "sex",
+                "smoker",
+                "day",
+                "time",
+                "size",
+            ]
+            response = "tip"
+        elif data_set_name == "titanic":
+            data_set = seaborn.load_dataset(name="titanic").dropna()
+            predictors = TITANIC_PREDICTORS
+            response = "survived"
+        elif data_set_name == "titanic_2":
+            data_set = seaborn.load_dataset(name="titanic").dropna()
+            predictors = TITANIC_PREDICTORS
+            response = "alive"
+    elif data_set_name in sklearn_data_sets:
+        if data_set_name == "boston":
+            data = datasets.load_boston()
+            data_set = pd.DataFrame(data.data, columns=data.feature_names)
+            data_set["CHAS"] = data_set["CHAS"].astype(str)
+        elif data_set_name == "diabetes":
+            data = datasets.load_diabetes()
+            data_set = pd.DataFrame(data.data, columns=data.feature_names)
+            data_set["gender"] = ["1" if i > 0 else "0" for i in data_set["sex"]]
+        elif data_set_name == "breast_cancer":
+            data = datasets.load_breast_cancer()
+            data_set = pd.DataFrame(data.data, columns=data.feature_names)
+
+        data_set["target"] = data.target
+        predictors = data.feature_names
+        response = "target"
+
+    print(f"Data set selected: {data_set_name}")
+    return data_set, predictors, response
+
+
+dataframe, predictors, response = get_test_data_set("breast_cancer")
+
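Note that the "boston" branch above relies on `datasets.load_boston()`, which newer scikit-learn releases deprecate and then remove; it still exists under the scikit-learn==1.1.3 pin in requirements.txt. A rough alternative sketch, assuming the OpenML copy of the dataset (the name and version below are assumptions, not part of this patch):

# Sketch only: fetch the Boston housing data from OpenML instead of the removed loader.
# The OpenML dataset name/version are assumed; column names mirror the original loader.
from sklearn.datasets import fetch_openml

boston = fetch_openml(name="boston", version=1, as_frame=True)
data_set = boston.data
data_set["CHAS"] = data_set["CHAS"].astype(str)  # keep CHAS categorical, as the original branch does
data_set["target"] = boston.target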
+
+sibsp = dataframe['worst texture'].tolist()
+
+
+# don't separate the dataframe; determine if predictors are continuous or categorical and make a list from it.
+## also use a separate function to determine if the response has nunique == 2, then it is boolean (use a label encoder)
+## loop through to find the datatype of each predictor and create a list of the types.
+def column_sep(dataframe):
+    cat_df = pd.DataFrame()
+    cont_df = pd.DataFrame()
+    for column in dataframe.columns:
+        if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len(
+                pd.unique(dataframe[column])) == 2:
+            cat_df[column] = dataframe[column]
+        else:
+            cont_df[column] = dataframe[column]
+
+    return cat_df, cont_df
+
+
+cat_df, cont_df = column_sep(dataframe)
+
+
+# same idea as above, but return a {column: "continuous"/"categorical"} dict instead of two dataframes.
+def column_sep(dataframe):
+    predictor_dict = {}
+    for column in dataframe.columns:
+        if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len(
+                pd.unique(dataframe[column])) == 2:
+            predictor_dict[column] = "categorical"
+        else:
+            predictor_dict[column] = "continuous"
+
+    return predictor_dict
+
+
+predictors = column_sep(dataframe)
+
+## print the name and detected datatype of each predictor
+for key, value in predictors.items():
+    print(key, value)
+
+
+def type_chooser(predictors):
+    # Pearson's r for every continuous / continuous predictor pair
+    columns = list(predictors.keys())
+    rows = []
+    for i, col1 in enumerate(columns):
+        for col2 in columns[i + 1:]:
+            if predictors[col1] == "continuous" and predictors[col2] == "continuous":
+                res = stats.pearsonr(dataframe[col1], dataframe[col2])
+                rows.append({"Predictor 1": col1, "Predictor 2": col2, "Pearsons_R": res[0]})
+            # continuous / categorical pairs need the correlation ratio instead of Pearson's r (see notes below)
+    return pd.DataFrame(rows)
+
+
+res = type_chooser(predictors)
+
+print(res)
+
+res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
+print(res)
+
+## for Pearson's r (continuous v continuous): loop on the columns of cont_df (double for loop), get the Pearson correlation, store it in a dataframe
+## enumerate the index
+## Continuous / Categorical pairs: use the correlation ratio
+# Categorical / Categorical pairs: use cat_correlation
+
+## use a for loop to determine the graphing types: for loop, then if/else to determine whether the response is boolean or not,
+## then map to the correct correlation type.
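The notes above sketch the plan: tag each predictor as continuous or categorical, treat a two-value response as boolean via a label encoder, and pick the correlation metric by pair type (Pearson's r for cont/cont, correlation ratio for cont/cat, Cramér's V for cat/cat). A minimal sketch of that dispatch, assuming the `cat_correlation` and `cat_cont_correlation_ratio` helpers introduced in the second commit below are in scope:

# Sketch of the per-pair metric dispatch described in the notes above.
# Assumes `predictors` is the {column: "continuous"/"categorical"} dict from column_sep,
# and that cat_correlation / cat_cont_correlation_ratio (second commit) are importable here.
from sklearn import preprocessing


def response_is_boolean(series):
    # two unique values -> treat the response as boolean
    return series.nunique() == 2


def encode_response(series):
    # label-encode a boolean/categorical response into 0/1 integers
    return preprocessing.LabelEncoder().fit_transform(series)


def pair_correlation(col1, col2, predictors, dataframe):
    type1, type2 = predictors[col1], predictors[col2]
    if type1 == "continuous" and type2 == "continuous":
        return stats.pearsonr(dataframe[col1], dataframe[col2])[0]
    if type1 == "categorical" and type2 == "categorical":
        return cat_correlation(dataframe[col1], dataframe[col2])
    # mixed pair: the correlation ratio expects (categories, values)
    cat_col, cont_col = (col1, col2) if type1 == "categorical" else (col2, col1)
    return cat_cont_correlation_ratio(dataframe[cat_col].to_numpy(), dataframe[cont_col].to_numpy())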
+ + +# for predictor1 in cat df + + + From 2afbb17ca04cb16700729e6fcff0bd7ec84065d9 Mon Sep 17 00:00:00 2001 From: TDerig23 <89554971+TDerig23@users.noreply.github.com> Date: Sat, 29 Oct 2022 20:29:31 -0700 Subject: [PATCH 2/2] midterm_upload --- bda602_midterm.py | 276 ++++++++++++++++++++++++++++++++----------- requirements.dev.in | 16 +-- requirements.dev.txt | 119 ++++++++++--------- requirements.in | 11 +- requirements.txt | 80 ++++++++++++- 5 files changed, 365 insertions(+), 137 deletions(-) diff --git a/bda602_midterm.py b/bda602_midterm.py index 68d2240..6c18082 100644 --- a/bda602_midterm.py +++ b/bda602_midterm.py @@ -5,15 +5,13 @@ import seaborn from sklearn import datasets import sys - +import warnings import numpy from plotly import express as px from plotly import figure_factory as ff from plotly import graph_objects as go from sklearn.metrics import confusion_matrix -titanic_df = pd.read_csv( - "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv") TITANIC_PREDICTORS = [ @@ -116,82 +114,220 @@ def get_test_data_set(data_set_name: str = None) -> (pd.DataFrame, List[str], st dataframe, predictors, response = get_test_data_set("breast_cancer") - -sibsp = dataframe['worst texture'].tolist() - - -# dont separate the dataframe, determine if predictors are continious or categroical and make a list from it. -## also use a separate function to determine if response has n.unique == 2 then boolean (use label encoder) -## loop through to find the datatype of each predictor create a list of the types. def column_sep(dataframe): - cat_df = pd.DataFrame() - cont_df = pd.DataFrame() + predictor_name = [] + predictor_type = [] for column in dataframe.columns: if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len( - pd.unique(dataframe[column])) == 2: - cat_df[column] = dataframe[column] + pd.unique(dataframe[column])) < 5: + predictor_name.append(column) + predictor_type.append("categorical") else: - cont_df[column] = dataframe[column] - - return cat_df, cont_df - - -cat_df, cont_df = column_sep(dataframe) - - -# dont separate the dataframe, determine if predictors are continious or categroical and make a list from it. -## also use a separate function to determine if response has n.unique == 2 then boolean (use label encoder) -## loop through to find the datatype of each predictor create a list of the types. 
-def column_sep(dataframe): - predictor_dict = {} - for column in dataframe.columns: - if dataframe[column].dtypes == "bool" or dataframe[column].dtypes == "object" or len( - pd.unique(dataframe[column])) == 2: - predictor_dict[column] = "categorical" - else: - predictor_dict[column] = "continuous" - - return predictor_dict - + predictor_name.append(column) + predictor_type.append("continuous") -predictors = column_sep(dataframe) + predictor_list = list(map(list, zip(predictor_name, predictor_type))) -for key, value in predictors.items(): - print + return predictor_list -## get datatype and dataname -predictors +predictor_list = column_sep(dataframe) -def type_chooser(predictors): - cont_cont_df = pd.DataFrame() - for key1 in range(len(predictors)): - for key2 in range(key1, len(predictors)): - if predictors.get(key1) == "continuous" and predictors.get(key2) == "continuous": - res = stats.pearsonr(dataframe[key1], dataframe[key2]) - cont_cont_df["Pearsons_R"] = res - elif predictors.get(key1) == "continuous" and predictors.get(key2) == "categorical": - res = stats.pearsonr(dataframe[key1], dataframe[key2]) - cont_cont_df["Pearsons_R"] = res - - -res = type_chooser(predictors) - -print(res) - -res = stats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4]) -res - -## for pearsons r continious v continious loop on columns for cont_df (double for loop) get pearson correlation. store in dataframe -## enumerate the index -## Continuous / Categorical pairs df.corr[] -# Categorical / Categorical pairs us cat_correlation - -## use for loop to determine the graphing types. for loop then if else to determine if response is boolean or not -## then correspond to correct correlation type. - - -# for predictor1 in cat df +def fill_na(data): + if isinstance(data, pd.Series): + return data.fillna(0) + else: + return numpy.array([value if value is not None else 0 for value in data]) +def cat_correlation(x, y, bias_correction=True, tschuprow=False): + """ + Calculates correlation statistic for categorical-categorical association. + The two measures supported are: + 1. Cramer'V ( default ) + 2. Tschuprow'T + + SOURCES: + 1.) CODE: https://github.com/MavericksDS/pycorr + 2.) Used logic from: + https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix + to ignore yates correction factor on 2x2 + 3.) 
Haven't validated Tschuprow + + Bias correction and formula's taken from : https://www.researchgate.net/publication/270277061_A_bias-correction_for_Cramer's_V_and_Tschuprow's_T + + Wikipedia for Cramer's V: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V + Wikipedia for Tschuprow' T: https://en.wikipedia.org/wiki/Tschuprow%27s_T + Parameters: + ----------- + x : list / ndarray / Pandas Series + A sequence of categorical measurements + y : list / NumPy ndarray / Pandas Series + A sequence of categorical measurements + bias_correction : Boolean, default = True + tschuprow : Boolean, default = False + For choosing Tschuprow as measure + Returns: + -------- + float in the range of [0,1] + """ + corr_coeff = numpy.nan + try: + x, y = fill_na(x), fill_na(y) + crosstab_matrix = pd.crosstab(x, y) + n_observations = crosstab_matrix.sum().sum() + + yates_correct = True + if bias_correction: + if crosstab_matrix.shape == (2, 2): + yates_correct = False + + chi2, _, _, _ = stats.chi2_contingency( + crosstab_matrix, correction=yates_correct + ) + phi2 = chi2 / n_observations + + # r and c are number of categories of x and y + r, c = crosstab_matrix.shape + if bias_correction: + phi2_corrected = max(0, phi2 - ((r - 1) * (c - 1)) / (n_observations - 1)) + r_corrected = r - ((r - 1) ** 2) / (n_observations - 1) + c_corrected = c - ((c - 1) ** 2) / (n_observations - 1) + if tschuprow: + corr_coeff = numpy.sqrt( + phi2_corrected / numpy.sqrt((r_corrected - 1) * (c_corrected - 1)) + ) + return corr_coeff + corr_coeff = numpy.sqrt( + phi2_corrected / min((r_corrected - 1), (c_corrected - 1)) + ) + return corr_coeff + if tschuprow: + corr_coeff = numpy.sqrt(phi2 / numpy.sqrt((r - 1) * (c - 1))) + return corr_coeff + corr_coeff = numpy.sqrt(phi2 / min((r - 1), (c - 1))) + return corr_coeff + except Exception as ex: + print(ex) + if tschuprow: + warnings.warn("Error calculating Tschuprow's T", RuntimeWarning) + else: + warnings.warn("Error calculating Cramer's V", RuntimeWarning) + return corr_coeff +def cat_cont_correlation_ratio(categories, values): + """ + Correlation Ratio: https://en.wikipedia.org/wiki/Correlation_ratio + SOURCE: + 1.) 
https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9 + :param categories: Numpy array of categories + :param values: Numpy array of values + :return: correlation + """ + f_cat, _ = pd.factorize(categories) + cat_num = numpy.max(f_cat) + 1 + y_avg_array = numpy.zeros(cat_num) + n_array = numpy.zeros(cat_num) + for i in range(0, cat_num): + cat_measures = values[numpy.argwhere(f_cat == i).flatten()] + n_array[i] = len(cat_measures) + y_avg_array[i] = numpy.average(cat_measures) + y_total_avg = numpy.sum(numpy.multiply(y_avg_array, n_array)) / numpy.sum(n_array) + numerator = numpy.sum( + numpy.multiply( + n_array, numpy.power(numpy.subtract(y_avg_array, y_total_avg), 2) + ) + ) + denominator = numpy.sum(numpy.power(numpy.subtract(values, y_total_avg), 2)) + if numerator == 0: + eta = 0.0 + else: + eta = numpy.sqrt(numerator / denominator) + return eta + + +def type_sep(predictor_list, dataframe): + cont_cont_preds1 = [] # done + cont_cont_preds2 = [] # done + cont_cont_pearsons = [] # done + cont_cont_html = [] # done + # cont_cont_preds1_values = [] #done + # cont_cont_preds2_values = [] #done + + cat_cont_preds1 = [] + cat_cont_preds2 = [] + cat_cont_html = [] + for reg_predictor, reg_type in predictor_list: + for sorted_pred, sorted_type in predictor_list[1:]: + if reg_type == "continuous" and sorted_type == "continuous": + pearsons_r = stats.pearsonr(dataframe[reg_predictor].values, dataframe[sorted_pred].values) + cont_cont_preds1.append(reg_predictor) + cont_cont_preds2.append(sorted_pred) + cont_cont_pearsons.append(pearsons_r[0]) + fig = px.scatter(dataframe, x=dataframe[reg_predictor].values, y=dataframe[sorted_pred].values, + trendline="ols") + fig.update_layout(title=f"chart{reg_predictor}_{sorted_pred}", + xaxis_title=f"Variable: {reg_predictor}", yaxis_title=f"Variable:{sorted_pred}") + html = "C:/Users/thoma\OneDrive/Documents/bda602/midterm/html_links/{0}_{1}_file.html".format( + reg_predictor, sorted_pred) + fig.write_html(html) + cont_cont_html.append(html) + + # elif reg_type == "continuous" and sorted_type == "categorical" or reg_type == "categorical" and sorted_type == "continuous": : + # cat_preds = np.array([sorted_pred]) + # cont_preds = np.array([reg_predictor]) + # cat_cont_array = np.concatenate((cat_preds,cont_preds)) + # cat_array = dataframe[sorted_pred].to_numpy() + # flattened_cat = cat_array.flatten() + + # cat_array = dataframe[sorted_pred].values.flatten() + # le = preprocessing.LabelEncoder() + # le.fit([dataframe[sorted_pred].flatten()]) + # classes = le.classes_ + # transformed_predictors = le.transform(classes) + fig1 = px.scatter(dataframe, x=dataframe[reg_predictor].values, y=dataframe[sorted_pred].values, + trendline="ols") + fig1.update_layout(title=f"chart{reg_predictor}_{sorted_pred}", + xaxis_title=f"Variable: {reg_predictor}", yaxis_title=f"Variable:{sorted_pred}") + html_cat_cont = "C:/Users/thoma\OneDrive/Documents/bda602/midterm/html_links/{0}_{1}_file.html".format( + reg_predictor, sorted_pred) + fig1.write_html(html_cat_cont) + cat_cont_html.append(html) + + # cat_cont_values = np.array([dataframe[reg_predictors],dataframe[sorted_pred]]) + # eta = cat_cont_correlation_ratio(cat_cont_categories,cat_cont_values) + + # else: + + # x = dataframe[sorted_pred] + # le = preprocessing.LabelEncoder() + # le.fit([cat_array]) + # classes = le.classes_ + # transformed_predictors = le.transform(classes) + # transformed_predictors + + cont_cont_df = pd.DataFrame( + {"Predictor 1": cont_cont_preds1, "Predictor 2": 
cont_cont_preds2, "Pearsons R": cont_cont_pearsons, + "HTML_LinregGraph": cont_cont_html}) + cont_cont_html = cont_cont_df.to_html() + text_file = open("C:/Users/thoma\OneDrive/Documents/bda602/midterm/html_links/test_dataframe.html", "w") + text_file.write(cont_cont_html) + text_file.close() + + return cont_cont_df + + +cont_cont_df = type_sep(predictor_list, dataframe) + + +data = cont_cont_df["Pearsons R"].values +heatmap = px.imshow(data,labels=dict(x=cont_cont_df["Predictor 1"], y=cont_cont_df["Predictor 2"])) +heatmap.show() + + +def main(): + + return + +if __name__ == "__main__": + sys.exit(main()) diff --git a/requirements.dev.in b/requirements.dev.in index a975411..bd93aea 100644 --- a/requirements.dev.in +++ b/requirements.dev.in @@ -1,8 +1,8 @@ -black==22.6.0 -detect-secrets==1.3.0 -flake8==4.0.1 -isort[requirements]==5.10.1 -jupyterlab -pip-tools -pre-commit==2.20.0 -nose +black==22.6.0 +detect-secrets==1.3.0 +flake8==4.0.1 +isort[requirements]==5.10.1 +jupyterlab +pip-tools +pre-commit==2.20.0 +nose diff --git a/requirements.dev.txt b/requirements.dev.txt index e05a14b..7a57d34 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=requirements.dev.txt requirements.dev.in # -anyio==3.6.1 +anyio==3.6.2 # via jupyter-server argon2-cffi==21.3.0 # via @@ -13,9 +13,9 @@ argon2-cffi==21.3.0 # notebook argon2-cffi-bindings==21.2.0 # via argon2-cffi -asttokens==2.0.5 +asttokens==2.1.0 # via stack-data -attrs==21.4.0 +attrs==22.1.0 # via jsonschema babel==2.10.3 # via jupyterlab-server @@ -27,21 +27,21 @@ black==22.6.0 # via -r requirements.dev.in bleach==5.0.1 # via nbconvert -build==0.8.0 +build==0.9.0 # via pip-tools -certifi==2022.6.15 +certifi==2022.9.24 # via requests cffi==1.15.1 # via argon2-cffi-bindings cfgv==3.3.1 # via pre-commit -charset-normalizer==2.1.0 +charset-normalizer==2.1.1 # via requests click==8.1.3 # via # black # pip-tools -debugpy==1.6.2 +debugpy==1.6.3 # via ipykernel decorator==5.1.1 # via ipython @@ -49,35 +49,35 @@ defusedxml==0.7.1 # via nbconvert detect-secrets==1.3.0 # via -r requirements.dev.in -distlib==0.3.5 +distlib==0.3.6 # via virtualenv entrypoints==0.4 - # via - # jupyter-client - # nbconvert -executing==0.9.1 + # via jupyter-client +executing==1.2.0 # via stack-data -fastjsonschema==2.16.1 +fastjsonschema==2.16.2 # via nbformat -filelock==3.7.1 +filelock==3.8.0 # via virtualenv flake8==4.0.1 # via -r requirements.dev.in -identify==2.5.2 +identify==2.5.8 # via pre-commit -idna==3.3 +idna==3.4 # via # anyio # requests -importlib-metadata==4.12.0 - # via jupyterlab-server -importlib-resources==5.9.0 +importlib-metadata==5.0.0 + # via + # jupyterlab-server + # nbconvert +importlib-resources==5.10.0 # via jsonschema -ipykernel==6.15.1 +ipykernel==6.16.2 # via # nbclassic # notebook -ipython==8.4.0 +ipython==8.5.0 # via # ipykernel # jupyterlab @@ -97,20 +97,20 @@ jinja2==3.1.2 # nbclassic # nbconvert # notebook -json5==0.9.8 +json5==0.9.10 # via jupyterlab-server -jsonschema==4.7.2 +jsonschema==4.16.0 # via # jupyterlab-server # nbformat -jupyter-client==7.3.4 +jupyter-client==7.4.4 # via # ipykernel # jupyter-server # nbclassic # nbclient # notebook -jupyter-core==4.11.1 +jupyter-core==4.11.2 # via # jupyter-client # jupyter-server @@ -119,49 +119,51 @@ jupyter-core==4.11.1 # nbconvert # nbformat # notebook -jupyter-server==1.18.1 +jupyter-server==1.21.0 # via # jupyterlab # jupyterlab-server # nbclassic # notebook-shim -jupyterlab==3.4.4 +jupyterlab==3.5.0 # via -r requirements.dev.in 
jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.15.0 +jupyterlab-server==2.16.1 # via jupyterlab markupsafe==2.1.1 # via # jinja2 # nbconvert -matplotlib-inline==0.1.3 +matplotlib-inline==0.1.6 # via # ipykernel # ipython mccabe==0.6.1 # via flake8 -mistune==0.8.4 +mistune==2.0.4 # via nbconvert mypy-extensions==0.4.3 # via black -nbclassic==0.4.3 - # via jupyterlab -nbclient==0.6.6 +nbclassic==0.4.5 + # via + # jupyterlab + # notebook +nbclient==0.7.0 # via nbconvert -nbconvert==6.5.0 +nbconvert==7.2.3 # via # jupyter-server # nbclassic # notebook -nbformat==5.4.0 +nbformat==5.7.0 # via # jupyter-server # nbclassic # nbclient # nbconvert # notebook -nest-asyncio==1.5.5 +nest-asyncio==1.5.6 # via # ipykernel # jupyter-client @@ -172,9 +174,9 @@ nodeenv==1.7.0 # via pre-commit nose==1.3.7 # via -r requirements.dev.in -notebook==6.4.12 +notebook==6.5.1 # via jupyterlab -notebook-shim==0.1.0 +notebook-shim==0.2.0 # via nbclassic packaging==21.3 # via @@ -188,30 +190,32 @@ pandocfilters==1.5.0 # via nbconvert parso==0.8.3 # via jedi -pathspec==0.9.0 +pathspec==0.10.1 # via black -pep517==0.12.0 +pep517==0.13.0 # via build pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.8.0 +pip-tools==6.9.0 # via -r requirements.dev.in +pkgutil-resolve-name==1.3.10 + # via jsonschema platformdirs==2.5.2 # via # black # virtualenv pre-commit==2.20.0 # via -r requirements.dev.in -prometheus-client==0.14.1 +prometheus-client==0.15.0 # via # jupyter-server # nbclassic # notebook -prompt-toolkit==3.0.30 +prompt-toolkit==3.0.31 # via ipython -psutil==5.9.1 +psutil==5.9.3 # via ipykernel ptyprocess==0.7.0 # via @@ -225,7 +229,7 @@ pycparser==2.21 # via cffi pyflakes==2.4.0 # via flake8 -pygments==2.12.0 +pygments==2.13.0 # via # ipython # nbconvert @@ -235,13 +239,13 @@ pyrsistent==0.18.1 # via jsonschema python-dateutil==2.8.2 # via jupyter-client -pytz==2022.1 +pytz==2022.5 # via babel pyyaml==6.0 # via # detect-secrets # pre-commit -pyzmq==23.2.0 +pyzmq==24.0.1 # via # ipykernel # jupyter-client @@ -262,18 +266,18 @@ six==1.16.0 # asttokens # bleach # python-dateutil -sniffio==1.2.0 +sniffio==1.3.0 # via anyio soupsieve==2.3.2.post1 # via beautifulsoup4 -stack-data==0.3.0 +stack-data==0.6.0 # via ipython -terminado==0.15.0 +terminado==0.17.0 # via # jupyter-server # nbclassic # notebook -tinycss2==1.1.1 +tinycss2==1.2.1 # via nbconvert toml==0.10.2 # via pre-commit @@ -281,6 +285,7 @@ tomli==2.0.1 # via # black # build + # jupyterlab # pep517 tornado==6.2 # via @@ -291,7 +296,7 @@ tornado==6.2 # nbclassic # notebook # terminado -traitlets==5.3.0 +traitlets==5.5.0 # via # ipykernel # ipython @@ -304,11 +309,11 @@ traitlets==5.3.0 # nbconvert # nbformat # notebook -typing-extensions==4.3.0 +typing-extensions==4.4.0 # via black -urllib3==1.26.11 +urllib3==1.26.12 # via requests -virtualenv==20.16.1 +virtualenv==20.16.6 # via pre-commit wcwidth==0.2.5 # via prompt-toolkit @@ -316,11 +321,11 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.3.3 +websocket-client==1.4.1 # via jupyter-server wheel==0.37.1 # via pip-tools -zipp==3.8.1 +zipp==3.10.0 # via # importlib-metadata # importlib-resources diff --git a/requirements.in b/requirements.in index 296d654..952e14d 100644 --- a/requirements.in +++ b/requirements.in @@ -1 +1,10 @@ -numpy \ No newline at end of file +mariadb +numpy +pandas +plotly +pyspark +pyspark-stubs +seaborn +scikit-learn +sqlalchemy +statsmodels \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 
b000696..5aada87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,83 @@ # # pip-compile --output-file=requirements.txt requirements.in # -numpy==1.23.1 +contourpy==1.0.5 + # via matplotlib +cycler==0.11.0 + # via matplotlib +fonttools==4.38.0 + # via matplotlib +greenlet==1.1.3.post0 + # via sqlalchemy +joblib==1.2.0 + # via scikit-learn +kiwisolver==1.4.4 + # via matplotlib +mariadb==1.1.4 # via -r requirements.in +matplotlib==3.6.0 + # via seaborn +numpy==1.23.4 + # via + # -r requirements.in + # contourpy + # matplotlib + # pandas + # patsy + # scikit-learn + # scipy + # seaborn + # statsmodels +packaging==21.3 + # via + # matplotlib + # statsmodels +pandas==1.5.1 + # via + # -r requirements.in + # seaborn + # statsmodels +patsy==0.5.3 + # via statsmodels +pillow==9.3.0 + # via matplotlib +plotly==5.11.0 + # via -r requirements.in +py4j==0.10.9 + # via pyspark +pyparsing==3.0.9 + # via + # matplotlib + # packaging +pyspark==3.0.3 + # via + # -r requirements.in + # pyspark-stubs +pyspark-stubs==3.0.0.post3 + # via -r requirements.in +python-dateutil==2.8.2 + # via + # matplotlib + # pandas +pytz==2022.5 + # via pandas +scikit-learn==1.1.3 + # via -r requirements.in +scipy==1.9.3 + # via + # scikit-learn + # statsmodels +seaborn==0.12.1 + # via -r requirements.in +six==1.16.0 + # via + # patsy + # python-dateutil +sqlalchemy==1.4.42 + # via -r requirements.in +statsmodels==0.13.2 + # via -r requirements.in +tenacity==8.1.0 + # via plotly +threadpoolctl==3.1.0 + # via scikit-learn
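The heatmap at the end of the second commit hands `px.imshow` the 1-D "Pearsons R" column plus raw Series objects as axis labels, so it cannot produce a predictor-by-predictor grid. A sketch of pivoting the pairwise results into a square matrix first, assuming the `cont_cont_df` columns ("Predictor 1", "Predictor 2", "Pearsons R") built by `type_sep` above:

# Sketch: pivot the pairwise Pearson results into a square matrix before plotting.
# Assumes cont_cont_df comes from type_sep above; the mirrored triangle and the
# unit diagonal are filled in here since type_sep only records each pair once.
corr_matrix = cont_cont_df.pivot(index="Predictor 1", columns="Predictor 2", values="Pearsons R")
corr_matrix = corr_matrix.combine_first(corr_matrix.T)  # mirror the recorded triangle
for col in corr_matrix.columns:
    if col in corr_matrix.index:
        corr_matrix.loc[col, col] = 1.0  # self-correlation on the diagonal

heatmap = px.imshow(
    corr_matrix,
    labels=dict(x="Predictor 2", y="Predictor 1", color="Pearson's r"),
    zmin=-1,
    zmax=1,
)
heatmap.show()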