diff --git a/Assignment3/Assignment3.py b/Assignment3/Assignment3.py new file mode 100644 index 0000000..46f5156
--- /dev/null
+++ b/Assignment3/Assignment3.py
@@ -0,0 +1,137 @@
+import sys
+
+from pyspark import keyword_only
+from pyspark.ml import Pipeline, Transformer
+from pyspark.ml.param.shared import HasInputCols, HasOutputCol
+from pyspark.ml.regression import GeneralizedLinearRegression
+from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
+from pyspark.sql import SparkSession
+
+
+class transformer(
+    Transformer,
+    HasInputCols,
+    HasOutputCol,
+    DefaultParamsReadable,
+    DefaultParamsWritable,
+):
+    # Custom pipeline stage; currently a pass-through that only carries its
+    # inputCols/outputCol params.
+    @keyword_only
+    def __init__(self, inputCols=None, outputCol=None):
+        super(transformer, self).__init__()
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, inputCols=None, outputCol=None):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def _transform(self, dataset):
+        input_cols = self.getInputCols()  # noqa: F841
+        output_col = self.getOutputCol()  # noqa: F841
+        # A Transformer must return a DataFrame; dataset.show() returns None.
+        return dataset
+
+
+def main():
+    appName = "assignment3"
+    master = "local"
+    spark = (
+        SparkSession.builder.appName(appName)
+        .master(master)
+        .config(
+            "spark.jars",
+            "/Users/bitaetaati/PythonProjectTemplate/PythonProjectTemplate/mariadb-java-client-3.0.8.jar",
+        )
+        .getOrCreate()
+    )
+
+    # Shared MariaDB connection settings for both JDBC reads below
+    database = "baseball"
+    user = "bita"
+    password = ""
+    server = "localhost"
+    port = 3306
+    jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
+    jdbc_driver = "org.mariadb.jdbc.Driver"
+
+    sql1 = "select * from baseball.batter_counts"
+    df1 = (
+        spark.read.format("jdbc")
+        .option("url", jdbc_url)
+        .option("query", sql1)
+        .option("user", user)
+        .option("password", password)
+        .option("driver", jdbc_driver)
+        .load()
+    )
+
+    df1.show()
+    df1.printSchema()
+
+    sql2 = "select * from baseball.game"
+    df2 = (
+        spark.read.format("jdbc")
+        .option("url", jdbc_url)
+        .option("query", sql2)
+        .option("user", user)
+        .option("password", password)
+        .option("driver", jdbc_driver)
+        .load()
+    )
+
+    df2.show()
+    df2.printSchema()
+
+    df1.createOrReplaceTempView("batter_counts")
+    df2.createOrReplaceTempView("game")
+
+    # Rolling batting average over the previous 100 days, per batter and game
+    rolling_average = spark.sql(
+        """with t1 as
+        (select btc.batter, max(gm.local_date) as max_date, btc.game_id
+        from batter_counts btc
+        left join game gm on btc.game_id = gm.game_id
+        group by btc.batter, btc.game_id),
+        t2 as
+        (select btc.batter, sum(btc.hit) / sum(btc.atBat) as batting_average,
+        max(gm.local_date) as local_date,
+        max(case when btc.atBat = 0 then 'zero' end) as zero_at_bat
+        from batter_counts btc
+        left join game gm on btc.game_id = gm.game_id
+        group by btc.batter, btc.game_id)
+        select t1.batter, t1.game_id, avg(t2.batting_average) as batting_average
+        from t2
+        right join t1 on t2.batter = t1.batter
+        where t2.local_date > date_sub(t1.max_date, 100)
+        group by t1.batter, t1.game_id"""
+    )
+    rolling_average.show()
+    return
+
+    # Not reached while the return above is in place; fitting this pipeline
+    # would also require assembling a feature vector column for the
+    # GeneralizedLinearRegression stage (its featuresCol).
+    custom_transformer = transformer()
+
+    glr = GeneralizedLinearRegression(
+        family="gaussian",
+        link="identity",
+        maxIter=10,
+        regParam=0.3,
+        labelCol="batting_average",
+        predictionCol="pred",
+    )
+
+    pipeline = Pipeline(stages=[custom_transformer, glr])
+    model = pipeline.fit(rolling_average)
+    rolling_average = model.transform(rolling_average)
+    rolling_average.show()
+
+    return
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/README.md b/README.md index 3cd99cc..c31fa59 100644
--- a/README.md
+++ b/README.md
@@ -5,8 +5,8 @@
 - Setup a python 3.x venv (usually in `.venv`)
   - You can run `./scripts/create-venv.sh` to generate one
 - `pip3 install --upgrade pip`
-- Install pip-tools `pip3 install pip-tools`
-- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade`
+- Install pip-tools `pip3 install pip-tools`
+- Update dev requirements: `pip-compile --output-file=requirements.dev.txt requirements.dev.in --upgrade`
 - Update requirements: `pip-compile --output-file=requirements.txt requirements.in --upgrade`
 - Install dev requirements `pip3 install -r requirements.dev.txt`
 - Install requirements `pip3 install -r requirements.txt`
diff --git a/mariadb-java-client-3.0.8.jar b/mariadb-java-client-3.0.8.jar new file mode 100644 index 0000000..0c21e78
Binary files /dev/null and b/mariadb-java-client-3.0.8.jar differ
diff --git a/requirements.dev.txt b/requirements.dev.txt index e05a14b..83f5594 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,11 +1,15 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile --output-file=requirements.dev.txt requirements.dev.in
 #
 anyio==3.6.1
     # via jupyter-server
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
     # via
     #   jupyter-server
@@ -13,9 +17,9 @@ argon2-cffi==21.3.0
     #   notebook
 argon2-cffi-bindings==21.2.0
     # via argon2-cffi
-asttokens==2.0.5
+asttokens==2.0.8
     # via stack-data
-attrs==21.4.0
+attrs==22.1.0
     # via jsonschema
 babel==2.10.3
     # via jupyterlab-server
@@ -29,19 +33,19 @@ bleach==5.0.1
     # via nbconvert
 build==0.8.0
     # via pip-tools
-certifi==2022.6.15
+certifi==2022.9.24
     # via requests
 cffi==1.15.1
     # via argon2-cffi-bindings
 cfgv==3.3.1
     # via pre-commit
-charset-normalizer==2.1.0
+charset-normalizer==2.1.1
     # via requests
 click==8.1.3
     # via
     #   black
     #   pip-tools
-debugpy==1.6.2
+debugpy==1.6.3
     # via ipykernel
 decorator==5.1.1
     # via ipython
@@ -49,35 +53,29 @@ defusedxml==0.7.1
     # via nbconvert
 detect-secrets==1.3.0
     # via -r requirements.dev.in
-distlib==0.3.5
+distlib==0.3.6
     # via virtualenv
 entrypoints==0.4
-    # via
-    #   jupyter-client
-    #   nbconvert
-executing==0.9.1
+    # via jupyter-client
+executing==1.1.0
     # via stack-data
-fastjsonschema==2.16.1
+fastjsonschema==2.16.2
     # via nbformat
-filelock==3.7.1
+filelock==3.8.0
     # via virtualenv
 flake8==4.0.1
     # via -r requirements.dev.in
-identify==2.5.2
+identify==2.5.6
     # via pre-commit
-idna==3.3
+idna==3.4
     # via
     #   anyio
     #   requests
-importlib-metadata==4.12.0
-    # via jupyterlab-server
-importlib-resources==5.9.0
-    # via jsonschema
-ipykernel==6.15.1
+ipykernel==6.16.0
     # via
     #   nbclassic
     #   notebook
-ipython==8.4.0
+ipython==8.5.0
     # via
     #   ipykernel
     #   jupyterlab
@@ -97,13 +95,13 @@ jinja2==3.1.2
     #   nbclassic
     #   nbconvert
     #   notebook
-json5==0.9.8
+json5==0.9.10
     # via jupyterlab-server
-jsonschema==4.7.2
+jsonschema==4.16.0
     # via
     #   jupyterlab-server
     #   nbformat
-jupyter-client==7.3.4
+jupyter-client==7.3.5
     # via
     #   ipykernel
     #   jupyter-server
@@ -119,49 +117,49 @@ jupyter-core==4.11.1
     #   nbconvert
     #   nbformat
     #   notebook
-jupyter-server==1.18.1
+jupyter-server==1.19.1
     # via
     #   jupyterlab
     #   jupyterlab-server
     #   nbclassic
     #   notebook-shim
-jupyterlab==3.4.4
+jupyterlab==3.4.8
     # via -r requirements.dev.in
 jupyterlab-pygments==0.2.2
     # via nbconvert
-jupyterlab-server==2.15.0
+jupyterlab-server==2.15.2
     # via jupyterlab
 markupsafe==2.1.1
     # via
     #   jinja2
     #   nbconvert
-matplotlib-inline==0.1.3
+matplotlib-inline==0.1.6
     # via
     #   ipykernel
     #   ipython
 mccabe==0.6.1
     # via flake8
-mistune==0.8.4
+mistune==2.0.4
     # via nbconvert
 mypy-extensions==0.4.3
     # via black
-nbclassic==0.4.3
+nbclassic==0.4.4
     # via jupyterlab
-nbclient==0.6.6
+nbclient==0.6.8
     # via nbconvert
-nbconvert==6.5.0
+nbconvert==7.1.0
     # via
     #   jupyter-server
     #   nbclassic
     #   notebook
-nbformat==5.4.0
+nbformat==5.6.1
     # via
     #   jupyter-server
     #   nbclassic
     #   nbclient
     #   nbconvert
     #   notebook
-nest-asyncio==1.5.5
+nest-asyncio==1.5.6
     # via
     #   ipykernel
     #   jupyter-client
@@ -188,9 +186,9 @@ pandocfilters==1.5.0
     # via nbconvert
 parso==0.8.3
     # via jedi
-pathspec==0.9.0
+pathspec==0.10.1
     # via black
-pep517==0.12.0
+pep517==0.13.0
     # via build
 pexpect==4.8.0
     # via ipython
@@ -209,9 +207,9 @@ prometheus-client==0.14.1
     #   jupyter-server
     #   nbclassic
     #   notebook
-prompt-toolkit==3.0.30
+prompt-toolkit==3.0.31
     # via ipython
-psutil==5.9.1
+psutil==5.9.2
     # via ipykernel
 ptyprocess==0.7.0
     # via
@@ -225,7 +223,7 @@ pycparser==2.21
     # via cffi
 pyflakes==2.4.0
     # via flake8
-pygments==2.12.0
+pygments==2.13.0
     # via
     #   ipython
     #   nbconvert
@@ -235,13 +233,13 @@ pyrsistent==0.18.1
     # via jsonschema
 python-dateutil==2.8.2
     # via jupyter-client
-pytz==2022.1
+pytz==2022.4
     # via babel
 pyyaml==6.0
     # via
     #   detect-secrets
     #   pre-commit
-pyzmq==23.2.0
+pyzmq==24.0.1
     # via
     #   ipykernel
     #   jupyter-client
@@ -262,13 +260,13 @@ six==1.16.0
     #   asttokens
     #   bleach
     #   python-dateutil
-sniffio==1.2.0
+sniffio==1.3.0
     # via anyio
 soupsieve==2.3.2.post1
     # via beautifulsoup4
-stack-data==0.3.0
+stack-data==0.5.1
     # via ipython
-terminado==0.15.0
+terminado==0.16.0
     # via
     #   jupyter-server
     #   nbclassic
@@ -281,6 +279,7 @@ tomli==2.0.1
     # via
     #   black
     #   build
+    #   jupyterlab
     #   pep517
 tornado==6.2
     # via
@@ -291,7 +290,7 @@ tornado==6.2
     #   nbclassic
     #   notebook
     #   terminado
-traitlets==5.3.0
+traitlets==5.4.0
     # via
     #   ipykernel
     #   ipython
@@ -304,11 +303,9 @@ traitlets==5.3.0
     #   nbconvert
     #   nbformat
     #   notebook
-typing-extensions==4.3.0
-    # via black
-urllib3==1.26.11
+urllib3==1.26.12
     # via requests
-virtualenv==20.16.1
+virtualenv==20.16.5
     # via pre-commit
 wcwidth==0.2.5
     # via prompt-toolkit
@@ -316,14 +313,10 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.3.3
+websocket-client==1.4.1
     # via jupyter-server
 wheel==0.37.1
     # via pip-tools
-zipp==3.8.1
-    # via
-    #   importlib-metadata
-    #   importlib-resources
 
 # The following packages are considered to be unsafe in a requirements file:
 # pip
diff --git a/requirements.in b/requirements.in index 296d654..f5cccfc 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1 +1,26 @@
-numpy
\ No newline at end of file
+fastapi
+flake8==5.0.4
+flask
+graphviz
+gunicorn
+isort[requirements]
+netcal
+nose
+numpy
+pandas
+pip-tools
+pre-commit
+plotly
+pydot
+pygam
+pyspark
+pyspark-stubs
+requests
+scikit-learn
+seaborn
+statsmodels
+sqlalchemy
+sympy
+uvicorn
+wheel
+MariaDB
diff --git a/requirements.txt b/requirements.txt index b000696..05da548 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,285 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile --output-file=requirements.txt requirements.in
 #
-numpy==1.23.1
+absl-py==1.2.0
+    # via tensorboard
+anyio==3.6.1
+    # via starlette
+build==0.8.0
+    # via pip-tools
+cachetools==5.2.0
+    # via google-auth
+certifi==2022.9.24
+    # via requests
+cfgv==3.3.1
+    # via pre-commit
+charset-normalizer==2.1.1
+    # via requests
+click==8.1.3
+    # via
+    #   flask
+    #   pip-tools
+    #   uvicorn
+contourpy==1.0.5
+    # via matplotlib
+cycler==0.11.0
+    # via matplotlib
+distlib==0.3.6
+    # via virtualenv
+fastapi==0.85.0
     # via -r requirements.in
+filelock==3.8.0
+    # via virtualenv
+flake8==5.0.4
+    # via -r requirements.in
+flask==2.2.2
+    # via -r requirements.in
+fonttools==4.37.4
+    # via matplotlib
+future==0.18.2
+    # via pygam
+google-auth==2.12.0
+    # via
+    #   google-auth-oauthlib
+    #   tensorboard
+google-auth-oauthlib==0.4.6
+    # via tensorboard
+graphviz==0.20.1
+    # via -r requirements.in
+grpcio==1.49.1
+    # via tensorboard
+gunicorn==20.1.0
+    # via -r requirements.in
+h11==0.14.0
+    # via uvicorn
+identify==2.5.6
+    # via pre-commit
+idna==3.4
+    # via
+    #   anyio
+    #   requests
+isort[requirements]==5.10.1
+    # via -r requirements.in
+itsdangerous==2.1.2
+    # via flask
+jinja2==3.1.2
+    # via flask
+joblib==1.2.0
+    # via scikit-learn
+kiwisolver==1.4.4
+    # via matplotlib
+mariadb==1.1.4
+    # via -r requirements.in
+markdown==3.4.1
+    # via tensorboard
+markupsafe==2.1.1
+    # via
+    #   jinja2
+    #   werkzeug
+matplotlib==3.6.0
+    # via
+    #   netcal
+    #   seaborn
+    #   tikzplotlib
+mccabe==0.7.0
+    # via flake8
+mpmath==1.2.1
+    # via sympy
+netcal==1.2.1
+    # via -r requirements.in
+nodeenv==1.7.0
+    # via pre-commit
+nose==1.3.7
+    # via -r requirements.in
+numpy==1.23.3
+    # via
+    #   -r requirements.in
+    #   contourpy
+    #   matplotlib
+    #   netcal
+    #   opt-einsum
+    #   pandas
+    #   patsy
+    #   pygam
+    #   pyro-ppl
+    #   scikit-learn
+    #   scipy
+    #   seaborn
+    #   statsmodels
+    #   tensorboard
+    #   tikzplotlib
+    #   torchvision
+oauthlib==3.2.1
+    # via requests-oauthlib
+opt-einsum==3.3.0
+    # via pyro-ppl
+packaging==21.3
+    # via
+    #   build
+    #   matplotlib
+    #   statsmodels
+pandas==1.5.0
+    # via
+    #   -r requirements.in
+    #   seaborn
+    #   statsmodels
+patsy==0.5.2
+    # via statsmodels
+pep517==0.13.0
+    # via build
+pillow==9.2.0
+    # via
+    #   matplotlib
+    #   tikzplotlib
+    #   torchvision
+pip-tools==6.8.0
+    # via -r requirements.in
+platformdirs==2.5.2
+    # via virtualenv
+plotly==5.10.0
+    # via -r requirements.in
+pre-commit==2.20.0
+    # via -r requirements.in
+progressbar2==4.0.0
+    # via pygam
+protobuf==3.19.6
+    # via tensorboard
+py4j==0.10.9
+    # via pyspark
+pyasn1==0.4.8
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.2.8
+    # via google-auth
+pycodestyle==2.9.1
+    # via flake8
+pydantic==1.10.2
+    # via fastapi
+pydot==1.4.2
+    # via -r requirements.in
+pyflakes==2.5.0
+    # via flake8
+pygam==0.8.0
+    # via -r requirements.in
+pyparsing==3.0.9
+    # via
+    #   matplotlib
+    #   packaging
+    #   pydot
+pyro-api==0.1.2
+    # via pyro-ppl
+pyro-ppl==1.8.2
+    # via netcal
+pyspark==3.0.3
+    # via
+    #   -r requirements.in
+    #   pyspark-stubs
+pyspark-stubs==3.0.0.post3
+    # via -r requirements.in
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   pandas
+python-utils==3.3.3
+    # via progressbar2
+pytz==2022.4
+    # via pandas
+pyyaml==6.0
+    # via pre-commit
+requests==2.28.1
+    # via
+    #   -r requirements.in
+    #   requests-oauthlib
+    #   tensorboard
+    #   torchvision
+requests-oauthlib==1.3.1
+    # via google-auth-oauthlib
+rsa==4.9
+    # via google-auth
+scikit-learn==1.1.2
+    # via
+    #   -r requirements.in
+    #   netcal
+scipy==1.9.1
+    # via
+    #   netcal
+    #   pygam
+    #   scikit-learn
+    #   statsmodels
+seaborn==0.12.0
+    # via -r requirements.in
+six==1.16.0
+    # via
+    #   google-auth
+    #   grpcio
+    #   patsy
+    #   python-dateutil
+sniffio==1.3.0
+    # via anyio
+sqlalchemy==1.4.41
+    # via -r requirements.in
+starlette==0.20.4
+    # via fastapi
+statsmodels==0.13.2
+    # via -r requirements.in
+sympy==1.11.1
+    # via -r requirements.in
+tenacity==8.1.0
+    # via plotly
+tensorboard==2.10.1
+    # via netcal
+tensorboard-data-server==0.6.1
+    # via tensorboard
+tensorboard-plugin-wit==1.8.1
+    # via tensorboard
+threadpoolctl==3.1.0
+    # via scikit-learn
+tikzplotlib==0.10.1
+    # via netcal
+toml==0.10.2
+    # via pre-commit
+tomli==2.0.1
+    # via
+    #   build
+    #   pep517
+torch==1.12.1
+    # via
+    #   netcal
+    #   pyro-ppl
+    #   torchvision
+torchvision==0.13.1
+    # via netcal
+tqdm==4.64.1
+    # via
+    #   netcal
+    #   pyro-ppl
+typing-extensions==4.3.0
+    # via
+    #   pydantic
+    #   torch
+    #   torchvision
+urllib3==1.26.12
+    # via requests
+uvicorn==0.18.3
+    # via -r requirements.in
+virtualenv==20.16.5
+    # via pre-commit
+webcolors==1.12
+    # via tikzplotlib
+werkzeug==2.2.2
+    # via
+    #   flask
+    #   tensorboard
+wheel==0.37.1
+    # via
+    #   -r requirements.in
+    #   pip-tools
+    #   tensorboard
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools