From 4787f1b4268ff696a6b0ef5f54c943584cca19ea Mon Sep 17 00:00:00 2001 From: Brad Date: Tue, 4 May 2021 20:09:05 -0500 Subject: [PATCH 01/11] testing ray --- demo.py | 14 +++++++++----- memo/_runner.py | 18 ++++++++++++++++-- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/demo.py b/demo.py index ee8d9dc..e15c6e2 100644 --- a/demo.py +++ b/demo.py @@ -1,6 +1,6 @@ from memo import Runner import numpy as np - +import ray from memo import memlist, memfile, grid, time_taken data = [] @@ -18,13 +18,17 @@ def birthday_experiment(class_size, n_sim): return {"est_proba": proba} -for setting in grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000]): - birthday_experiment(**setting) +# for setting in grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000]): +# birthday_experiment(**setting) # To Run in parallel +ray.init(address='auto', _redis_password='5241590000000000') -settings = grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False) -runner = Runner(backend="threading", n_jobs=-1) +settings = list(grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False)) +runner = Runner(backend="ray", n_jobs=-1) runner.run(func=birthday_experiment, settings=settings) + +print(data) +print(len(data)) diff --git a/memo/_runner.py b/memo/_runner.py index 022a623..6b33515 100644 --- a/memo/_runner.py +++ b/memo/_runner.py @@ -17,8 +17,7 @@ class Runner: All keyword arguments during instantiaition will pass through to `parallel_backend`. More information on joblib can be found [here](https://joblib.readthedocs.io/en/latest/parallel.html). - Joblib can also attach to third party backends such as Ray or Apache spark, - however that functionality has not yet been tested. + Joblib can also attach to third party backends such as Ray or Apache spark. Usage: @@ -27,6 +26,11 @@ class Runner: runner = Runner(backend='threading', n_jobs=2) ``` + + With Ray Backend From the command line + ```shell + ray start --head --port=6379 + ``` """ def __init__( @@ -45,6 +49,16 @@ def _run(self, func: Callable, settings: Iterable[Dict]) -> None: """run the parallel backend Private. All arguments passed through run method """ + if self.backend == "ray": + try: + from ray.util.joblib import register_ray + register_ray() + except ImportError as e: + import sys + raise type(e)( + str(e) + "\nRay backend must be installed" + ).with_traceback(sys.exc_info()[2]) + try: with parallel_backend(*self.args, self.backend, self.n_jobs, **self.kwargs): Parallel(require="sharedmem")( From a5dd5f186405d2719afc09b9f1adb0a46315cb10 Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:32:19 -0500 Subject: [PATCH 02/11] Run local ray server --- .github/workflows/tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c51978b..a90e0db 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -26,6 +26,9 @@ jobs: - name: Lint with flake8 run: | flake8 + - name: Start local ray server + run: | + ray start --head --port=6379 --redis-password=5241590000000000 - name: Test with pytest run: | pytest From 353a3b33abe72529b8741cc14528dd612647dbb4 Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:33:17 -0500 Subject: [PATCH 03/11] Ignore ray deprication warnings --- pytest.ini | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..b0e5a94 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning \ No newline at end of file From 9e058cae30ea788fcf24d2c0596725fa52dc5347 Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:34:14 -0500 Subject: [PATCH 04/11] wrap grid output with list to avoid warnings during test --- docs/getting-started.md | 54 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/getting-started.md b/docs/getting-started.md index 86496ca..f0234c8 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,10 +1,10 @@ ## Base Scenario -Let's say you're running a simulation, or maybe a machine learning experiment. Then you -might have code that looks like this; +Let's say you're running a simulation, or maybe a machine learning experiment. Then you +might have code that looks like this; ```python -import numpy as np +import numpy as np def birthday_experiment(class_size, n_sim=10_000): """Simulates the birthday paradox. Vectorized = Fast!""" @@ -16,18 +16,18 @@ def birthday_experiment(class_size, n_sim=10_000): results = [birthday_experiment(s) for s in range(2, 40)] ``` -This example sort of works, but how would we now go about plotting our results? If you want +This example sort of works, but how would we now go about plotting our results? If you want to plot the effect of `class_size` and the simulated probability then it'd be do-able. But things -get tricky if you're also interested in seeing the effect of `n_sim` as well. The input of the -simulation isn't nicely captured together with the output of the simulation. +get tricky if you're also interested in seeing the effect of `n_sim` as well. The input of the +simulation isn't nicely captured together with the output of the simulation. ## Decorators The idea behind this library is that you can rewrite this function, only slightly, to make -all of this data collection a whole log simpler. +all of this data collection a whole log simpler. ```python -import numpy as np +import numpy as np from memo import memlist data = [] @@ -52,15 +52,15 @@ via `pd.DataFrame(data)`. ## Logging More The `memlist` decorate takes care of all data collection. It captures all keyword -arguments of the function as well as the dictionary output of the function. This -then is appended this to a list `data`. Especially when you're iteration on your -experiments this might turn out to be a lovely pattern. +arguments of the function as well as the dictionary output of the function. This +then is appended this to a list `data`. Especially when you're iteration on your +experiments this might turn out to be a lovely pattern. For example, suppose we also want to log how long the simulation takes; ```python -import time -import numpy as np +import time +import numpy as np from memo import memlist data = [] @@ -81,14 +81,14 @@ for size in range(2, 40): birthday_experiment(class_size=size, n_sim=n_sim) ``` -## Power +## Power -The real power of the library is that you can choose not only to log to -a list. You can just as easily write to a file too! +The real power of the library is that you can choose not only to log to +a list. You can just as easily write to a file too! ```python -import time -import numpy as np +import time +import numpy as np from memo import memlist, memfile data = [] @@ -110,15 +110,15 @@ for size in range(2, 40): birthday_experiment(class_size=size, n_sim=n_sim) ``` -## Utilities +## Utilities -The library also offers utilities to make the creation of these grids even easier. In particular; +The library also offers utilities to make the creation of these grids even easier. In particular; -- We supply a grid generation mechanism to prevent a lot of for-loops. +- We supply a grid generation mechanism to prevent a lot of for-loops. - We supply a `@capture_time` so that you don't need to write that logic yourself. ```python -import numpy as np +import numpy as np from memo import memlist, memfile, grid, time_taken data = [] @@ -138,10 +138,10 @@ for settings in grid(class_size=range(2, 40), n_sim=[1000, 10000, 100000]): birthday_experiment(**settings) ``` -## Parallel +## Parallel If you have a lot of simulations you'd like to run, it might be helpful to -run them in parallel. That's why this library also hosts a `Runner` class +run them in parallel. That's why this library also hosts a `Runner` class that can run your functions on multiple CPU cores. ```python @@ -162,7 +162,7 @@ def birthday_experiment(class_size, n_sim): proba = np.mean(n_uniq != class_size) return {"est_proba": proba} -settings = grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False) +settings = list(grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False)) # To Run in parallel runner = Runner(backend="threading", n_jobs=-1) @@ -173,10 +173,10 @@ runner.run(func=birthday_experiment, settings=settings) These decorators aren't performing magic, but my experience has been that these decorators make it more fun to actually log the results of experiments. -It's nice to be able to just add a decorator to a function and not have to +It's nice to be able to just add a decorator to a function and not have to worry about logging the statistics. -The library also offers extra features to make things a whole *log* simpler. +The library also offers extra features to make things a whole _log_ simpler. - `memweb` sends the json blobs to a server via http-post requests - `memfunc` sends the data to a callable that you supply, like `print` From 96454e0544db46157d3c436a1d50f0cee2c93aed Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:34:31 -0500 Subject: [PATCH 05/11] add ray to test_packages --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c6336a0..40ab8cd 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ "numpy>=1.19.4", "mktestdocs>=0.1.0", "tqdm>=4.54.0", + "ray>=1.3.0" ] + base_packages util_packages = [ From be63ac659629d33cee73e2bdf46c0bae70ff60c3 Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:35:07 -0500 Subject: [PATCH 06/11] Add support for ray backend --- memo/_runner.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/memo/_runner.py b/memo/_runner.py index 6b33515..29356fb 100644 --- a/memo/_runner.py +++ b/memo/_runner.py @@ -5,6 +5,7 @@ from rich.progress import Progress import time import warnings +from memo import NotInstalled class Runner: @@ -12,25 +13,35 @@ class Runner: Run functions in parallel with joblib. Arguments: - backend: choice of parallism backend, can be "loky", "multiprocessing" or "threading" + backend: choice of parallism backend, can be "loky", "multiprocessing", "threading", or "ray" n_jobs: degree of parallism, set to -1 to use all available cores All keyword arguments during instantiaition will pass through to `parallel_backend`. More information on joblib can be found [here](https://joblib.readthedocs.io/en/latest/parallel.html). - Joblib can also attach to third party backends such as Ray or Apache spark. + Joblib can also attach to third party backends such as [Ray](https://docs.ray.io/en/releases-1.3.0/) or Apache spark. Usage: + ```python from memo import Runner runner = Runner(backend='threading', n_jobs=2) ``` + With Ray Backend From the command line ```shell ray start --head --port=6379 ``` + + ```python + from memo import Runner + + runner = Runner(backend='ray', n_jobs=-1) + ``` + + """ def __init__( @@ -53,11 +64,8 @@ def _run(self, func: Callable, settings: Iterable[Dict]) -> None: try: from ray.util.joblib import register_ray register_ray() - except ImportError as e: - import sys - raise type(e)( - str(e) + "\nRay backend must be installed" - ).with_traceback(sys.exc_info()[2]) + except ImportError: + NotInstalled("ray", "dev") try: with parallel_backend(*self.args, self.backend, self.n_jobs, **self.kwargs): From 5344fdfeada0b3c0c270ae7db91bab60eb68bd41 Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:35:30 -0500 Subject: [PATCH 07/11] include ray in tests --- tests/test_runner.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_runner.py b/tests/test_runner.py index af30dd7..6d2a4d4 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -1,10 +1,12 @@ import pytest from memo import memlist, Runner, grid +import ray +ray.init(address='auto', _redis_password='5241590000000000') @pytest.mark.parametrize( "kw", - [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}], + [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}, {"backend": "ray"}], ) def test_base_multiple_calls(kw): data = [] @@ -21,7 +23,7 @@ def count_values(n_jobs=-1, **kwargs): @pytest.mark.parametrize( "kw", - [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}], + [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}, {"backend": "ray"}], ) def test_keys_included(kw): data = [] @@ -39,7 +41,7 @@ def count_values(**kwargs): @pytest.mark.parametrize( "kw", - [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}], + [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}, {"backend": "ray"}], ) def test_base_args_included(kw): data = [] @@ -62,7 +64,7 @@ def count_values(a, b, **kwargs): def test_raises_type_error(): data = [] - g = {"a": 3, "b": 4, "c": 5} + g = [{"a": 3, "b": 4, "c": 5}] with pytest.raises(TypeError): From 95a13a65f3f8b788f0e8f7df31eb68500256ea12 Mon Sep 17 00:00:00 2001 From: Brad Date: Wed, 5 May 2021 15:36:20 -0500 Subject: [PATCH 08/11] demo run parallel with ray --- demo.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/demo.py b/demo.py index e15c6e2..3847aa8 100644 --- a/demo.py +++ b/demo.py @@ -18,17 +18,15 @@ def birthday_experiment(class_size, n_sim): return {"est_proba": proba} -# for setting in grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000]): -# birthday_experiment(**setting) +for setting in grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000]): + birthday_experiment(**setting) # To Run in parallel +data = [] ray.init(address='auto', _redis_password='5241590000000000') settings = list(grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False)) runner = Runner(backend="ray", n_jobs=-1) runner.run(func=birthday_experiment, settings=settings) - -print(data) -print(len(data)) From c2a091386472d69dc9ce8b54e6b1c1434af70974 Mon Sep 17 00:00:00 2001 From: Brad Date: Thu, 6 May 2021 17:35:51 -0500 Subject: [PATCH 09/11] ray packages --- .github/workflows/tests.yml | 4 +--- setup.py | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a90e0db..e89018b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,12 +23,10 @@ jobs: run: | python -m pip install -e "." python -m pip install -e ".[test]" + python -m pip install -e ".[ray]" - name: Lint with flake8 run: | flake8 - - name: Start local ray server - run: | - ray start --head --port=6379 --redis-password=5241590000000000 - name: Test with pytest run: | pytest diff --git a/setup.py b/setup.py index 40ab8cd..1e7f4c3 100644 --- a/setup.py +++ b/setup.py @@ -7,8 +7,7 @@ "pytest>=4.0.2", "numpy>=1.19.4", "mktestdocs>=0.1.0", - "tqdm>=4.54.0", - "ray>=1.3.0" + "tqdm>=4.54.0" ] + base_packages util_packages = [ @@ -26,6 +25,8 @@ web_packages = ["httpx>=0.16.1"] + base_packages +ray_packages = ["ray>=1.3.0"] + base_packages + setup( name="memo", version="0.1.3", @@ -35,5 +36,6 @@ "web": web_packages, "test": test_packages, "dev": dev_packages, + "ray": ray_packages }, ) From 5be2031a76c421dd2851937acb6f06cd25c258a2 Mon Sep 17 00:00:00 2001 From: Brad Date: Thu, 6 May 2021 17:36:29 -0500 Subject: [PATCH 10/11] implicitly start ray server --- tests/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_runner.py b/tests/test_runner.py index 6d2a4d4..b189ac2 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -1,7 +1,7 @@ import pytest from memo import memlist, Runner, grid import ray -ray.init(address='auto', _redis_password='5241590000000000') +ray.init() @pytest.mark.parametrize( From 49051d56d28937f34ed7d816a1e7ec60fc49d47c Mon Sep 17 00:00:00 2001 From: Brad Date: Thu, 6 May 2021 17:37:34 -0500 Subject: [PATCH 11/11] change package dependency name --- memo/_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memo/_runner.py b/memo/_runner.py index 29356fb..87acf76 100644 --- a/memo/_runner.py +++ b/memo/_runner.py @@ -65,7 +65,7 @@ def _run(self, func: Callable, settings: Iterable[Dict]) -> None: from ray.util.joblib import register_ray register_ray() except ImportError: - NotInstalled("ray", "dev") + NotInstalled("ray", "ray") try: with parallel_backend(*self.args, self.backend, self.n_jobs, **self.kwargs):