diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c51978b..e89018b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,6 +23,7 @@ jobs: run: | python -m pip install -e "." python -m pip install -e ".[test]" + python -m pip install -e ".[ray]" - name: Lint with flake8 run: | flake8 diff --git a/demo.py b/demo.py index ee8d9dc..3847aa8 100644 --- a/demo.py +++ b/demo.py @@ -1,6 +1,6 @@ from memo import Runner import numpy as np - +import ray from memo import memlist, memfile, grid, time_taken data = [] @@ -24,7 +24,9 @@ def birthday_experiment(class_size, n_sim): # To Run in parallel +data = [] +ray.init(address='auto', _redis_password='5241590000000000') -settings = grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False) -runner = Runner(backend="threading", n_jobs=-1) +settings = list(grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False)) +runner = Runner(backend="ray", n_jobs=-1) runner.run(func=birthday_experiment, settings=settings) diff --git a/docs/getting-started.md b/docs/getting-started.md index 86496ca..f0234c8 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,10 +1,10 @@ ## Base Scenario -Let's say you're running a simulation, or maybe a machine learning experiment. Then you -might have code that looks like this; +Let's say you're running a simulation, or maybe a machine learning experiment. Then you +might have code that looks like this; ```python -import numpy as np +import numpy as np def birthday_experiment(class_size, n_sim=10_000): """Simulates the birthday paradox. Vectorized = Fast!""" @@ -16,18 +16,18 @@ def birthday_experiment(class_size, n_sim=10_000): results = [birthday_experiment(s) for s in range(2, 40)] ``` -This example sort of works, but how would we now go about plotting our results? If you want +This example sort of works, but how would we now go about plotting our results? 
If you want to plot the effect of `class_size` and the simulated probability then it'd be do-able. But things -get tricky if you're also interested in seeing the effect of `n_sim` as well. The input of the -simulation isn't nicely captured together with the output of the simulation. +get tricky if you're also interested in seeing the effect of `n_sim` as well. The input of the +simulation isn't nicely captured together with the output of the simulation. ## Decorators The idea behind this library is that you can rewrite this function, only slightly, to make -all of this data collection a whole log simpler. +all of this data collection a whole log simpler. ```python -import numpy as np +import numpy as np from memo import memlist data = [] @@ -52,15 +52,15 @@ via `pd.DataFrame(data)`. ## Logging More The `memlist` decorate takes care of all data collection. It captures all keyword -arguments of the function as well as the dictionary output of the function. This -then is appended this to a list `data`. Especially when you're iteration on your -experiments this might turn out to be a lovely pattern. +arguments of the function as well as the dictionary output of the function. This +then is appended this to a list `data`. Especially when you're iteration on your +experiments this might turn out to be a lovely pattern. For example, suppose we also want to log how long the simulation takes; ```python -import time -import numpy as np +import time +import numpy as np from memo import memlist data = [] @@ -81,14 +81,14 @@ for size in range(2, 40): birthday_experiment(class_size=size, n_sim=n_sim) ``` -## Power +## Power -The real power of the library is that you can choose not only to log to -a list. You can just as easily write to a file too! +The real power of the library is that you can choose not only to log to +a list. You can just as easily write to a file too! 
```python -import time -import numpy as np +import time +import numpy as np from memo import memlist, memfile data = [] @@ -110,15 +110,15 @@ for size in range(2, 40): birthday_experiment(class_size=size, n_sim=n_sim) ``` -## Utilities +## Utilities -The library also offers utilities to make the creation of these grids even easier. In particular; +The library also offers utilities to make the creation of these grids even easier. In particular; -- We supply a grid generation mechanism to prevent a lot of for-loops. +- We supply a grid generation mechanism to prevent a lot of for-loops. - We supply a `@capture_time` so that you don't need to write that logic yourself. ```python -import numpy as np +import numpy as np from memo import memlist, memfile, grid, time_taken data = [] @@ -138,10 +138,10 @@ for settings in grid(class_size=range(2, 40), n_sim=[1000, 10000, 100000]): birthday_experiment(**settings) ``` -## Parallel +## Parallel If you have a lot of simulations you'd like to run, it might be helpful to -run them in parallel. That's why this library also hosts a `Runner` class +run them in parallel. That's why this library also hosts a `Runner` class that can run your functions on multiple CPU cores. ```python @@ -162,7 +162,7 @@ def birthday_experiment(class_size, n_sim): proba = np.mean(n_uniq != class_size) return {"est_proba": proba} -settings = grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False) +settings = list(grid(class_size=range(20, 30), n_sim=[100, 10_000, 1_000_000], progbar=False)) # To Run in parallel runner = Runner(backend="threading", n_jobs=-1) @@ -173,10 +173,10 @@ runner.run(func=birthday_experiment, settings=settings) These decorators aren't performing magic, but my experience has been that these decorators make it more fun to actually log the results of experiments. 
-It's nice to be able to just add a decorator to a function and not have to +It's nice to be able to just add a decorator to a function and not have to  worry about logging the statistics. -The library also offers extra features to make things a whole *log* simpler. +The library also offers extra features to make things a whole _log_ simpler. - `memweb` sends the json blobs to a server via http-post requests - `memfunc` sends the data to a callable that you supply, like `print` diff --git a/memo/_runner.py b/memo/_runner.py index 022a623..87acf76 100644 --- a/memo/_runner.py +++ b/memo/_runner.py @@ -5,6 +5,7 @@ from rich.progress import Progress import time import warnings +from memo import NotInstalled class Runner: @@ -12,21 +13,35 @@ Run functions in parallel with joblib. Arguments: - backend: choice of parallism backend, can be "loky", "multiprocessing" or "threading" + backend: choice of parallelism backend, can be "loky", "multiprocessing", "threading", or "ray" n_jobs: degree of parallism, set to -1 to use all available cores All keyword arguments during instantiaition will pass through to `parallel_backend`. More information on joblib can be found [here](https://joblib.readthedocs.io/en/latest/parallel.html). - Joblib can also attach to third party backends such as Ray or Apache spark, - however that functionality has not yet been tested. + Joblib can also attach to third party backends such as [Ray](https://docs.ray.io/en/releases-1.3.0/) or Apache Spark. Usage: + ```python from memo import Runner runner = Runner(backend='threading', n_jobs=2) ``` + + + With the Ray backend, first start Ray from the command line: + ```shell + ray start --head --port=6379 + ``` + + ```python + from memo import Runner + + runner = Runner(backend='ray', n_jobs=-1) + ``` + + """ def __init__( @@ -45,6 +60,13 @@ def _run(self, func: Callable, settings: Iterable[Dict]) -> None: """run the parallel backend Private. 
All arguments passed through run method """ + if self.backend == "ray": + try: + from ray.util.joblib import register_ray + register_ray() + except ImportError: + raise NotInstalled("ray", "ray") + try: with parallel_backend(*self.args, self.backend, self.n_jobs, **self.kwargs): Parallel(require="sharedmem")( diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..b0e5a94 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning \ No newline at end of file diff --git a/setup.py b/setup.py index c6336a0..1e7f4c3 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ "pytest>=4.0.2", "numpy>=1.19.4", "mktestdocs>=0.1.0", - "tqdm>=4.54.0", + "tqdm>=4.54.0" ] + base_packages util_packages = [ @@ -25,6 +25,8 @@ web_packages = ["httpx>=0.16.1"] + base_packages +ray_packages = ["ray>=1.3.0"] + base_packages + setup( name="memo", version="0.1.3", @@ -34,5 +36,6 @@ "web": web_packages, "test": test_packages, "dev": dev_packages, + "ray": ray_packages }, ) diff --git a/tests/test_runner.py b/tests/test_runner.py index af30dd7..b189ac2 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -1,10 +1,12 @@ import pytest from memo import memlist, Runner, grid +import ray +ray.init() @pytest.mark.parametrize( "kw", - [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}], + [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}, {"backend": "ray"}], ) def test_base_multiple_calls(kw): data = [] @@ -21,7 +23,7 @@ def count_values(n_jobs=-1, **kwargs): @pytest.mark.parametrize( "kw", - [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}], + [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}, {"backend": "ray"}], ) def test_keys_included(kw): data = [] @@ -39,7 +41,7 @@ def count_values(**kwargs): @pytest.mark.parametrize( "kw", - [{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}], + 
[{"backend": "loky"}, {"backend": "threading"}, {"backend": "multiprocessing"}, {"backend": "ray"}], ) def test_base_args_included(kw): data = [] @@ -62,7 +64,7 @@ def count_values(a, b, **kwargs): def test_raises_type_error(): data = [] - g = {"a": 3, "b": 4, "c": 5} + g = [{"a": 3, "b": 4, "c": 5}] with pytest.raises(TypeError):