From 62226cb2f9a925b8f6adbcf6be4626718ece8599 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Thu, 31 May 2018 20:16:17 -0700 Subject: [PATCH 01/34] add and tested local singularity mode --- doodad/launch_tools.py | 14 +++--- doodad/mode.py | 104 ++++++++++++++++++++++++++++++++++++++--- doodad/mount.py | 2 +- 3 files changed, 105 insertions(+), 15 deletions(-) diff --git a/doodad/launch_tools.py b/doodad/launch_tools.py index c5e1e07..db5d19d 100644 --- a/doodad/launch_tools.py +++ b/doodad/launch_tools.py @@ -22,13 +22,11 @@ def launch_python( mode=LOCAL, mount_points=None, args=None, - env=None, - dry=False, fake_display=False, target_mount_dir='target', - verbose=False, use_cloudpickle=False, target_mount=None, + **launch_command_kwargs ): """ @@ -37,7 +35,6 @@ def launch_python( :param mode: :param mount_points: :param args: - :param env: :param dry: :param fake_display: :param target_mount_dir: @@ -62,7 +59,7 @@ def launch_python( else: target_mount = MountLocal(local_dir=target_dir, mount_point=target_mount_dir) mount_points = mount_points + [target_mount] - target_full_path = os.path.join(target_mount.docker_mount_dir(), os.path.basename(target)) + target_full_path = os.path.join(target_mount.mount_dir(), os.path.basename(target)) command = make_python_command( target_full_path, @@ -71,7 +68,8 @@ def launch_python( fake_display=fake_display, use_cloudpickle=use_cloudpickle, ) - mode.launch_command(command, mount_points=mount_points, dry=dry, verbose=verbose) + mode.launch_command(command, mount_points=mount_points, + **launch_command_kwargs) return target_mount HEADLESS = 'xvfb-run -a -s "-ac -screen 0 1400x900x24 +extension RANDR"' @@ -90,8 +88,8 @@ def make_python_command( args_encoded, cp_version = encode_args(args, cloudpickle=use_cloudpickle) if args: - cmd = '%s=%s %s=%s %s=%s %s' % (ARGS_DATA, args_encoded, - USE_CLOUDPICKLE, str(int(use_cloudpickle)), + cmd = '%s=%s %s=%s %s=%s %s' % (ARGS_DATA, args_encoded, + USE_CLOUDPICKLE, str(int(use_cloudpickle)), CLOUDPICKLE_VERSION, cp_version, cmd) diff --git a/doodad/mode.py b/doodad/mode.py index e729da5..10eea07 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -5,23 +5,21 @@ import time import base64 -from doodad.ec2.autoconfig import AUTOCONFIG - try: from StringIO import StringIO except ImportError: from io import StringIO -from .mount import * +from .mount import MountLocal, MountS3 from .utils import hash_file, call_and_wait, CommandBuilder from .ec2.aws_util import s3_upload, s3_exists + class LaunchMode(object): def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): raise NotImplementedError() - class Local(LaunchMode): def __init__(self): super(Local, self).__init__() @@ -131,7 +129,7 @@ def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): for mount in mount_points: if isinstance(mount, MountLocal): #mount_pnt = os.path.expanduser(mount.mount_point) - mount_pnt = mount.docker_mount_dir() + mount_pnt = mount.mount_dir() mnt_args += ' -v %s:%s' % (mount.local_dir, mount_pnt) call_and_wait('mkdir -p %s' % mount.local_dir) if mount.pythonpath: @@ -181,7 +179,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): remote_cmds.append('mkdir -p %s' % remote_mnt_dir) unzip_cmd = 'tar -xf %s -C %s' % (remote_tar, remote_mnt_dir) remote_cmds.append(unzip_cmd) - mount_point = mount.docker_mount_dir() + mount_point = mount.mount_dir() mnt_args += ' -v %s:%s' % (os.path.join(remote_mnt_dir, os.path.basename(mount.mount_point)) ,mount_point) else: #remote_cmds.append('mkdir -p %s' % mount.mount_point) @@ -593,3 +591,97 @@ def __init__(self): super(CodalabDocker, self).__init__() raise NotImplementedError() + +class SingularityMode(LaunchMode): + def __init__(self, image, gpu=False): + super(SingularityMode, self).__init__() + self.singularity_image = image + self.gpu = gpu + + def get_singularity_cmd( + self, + main_cmd, + extra_args='', + verbose=True, + pythonpath=None, + pre_cmd=None, + post_cmd=None, + ): + cmd_list= CommandBuilder() + if pre_cmd: + cmd_list.extend(pre_cmd) + + if verbose: + if self.gpu: + cmd_list.append('echo \"Running in singularity (gpu)\"') + else: + cmd_list.append('echo \"Running in singularity\"') + if pythonpath: + cmd_list.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(pythonpath))) + + cmd_list.append('export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/vitchyr/.mujoco/mjpro150/bin') + cmd_list.append(main_cmd) + if post_cmd: + cmd_list.extend(post_cmd) + + if self.gpu: + extra_args += ' --nv ' + singularity_prefix = 'singularity exec %s %s /bin/bash -c ' % ( + extra_args, + self.singularity_image, + ) + main_cmd = cmd_list.to_string() + full_cmd = singularity_prefix + ("\'%s\'" % main_cmd) + return full_cmd + + +class LocalSingularity(SingularityMode): + def get_mount_info(self, mount_points): + mnt_args = '' + py_path = [] + for mount in mount_points: + if isinstance(mount, MountLocal): + mount_pnt = mount.mount_dir() + mnt_args += ' -B %s:%s' % (mount.local_dir, mount_pnt) + call_and_wait('mkdir -p %s' % mount.local_dir) + if mount.pythonpath: + py_path.append(mount_pnt) + else: + raise NotImplementedError(type(mount)) + return mnt_args, py_path + + + def launch_command(self, cmd, mount_points=None, dry=False, + verbose=False, pre_cmd=None, post_cmd=None): + mnt_args, py_path = self.get_mount_info(mount_points) + full_cmd = self.get_singularity_cmd( + cmd, + extra_args=mnt_args, + pythonpath=py_path, + pre_cmd=pre_cmd, + post_cmd=post_cmd, + verbose=verbose, + ) + if verbose: + print(full_cmd) + call_and_wait(full_cmd, dry=dry) + + +class SlurmSingularity(LocalSingularity): + def launch_command(self, cmd, mount_points=None, dry=False, + verbose=False, pre_cmd=None, post_cmd=None): + mnt_args, py_path = self.get_mount_info(mount_points) + singularity_cmd = self.get_singularity_cmd( + cmd, + extra_args=mnt_args, + pythonpath=py_path, + pre_cmd=pre_cmd, + post_cmd=post_cmd, + verbose=verbose, + ) + full_cmd = "sbatch -A fc_rail -p savio -t 5 {}".format( + singularity_cmd + ) + if verbose: + print(full_cmd) + call_and_wait(full_cmd, dry=dry) diff --git a/doodad/mount.py b/doodad/mount.py index c1eab15..b686ee3 100644 --- a/doodad/mount.py +++ b/doodad/mount.py @@ -77,7 +77,7 @@ def filter_func(tar_info): def __str__(self): return 'MountLocal@%s'%self.local_dir - def docker_mount_dir(self): + def mount_dir(self): return os.path.join('/mounts', self.mount_point.replace('~/','')) From e8f4a5cc1bdcccc11458601eb30ae2d40e421e7f Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Thu, 31 May 2018 21:24:01 -0700 Subject: [PATCH 02/34] fix slurm singularity and use local files since singularity sees host files --- doodad/launch_tools.py | 5 ++++- doodad/mode.py | 27 ++++++++++++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/doodad/launch_tools.py b/doodad/launch_tools.py index db5d19d..292f630 100644 --- a/doodad/launch_tools.py +++ b/doodad/launch_tools.py @@ -26,6 +26,7 @@ def launch_python( target_mount_dir='target', use_cloudpickle=False, target_mount=None, + launch_locally=None, **launch_command_kwargs ): """ @@ -47,6 +48,8 @@ def launch_python( args = {} if mount_points is None: mount_points = [] + if launch_locally is None: + launch_locally = isinstance(mode, Local) if target_mount is None: # mount @@ -54,7 +57,7 @@ def launch_python( if not target_mount_dir: target_mount_dir = target_dir target_mount_dir = os.path.join(target_mount_dir, os.path.basename(target_dir)) - if isinstance(mode, Local): + if launch_locally: target_mount = MountLocal(local_dir=target_dir) else: target_mount = MountLocal(local_dir=target_dir, mount_point=target_mount_dir) diff --git a/doodad/mode.py b/doodad/mode.py index 10eea07..9ca331f 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -636,27 +636,18 @@ def get_singularity_cmd( class LocalSingularity(SingularityMode): - def get_mount_info(self, mount_points): - mnt_args = '' + def launch_command(self, cmd, mount_points=None, dry=False, + verbose=False, pre_cmd=None, post_cmd=None): py_path = [] for mount in mount_points: if isinstance(mount, MountLocal): - mount_pnt = mount.mount_dir() - mnt_args += ' -B %s:%s' % (mount.local_dir, mount_pnt) - call_and_wait('mkdir -p %s' % mount.local_dir) if mount.pythonpath: - py_path.append(mount_pnt) + py_path.append(mount.local_dir) else: raise NotImplementedError(type(mount)) - return mnt_args, py_path - - def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, pre_cmd=None, post_cmd=None): - mnt_args, py_path = self.get_mount_info(mount_points) full_cmd = self.get_singularity_cmd( cmd, - extra_args=mnt_args, pythonpath=py_path, pre_cmd=pre_cmd, post_cmd=post_cmd, @@ -670,16 +661,22 @@ def launch_command(self, cmd, mount_points=None, dry=False, class SlurmSingularity(LocalSingularity): def launch_command(self, cmd, mount_points=None, dry=False, verbose=False, pre_cmd=None, post_cmd=None): - mnt_args, py_path = self.get_mount_info(mount_points) + py_path = [] + for mount in mount_points: + if isinstance(mount, MountLocal): + if mount.pythonpath: + py_path.append(mount.local_dir) + else: + raise NotImplementedError(type(mount)) + singularity_cmd = self.get_singularity_cmd( cmd, - extra_args=mnt_args, pythonpath=py_path, pre_cmd=pre_cmd, post_cmd=post_cmd, verbose=verbose, ) - full_cmd = "sbatch -A fc_rail -p savio -t 5 {}".format( + full_cmd = "sbatch -A fc_rail -p savio2_htc -t 5 {}".format( singularity_cmd ) if verbose: From e629f9a0cd7902400e2e06e954f53708f7a30637 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Thu, 31 May 2018 21:46:47 -0700 Subject: [PATCH 03/34] remove hard-coded debugging line --- doodad/mode.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doodad/mode.py b/doodad/mode.py index 9ca331f..894ee92 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -619,7 +619,6 @@ def get_singularity_cmd( if pythonpath: cmd_list.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(pythonpath))) - cmd_list.append('export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/vitchyr/.mujoco/mjpro150/bin') cmd_list.append(main_cmd) if post_cmd: cmd_list.extend(post_cmd) From a7f2b01835f0267acb64311bb0eca732822affad Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Thu, 31 May 2018 22:45:28 -0700 Subject: [PATCH 04/34] add configuration options for sbatch --- doodad/mode.py | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 894ee92..8e88aa4 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -658,8 +658,27 @@ def launch_command(self, cmd, mount_points=None, dry=False, class SlurmSingularity(LocalSingularity): + # TODO: set up an auto-config + def __init__( + self, image, account_name, partition, time_in_mins, + qos=None, + nodes=1, + n_tasks=1, + n_gpus=1, + **kwargs + ): + super(SlurmSingularity, self).__init__(image, **kwargs) + self.account_name = account_name + self.partition = partition + self.time_in_mins = time_in_mins + self.nodes = nodes + self.n_tasks = n_tasks + self.n_gpus = n_gpus + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False, pre_cmd=None, post_cmd=None): + if pre_cmd is None: + pre_cmd = [] py_path = [] for mount in mount_points: if isinstance(mount, MountLocal): @@ -675,9 +694,28 @@ def launch_command(self, cmd, mount_points=None, dry=False, post_cmd=post_cmd, verbose=verbose, ) - full_cmd = "sbatch -A fc_rail -p savio2_htc -t 5 {}".format( - singularity_cmd - ) + if self.gpu: + full_cmd = ( + "sbatch -A {account_name} -p {partition} -t {time}" + " -N {nodes} -n {n_tasks} --cpus-per-task={cpus_per_task}" + " --gres=gpu:{n_gpus} {cmd}".format( + account_name=self.account_name, + partition=self.partition, + time=self.time_in_mins, + nodes=self.nodes, + n_tasks=self.n_tasks, + cpus_per_task=2*self.n_gpus, + n_gpus=self.n_gpus, + cmd=singularity_cmd, + ) + ) + else: + full_cmd = "sbatch -A {account_name} -p {partition} -t {time} {cmd}".format( + account_name=self.account_name, + partition=self.partition, + time=self.time_in_mins, + cmd=singularity_cmd, + ) if verbose: print(full_cmd) call_and_wait(full_cmd, dry=dry) From b684ba0d291600fca87b0a76a792ba6d7cd690e1 Mon Sep 17 00:00:00 2001 From: Justin Fu Date: Tue, 5 Jun 2018 09:59:24 -0700 Subject: [PATCH 05/34] Update LICENSE --- LICENSE | 695 ++------------------------------------------------------ 1 file changed, 21 insertions(+), 674 deletions(-) diff --git a/LICENSE b/LICENSE index 9cecc1d..6ec89f7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,21 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - {one line to give the program's name and a brief idea of what it does.} - Copyright (C) {year} {name of author} - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - {project} Copyright (C) {year} {fullname} - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. +MIT License + +Copyright (c) 2018 Justin Fu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 4b5d101e899cdd76b6825c92c7d358791f8bc2d1 Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Mon, 25 Jun 2018 12:25:35 -0700 Subject: [PATCH 06/34] Support for launching multiple containers per ec2 instance --- doodad/mode.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 8e88aa4..7c253ee 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -74,7 +74,7 @@ def __init__(self, image='ubuntu:16.04', gpu=False): self.gpu = gpu def get_docker_cmd(self, main_cmd, extra_args='', use_tty=True, verbose=True, pythonpath=None, pre_cmd=None, post_cmd=None, - checkpoint=False, no_root=False): + checkpoint=False, no_root=False, use_docker_generated_name=False): cmd_list= CommandBuilder() if pre_cmd: cmd_list.extend(pre_cmd) @@ -99,7 +99,7 @@ def get_docker_cmd(self, main_cmd, extra_args='', use_tty=True, verbose=True, py cmd_list.extend(post_cmd) docker_name = self.docker_name - if docker_name: + if docker_name and not use_docker_generated_name: extra_args += ' --name %s '%docker_name if checkpoint: @@ -284,7 +284,7 @@ def s3_upload(self, file_name, bucket, remote_filename=None, dry=False, check_ex def make_timekey(self): return '%d'%(int(time.time()*1000)) - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, num_exps=1): default_config = dict( image_id=self.image_id, instance_type=self.instance_type, @@ -446,7 +446,10 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): if self.checkpoint and self.checkpoint.restore: raise NotImplementedError() else: - docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) + docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True) + assert num_exps > 0 + for _ in range(num_exps - 1): + sio.write(docker_cmd+' &\n') sio.write(docker_cmd+'\n') # Sync all output mounts to s3 after running the user script @@ -498,7 +501,6 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): print(full_script) with open("/tmp/full_ec2_script", "w") as f: f.write(full_script) - instance_args = dict( ImageId=aws_config["image_id"], KeyName=aws_config["key_name"], From d8075192efb23fafd51cc6093e668e7295384fbb Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Mon, 25 Jun 2018 12:25:35 -0700 Subject: [PATCH 07/34] Support for launching multiple containers per ec2 instance --- doodad/mode.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 8e88aa4..c15ec0a 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -74,7 +74,7 @@ def __init__(self, image='ubuntu:16.04', gpu=False): self.gpu = gpu def get_docker_cmd(self, main_cmd, extra_args='', use_tty=True, verbose=True, pythonpath=None, pre_cmd=None, post_cmd=None, - checkpoint=False, no_root=False): + checkpoint=False, no_root=False, use_docker_generated_name=False): cmd_list= CommandBuilder() if pre_cmd: cmd_list.extend(pre_cmd) @@ -99,7 +99,7 @@ def get_docker_cmd(self, main_cmd, extra_args='', use_tty=True, verbose=True, py cmd_list.extend(post_cmd) docker_name = self.docker_name - if docker_name: + if docker_name and not use_docker_generated_name: extra_args += ' --name %s '%docker_name if checkpoint: @@ -284,7 +284,7 @@ def s3_upload(self, file_name, bucket, remote_filename=None, dry=False, check_ex def make_timekey(self): return '%d'%(int(time.time()*1000)) - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, num_exps=1): default_config = dict( image_id=self.image_id, instance_type=self.instance_type, @@ -301,6 +301,14 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): else: exp_name = self.s3_log_name exp_prefix = self.s3_log_prefix + # Assume that the subdirectories are handled by having a subdirectory structure + # on the instance itself rather than copying to a subdirectory in s3. + # So instead of + # s3 sync OUTPUT_DIR_FOR_DOODAD_TARGET AWS_S3_PATH/exp-prefix/exp-name + # we now do + # s3 sync OUTPUT_DIR_FOR_DOODAD_TARGET AWS_S3_PATH/exp-prefix + # where OUTPUT_DIR_FOR_DOODAD_TARGET/exp-name[0..n] exist on the instance + exp_name = "" s3_base_dir = os.path.join(self.aws_s3_path, exp_prefix.replace("_", "-"), exp_name) sio = StringIO() @@ -310,8 +318,8 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): sio.write('die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n') sio.write('EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n') sio.write(""" - aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region} - """.format(exp_name=exp_name, aws_region=self.region)) + aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_prefix} --region {aws_region} + """.format(exp_prefix=exp_prefix, aws_region=self.region)) sio.write(""" aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region} """.format(exp_prefix=exp_prefix, aws_region=self.region)) @@ -418,8 +426,8 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): raise NotImplementedError() - sio.write("aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region}\n".format( - exp_name=exp_name, aws_region=self.region)) + sio.write("aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_prefix} --region {aws_region}\n".format( + exp_prefix=exp_prefix, aws_region=self.region)) if self.gpu: #sio.write('echo "LSMOD NVIDIA:"\n') @@ -446,7 +454,10 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): if self.checkpoint and self.checkpoint.restore: raise NotImplementedError() else: - docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) + docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True) + assert num_exps > 0 + for _ in range(num_exps - 1): + sio.write(docker_cmd+' &\n') sio.write(docker_cmd+'\n') # Sync all output mounts to s3 after running the user script From 00bc8adbedcaa9f257f9539a9d86fd1995e43e73 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 25 Jun 2018 16:59:22 -0700 Subject: [PATCH 08/34] remove exp_name clearing hack --- doodad/mode.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index c15ec0a..31b404f 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -301,14 +301,6 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, else: exp_name = self.s3_log_name exp_prefix = self.s3_log_prefix - # Assume that the subdirectories are handled by having a subdirectory structure - # on the instance itself rather than copying to a subdirectory in s3. - # So instead of - # s3 sync OUTPUT_DIR_FOR_DOODAD_TARGET AWS_S3_PATH/exp-prefix/exp-name - # we now do - # s3 sync OUTPUT_DIR_FOR_DOODAD_TARGET AWS_S3_PATH/exp-prefix - # where OUTPUT_DIR_FOR_DOODAD_TARGET/exp-name[0..n] exist on the instance - exp_name = "" s3_base_dir = os.path.join(self.aws_s3_path, exp_prefix.replace("_", "-"), exp_name) sio = StringIO() @@ -318,11 +310,12 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, sio.write('die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n') sio.write('EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n') sio.write(""" - aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_prefix} --region {aws_region} - """.format(exp_prefix=exp_prefix, aws_region=self.region)) + aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region} + """.format(exp_name=exp_name, aws_region=self.region)) sio.write(""" aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region} """.format(exp_prefix=exp_prefix, aws_region=self.region)) + sio.write("service docker start\n") sio.write("docker --config /home/ubuntu/.docker pull {docker_image}\n".format(docker_image=self.docker_image)) sio.write("export AWS_DEFAULT_REGION={aws_region}\n".format(aws_region=self.s3_bucket_region)) @@ -426,9 +419,6 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, raise NotImplementedError() - sio.write("aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_prefix} --region {aws_region}\n".format( - exp_prefix=exp_prefix, aws_region=self.region)) - if self.gpu: #sio.write('echo "LSMOD NVIDIA:"\n') #sio.write("lsmod | grep nvidia\n") From baa12c597d5330189a7930efb6eabcb6a6678868 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 25 Jun 2018 20:16:24 -0700 Subject: [PATCH 09/34] append the ec2 instance id to the stdout log --- doodad/mode.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doodad/mode.py b/doodad/mode.py index 31b404f..d961984 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -459,7 +459,10 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, local_dir=local_output_dir, s3_dir=s3_dir_path )) - sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_dir_path}/stdout.log\n".format(s3_dir_path=s3_base_dir)) + if num_exps == 1: + sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_dir_path}/stdout.log\n".format(s3_dir_path=s3_base_dir)) + else: + sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_dir_path}/stdout_$EC2_INSTANCE_ID.log\n".format(s3_dir_path=s3_base_dir)) # Wait for last sync if max_sync_interval > 0: From 83335257dce2378e1e70d39b8b01b4402ba37c40 Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Tue, 17 Jul 2018 12:06:57 -0700 Subject: [PATCH 10/34] Add swap to ec2 --- doodad/mode.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/doodad/mode.py b/doodad/mode.py index d961984..e1947e9 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -284,7 +284,7 @@ def s3_upload(self, file_name, bucket, remote_filename=None, dry=False, check_ex def make_timekey(self): return '%d'%(int(time.time()*1000)) - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, num_exps=1): + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, num_exps=1, swap_size=4096): default_config = dict( image_id=self.image_id, instance_type=self.instance_type, @@ -316,6 +316,19 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region} """.format(exp_prefix=exp_prefix, aws_region=self.region)) + # Add swap file + if self.gpu: + swap_location = '/mnt/swapfile' + else: + swap_location = '/var/swap.1' + sio.write( + 'sudo dd if=/dev/zero of={swap_location} bs=1M count={swap_size}\n' + .format(swap_location=swap_location, swap_size=swap_size)) + sio.write('sudo mkswap {swap_location}\n'.format(swap_location=swap_location)) + sio.write('sudo chmod 600 {swap_location}\n'.format(swap_location=swap_location)) + sio.write('sudo swapon {swap_location}\n'.format(swap_location=swap_location)) + + sio.write("service docker start\n") sio.write("docker --config /home/ubuntu/.docker pull {docker_image}\n".format(docker_image=self.docker_image)) sio.write("export AWS_DEFAULT_REGION={aws_region}\n".format(aws_region=self.s3_bucket_region)) From 134793d2459cabcb318e6c72080522d9fe4a088f Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Tue, 7 Aug 2018 13:21:46 -0700 Subject: [PATCH 11/34] remove trailing slash before stdout.log --- doodad/mode.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index d961984..98f6b47 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -460,9 +460,14 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, s3_dir=s3_dir_path )) if num_exps == 1: - sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_dir_path}/stdout.log\n".format(s3_dir_path=s3_base_dir)) + sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_path}\n".format( + s3_path=os.path.join(s3_base_dir, 'stdout.log'), + )) else: - sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_dir_path}/stdout_$EC2_INSTANCE_ID.log\n".format(s3_dir_path=s3_base_dir)) + sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_path}\n".format( + s3_path=os.path.join(s3_base_dir, + 'stdout_$EC2_INSTANCE_ID.log'), + )) # Wait for last sync if max_sync_interval > 0: From 5f0303a5950aa2b58f82854551549c5796c5906c Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Sat, 25 Aug 2018 15:57:37 -0700 Subject: [PATCH 12/34] add mode that generates a script to launch slurm experiments --- doodad/mode.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 8e88aa4..5ac157b 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -1,4 +1,5 @@ import os +import stat import subprocess import tempfile import uuid @@ -675,8 +676,8 @@ def __init__( self.n_tasks = n_tasks self.n_gpus = n_gpus - def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, pre_cmd=None, post_cmd=None): + def create_slurm_command(self, cmd, mount_points=None, + verbose=False, pre_cmd=None, post_cmd=None): if pre_cmd is None: pre_cmd = [] py_path = [] @@ -718,4 +719,32 @@ def launch_command(self, cmd, mount_points=None, dry=False, ) if verbose: print(full_cmd) + return full_cmd + + def launch_command(self, cmd, dry=False, **kwargs): + full_cmd = self.create_slurm_command(cmd, **kwargs) call_and_wait(full_cmd, dry=dry) + + +class ScriptSlurmSingularity(SlurmSingularity): + """ + Create or add to a script to run a bunch of slurm jobs. + """ + TMP_FILE = '/tmp/script_to_scp_over.sh' + + def launch_command( + self, cmd, first_launch_command=False, + dry=False, **kwargs + ): + full_cmd = self.create_slurm_command(cmd, **kwargs) + if first_launch_command: + with open(self.TMP_FILE, "w") as myfile: + myfile.write(full_cmd + '\n') + # make file executable + st = os.stat(self.TMP_FILE) + os.chmod(self.TMP_FILE, st.st_mode | stat.S_IEXEC) + print("Script generated! scp this script over:", self.TMP_FILE) + else: + with open(self.TMP_FILE, "a") as myfile: + myfile.write(full_cmd + '\n') + print("Script updated. scp this script over:", self.TMP_FILE) From 537fd2d5427de751d0b6a663d50f69cfaa8ab590 Mon Sep 17 00:00:00 2001 From: Dinesh Jayaraman Date: Wed, 5 Sep 2018 18:10:03 -0700 Subject: [PATCH 13/34] Fixed typo --- scripts/setup_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/setup_ec2.py b/scripts/setup_ec2.py index 6b4ece3..45ed52e 100644 --- a/scripts/setup_ec2.py +++ b/scripts/setup_ec2.py @@ -24,7 +24,7 @@ raise ValueError('Please set the $AWS_ACCESS_KEY environment variable') ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", None) if ACCESS_SECRET is None: - raise ValueError('Please set the $AWS_ACCESS_KEY environment variable') + raise ValueError('Please set the $AWS_ACCESS_SECRET environment variable') S3_BUCKET_NAME = os.environ.get("DOODAD_S3_BUCKET", None) if S3_BUCKET_NAME is None: raise ValueError('Please set the $DOODAD_S3_BUCKET environment variable') From 2f041de6d7770c76be075b55d2e81e8cb3b74fee Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Fri, 14 Sep 2018 15:37:34 -0700 Subject: [PATCH 14/34] give option to skip wait in call_and_wait --- doodad/mode.py | 19 +++++++++---------- doodad/utils.py | 15 +++++++++++---- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 8e88aa4..bd171a0 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -25,7 +25,8 @@ def __init__(self): super(Local, self).__init__() self.env = {} - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): + def launch_command(self, cmd, mount_points=None, dry=False, + verbose=False, skip_wait=False): if dry: print(cmd); return @@ -61,7 +62,7 @@ def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): commands.extend(cleanup_commands) # Call everything - commands.call_and_wait() + commands.call_and_wait(verbose=verbose, dry=dry, skip_wait=skip_wait) LOCAL = Local() @@ -123,7 +124,8 @@ def __init__(self, checkpoints=None, **kwargs): super(LocalDocker, self).__init__(**kwargs) self.checkpoints = checkpoints - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): + def launch_command(self, cmd, mount_points=None, dry=False, + verbose=False, skip_wait=False): mnt_args = '' py_path = [] for mount in mount_points: @@ -139,9 +141,7 @@ def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): full_cmd = self.get_docker_cmd(cmd, extra_args=mnt_args, pythonpath=py_path, checkpoint=self.checkpoints) - if verbose: - print(full_cmd) - call_and_wait(full_cmd, dry=dry) + call_and_wait(full_cmd, verbose=verbose, dry=dry, skip_wait=skip_wait) class SSHDocker(DockerMode): @@ -636,7 +636,8 @@ def get_singularity_cmd( class LocalSingularity(SingularityMode): def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, pre_cmd=None, post_cmd=None): + verbose=False, pre_cmd=None, post_cmd=None, + skip_wait=False): py_path = [] for mount in mount_points: if isinstance(mount, MountLocal): @@ -652,9 +653,7 @@ def launch_command(self, cmd, mount_points=None, dry=False, post_cmd=post_cmd, verbose=verbose, ) - if verbose: - print(full_cmd) - call_and_wait(full_cmd, dry=dry) + call_and_wait(full_cmd, verbose=verbose, dry=dry, skip_wait=skip_wait) class SlurmSingularity(LocalSingularity): diff --git a/doodad/utils.py b/doodad/utils.py index 5bafeb3..1989239 100644 --- a/doodad/utils.py +++ b/doodad/utils.py @@ -8,7 +8,7 @@ REPO_DIR = os.path.dirname(THIS_FILE_DIR) EXAMPLES_DIR = os.path.join(REPO_DIR, 'examples') -HASH_BUF_SIZE = 65536 +HASH_BUF_SIZE = 65536 def hash_file(filename): hasher = hashlib.md5() @@ -21,11 +21,13 @@ def hash_file(filename): return hasher.hexdigest() -def call_and_wait(cmd, verbose=False, dry=False): +def call_and_wait(cmd, verbose=False, dry=False, skip_wait=False): if dry or verbose: print(cmd) if not dry: p = subprocess.Popen(cmd, shell=True) + if skip_wait: + return try: p.wait() except KeyboardInterrupt: @@ -64,8 +66,13 @@ def __iter__(self): for cmd in self.cmds: yield cmd - def call_and_wait(self, verbose=False, dry=False): - return call_and_wait(self.to_string()) + def call_and_wait(self, verbose=False, dry=False, skip_wait=False): + return call_and_wait( + self.to_string(), + verbose=verbose, + dry=dry, + skip_wait=skip_wait, + ) @contextlib.contextmanager def as_script(self, suffix='.sh'): From 8a46c239fcc95b6efe0b27b3cc8818341823e839 Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Tue, 2 Oct 2018 21:44:34 -0700 Subject: [PATCH 15/34] WIP --- doodad/mode.py | 185 ++++++++++++++++++++++++++++++++++++++++++++++++ doodad/mount.py | 15 ++++ 2 files changed, 200 insertions(+) diff --git a/doodad/mode.py b/doodad/mode.py index 5ac157b..88ba20b 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -216,6 +216,191 @@ def dedent(s): lines = [l.strip() for l in s.split('\n')] return '\n'.join(lines) +class GCPDocker(DockerMode): + def __init__(self, + zone='"us-east1-d"', + gcp_bucket_name=None, + instance_type='n1-standard-1', + disk_size:"Gb"=64, + terminate=True, + image_id=None, + gce_log_prefix='experiment', + gce_log_name=None, + gce_log_path=None, + **kwargs + ): + assert 'CLOUDSDK_CORE_PROJECT' in os.environ.keys() + self.zone = zone + self.gcp_bucket_name = gcp_bucket_name + self.instance_type = instance_type + self.terminate = terminate + self.image_id = image_id + self.disk_size = disk_size + self.machine_type = \ + "zones/{zone}/machineTypes/{instance_type}".format( + zone=self.zone, + instance_type=self.instance_type, + ) + import googleapiclient.discovery + from google.cloud import storage + self.compute = googleapiclient.discovery.build('compute', 'v1') + storage_client = storage.Client() + self.bucket = storage_client.get_bucket(self.gcp_bucket_name) + + self.gce_log_prefix = gce_log_prefix + self.gce_log_name = gce_log_name + self.gce_log_path = gce_log_path or 'doodad/logs' + + def upload_file_to_gc_storage(self, file_name, remote_filename=None, dry=False): + if remote_filename is None: + remote_filename = os.path.basename(file_name) + remote_path = 'doodad/mount/'+remote_filename + blob = self.bucket.blob(remote_path) + blob.upload_from_filename(file_name) + + # def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): + # metadata = { + # 'docker_image': docker_container_name, + # 'bucket_name': 'test-gce-rail', + # 'local_mounts': json.dumps(['local_mount1', 'local_mount2', 'local_mount3']), + # 'gce_mounts': json.dumps(['local_mount1', 'local_mount2', 'local_mount3']), + # 'startup-script': open('/Users/steven/gce_script.sh', 'r').read() + # } + # pass + + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): + if self.gce_log_name is None: + exp_name = "{}-{}".format(self.gce_log_prefix, EC2SpotDocker.make_timekey(self)) + else: + exp_name = self.gce_log_name + exp_prefix = self.gce_log_prefix + gce_base_dir = os.path.join(self.gce_log_path, exp_prefix.replace("_", "-"), exp_name) + + mnt_args = '' + py_path = [] + local_output_dir_and_s3_path = [] + max_sync_interval = 0 + local_files = [] + gce_paths = [] + for mount in mount_points: + print('Handling mount: ', mount) + if isinstance(mount, MountLocal): # TODO: these should be mount_s3 objects + if mount.read_only: + if mount.path_on_remote is None: + with mount.gzip() as gzip_file: + gzip_path = os.path.realpath(gzip_file) + file_hash = hash_file(gzip_path) + gce_path = self.upload_file_to_gc_storage( + file_name=gzip_path, + remote_filename=file_hash+'.tar' + ) + mount.path_on_remote = gce_path + mount.local_file_hash = gzip_path + else: + import pdb; pdb.set_trace() + file_hash = mount.local_file_hash + s3_path = mount.path_on_remote + remote_unpack_name = '/tmp/'+file_hash + mount_point = os.path.join('/mounts', mount.mount_point.replace('~/','')) + mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name, os.path.basename(mount.local_dir)), mount_point) + if mount.pythonpath: + py_path.append(mount_point) + else: + raise ValueError() + elif isinstance(mount, MountS3): + import pdb; pdb.set_trace() + ec2_local_dir = mount.mount_point + gce_path = os.path.join(gce_base_dir, mount.gce_path) + if not mount.output: + raise NotImplementedError() + local_output_dir_and_gce_path.append( + (ec2_local_dir, gce_path) + ) + mnt_args += ' -v %s:%s' % (ec2_local_dir, mount.mount_point) + else: + raise NotImplementedError() + + if self.checkpoint and self.checkpoint.restore: + raise NotImplementedError() + else: + docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) + + metadata = {} + metadata['docker_cmd'] = docker_cmd + metadata['mnt_args'] = mnt_args + metadata['local_mounts'] = local_mounts + metadata['gce_mounts'] = local_output_dir_and_gce_path + metadata['python_path'] = py_path + metadata['use_gpu'] = json.dumps(self.gpu) + + + if self.terminate: + sio.write(""" + EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`" + aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region} + """.format(aws_region=self.region)) + + + def create_instance(self, metadata): + required_metadata_keys = [ + 'local_mounts', + 'gce_mounts', + 'bucket_name', + 'docker_image', + 'startup-script', + 'use_gpu', + ] + for required_key in required_metadata_keys: + assert required_key in metadata.keys() + + image_response = compute.images().getFromFamily( + project='ubuntu-os-cloud', + family='ubuntu-1604-lts' + ).execute() + source_disk_image = image_response['selfLink'] + # modified config template from: + # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/compute/api/create_instance.py + config = { + 'name': name, + 'machineType': self.machine_type, + 'disks': [ + { + 'boot': True, + 'autoDelete': True, + 'initializeParams': { + 'sourceImage': source_disk_image, + 'diskSizeGb': self.disk_size, + } + } + ], + 'networkInterfaces': [{ + 'network': 'global/networks/default', + 'accessConfigs': [ + {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'} + ] + }], + 'serviceAccounts': [{ + 'email': 'default', + 'scopes': [ + 'https://www.googleapis.com/auth/devstorage.read_write', + 'https://www.googleapis.com/auth/logging.write' + ] + }], + 'metadata': { + 'items': [ + {'key': key, 'value': value} + for key, value in metadata.items() + ] + } + } + return compute.instances().insert( + project=project, + zone=zone, + body=config).execute() + + + + class EC2SpotDocker(DockerMode): def __init__(self, credentials, diff --git a/doodad/mount.py b/doodad/mount.py index b686ee3..c5a1b14 100644 --- a/doodad/mount.py +++ b/doodad/mount.py @@ -88,6 +88,21 @@ def __init__(self, git_url, git_credentials=None, **kwargs): self.git_credentials = git_credentials raise NotImplementedError() +class MountGCP(Mount): + def __init__(self, gcp_path, s3_bucket=None, sync_interval=15, output=False, + include_types=('*.txt', '*.csv', '*.json', '*.gz', '*.tar', '*.log', '*.pkl'), **kwargs): + super(MountS3, self).__init__(**kwargs) + if s3_bucket is None: + # load from config + from doodad.ec2.autoconfig import AUTOCONFIG + s3_bucket = AUTOCONFIG.s3_bucket() + self.s3_bucket = s3_bucket + self.s3_path = s3_path + self.output = output + self.sync_interval = sync_interval + self.sync_on_terminate = True + self.include_types = include_types + class MountS3(Mount): def __init__(self, s3_path, s3_bucket=None, sync_interval=15, output=False, From 7ef5153885daa6b6412ae68f784979c1efe42fb5 Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Tue, 2 Oct 2018 21:47:04 -0700 Subject: [PATCH 16/34] Add gce script --- scripts/gce_script.sh | 52 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 scripts/gce_script.sh diff --git a/scripts/gce_script.sh b/scripts/gce_script.sh new file mode 100644 index 0000000..5d6d38b --- /dev/null +++ b/scripts/gce_script.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +install_docker() { + sudo apt-get install --no-install-recommends \ + apt-transport-https \ + curl \ + software-properties-common + curl -fsSL 'https://sks-keyservers.net/pks/lookup?op=get&search=0xee6d536cf7dc86e2d7d56f59a178ac6c6238f52e' | sudo apt-key add - + sudo add-apt-repository \ + "deb https://packages.docker.com/1.12/apt/repo/ \ + ubuntu-$(lsb_release -cs) \ + main" + sudo apt-get update + sudo apt-get -y install docker-engine + sudo usermod -a -G docker ubuntu +} + +truncate -s 0 /home/ubuntu/user_data.log +{ + bucket_name=$(curl http://metadata/computeMetadata/v1/instance/attributes/bucket_name -H "Metadata-Flavor: Google") + docker_image=$(curl http://metadata/computeMetadata/v1/instance/attributes/docker_image -H "Metadata-Flavor: Google") + local_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/local_mounts -H "Metadata-Flavor: Google") + gce_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/gce_mounts -H "Metadata-Flavor: Google") + use_gpu=$(curl http://metadata/computeMetadata/v1/instance/attributes/use_gpu -H "Metadata-Flavor: Google") + + yes | sudo apt-get update + install_docker + sudo apt-get install jq git unzip + die() { status=$1; shift; echo "FATAL: $*"; exit $status; } + service docker start + docker --config /home/ubuntu/.docker pull $docker_image + + num_local_mounts=$(jq length <<< $local_mounts) + for ((i=0;i<$num_local_mounts;i++)); do + local_mount=$(jq .[$i] <<< $local_mounts) + echo "Mounting " $local_mount + gsutil cp gs://$bucket_name/doodad/mount/$local_mount.tar /tmp/$local_mount.tar + mkdir -p /tmp/$local_mount + tar -xvf /tmp/$local_mount.tar -C /tmp/$local_mount + done + + num_gce_mounts=$(jq length <<< $gce_mounts) + for ((i=0;i<$num_gce_mounts;i++)); do + gce_mount_info=$(jq .[$i] <<< $gce_mounts) + # assume gce_mount_info is a (local_path, bucket_path) pair + local_path=$(jq .[0] <<< $gce_mount_info) + gce_bucket_path=$(jq .[1] <<< $gce_mount_info) + echo "Adding periodic sync " $gce_mount + done + + echo $num_mounts +} >> /home/ubuntu/user_data.log 2>&1 From 390dc190db522065216f5ea37aa40d3b80afa66d Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 29 Oct 2018 17:35:51 -0700 Subject: [PATCH 17/34] move skip_wait to init rather than launch_command --- doodad/mode.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index bd171a0..fa8b35e 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -21,12 +21,12 @@ def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): class Local(LaunchMode): - def __init__(self): + def __init__(self, skip_wait=False): super(Local, self).__init__() self.env = {} + self.skip_wait = skip_wait - def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, skip_wait=False): + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): if dry: print(cmd); return @@ -62,7 +62,8 @@ def launch_command(self, cmd, mount_points=None, dry=False, commands.extend(cleanup_commands) # Call everything - commands.call_and_wait(verbose=verbose, dry=dry, skip_wait=skip_wait) + commands.call_and_wait(verbose=verbose, dry=dry, + skip_wait=self.skip_wait) LOCAL = Local() @@ -120,12 +121,12 @@ def get_docker_cmd(self, main_cmd, extra_args='', use_tty=True, verbose=True, py class LocalDocker(DockerMode): - def __init__(self, checkpoints=None, **kwargs): + def __init__(self, checkpoints=None, skip_wait=False, **kwargs): super(LocalDocker, self).__init__(**kwargs) self.checkpoints = checkpoints + self.skip_wait = skip_wait - def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, skip_wait=False): + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): mnt_args = '' py_path = [] for mount in mount_points: @@ -141,7 +142,8 @@ def launch_command(self, cmd, mount_points=None, dry=False, full_cmd = self.get_docker_cmd(cmd, extra_args=mnt_args, pythonpath=py_path, checkpoint=self.checkpoints) - call_and_wait(full_cmd, verbose=verbose, dry=dry, skip_wait=skip_wait) + call_and_wait(full_cmd, verbose=verbose, dry=dry, + skip_wait=self.skip_wait) class SSHDocker(DockerMode): @@ -593,10 +595,11 @@ def __init__(self): class SingularityMode(LaunchMode): - def __init__(self, image, gpu=False): + def __init__(self, image, gpu=False, skip_wait=False): super(SingularityMode, self).__init__() self.singularity_image = image self.gpu = gpu + self.skip_wait = skip_wait def get_singularity_cmd( self, @@ -636,8 +639,7 @@ def get_singularity_cmd( class LocalSingularity(SingularityMode): def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, pre_cmd=None, post_cmd=None, - skip_wait=False): + verbose=False, pre_cmd=None, post_cmd=None): py_path = [] for mount in mount_points: if isinstance(mount, MountLocal): @@ -653,7 +655,8 @@ def launch_command(self, cmd, mount_points=None, dry=False, post_cmd=post_cmd, verbose=verbose, ) - call_and_wait(full_cmd, verbose=verbose, dry=dry, skip_wait=skip_wait) + call_and_wait(full_cmd, verbose=verbose, dry=dry, + skip_wait=self.skip_wait) class SlurmSingularity(LocalSingularity): @@ -717,4 +720,4 @@ def launch_command(self, cmd, mount_points=None, dry=False, ) if verbose: print(full_cmd) - call_and_wait(full_cmd, dry=dry) + call_and_wait(full_cmd, dry=dry, skip_wait=self.skip_wait) From 394c76e2f0650db2582a93f40b6e93e7b3cc12c9 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 29 Oct 2018 17:45:51 -0700 Subject: [PATCH 18/34] move num_exps and swap_size to init --- doodad/mode.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index b46dce9..c4fc738 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -233,8 +233,10 @@ def __init__(self, security_groups=None, aws_s3_path=None, extra_ec2_instance_kwargs=None, + num_exps=1, + swap_size=4096, **kwargs - ): + ): super(EC2SpotDocker, self).__init__(**kwargs) if security_group_ids is None: security_group_ids = [] @@ -255,6 +257,8 @@ def __init__(self, self.security_groups = security_groups self.iam_instance_profile_name = iam_instance_profile_name self.extra_ec2_instance_kwargs = extra_ec2_instance_kwargs + self.num_exps = num_exps + self.swap_size = swap_size self.checkpoint = None self.s3_mount_path = 's3://%s/doodad/mount' % self.s3_bucket @@ -284,7 +288,7 @@ def s3_upload(self, file_name, bucket, remote_filename=None, dry=False, check_ex def make_timekey(self): return '%d'%(int(time.time()*1000)) - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, num_exps=1, swap_size=4096): + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): default_config = dict( image_id=self.image_id, instance_type=self.instance_type, @@ -323,7 +327,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, swap_location = '/var/swap.1' sio.write( 'sudo dd if=/dev/zero of={swap_location} bs=1M count={swap_size}\n' - .format(swap_location=swap_location, swap_size=swap_size)) + .format(swap_location=swap_location, swap_size=self.swap_size)) sio.write('sudo mkswap {swap_location}\n'.format(swap_location=swap_location)) sio.write('sudo chmod 600 {swap_location}\n'.format(swap_location=swap_location)) sio.write('sudo swapon {swap_location}\n'.format(swap_location=swap_location)) @@ -458,8 +462,8 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, raise NotImplementedError() else: docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True) - assert num_exps > 0 - for _ in range(num_exps - 1): + assert self.num_exps > 0 + for _ in range(self.num_exps - 1): sio.write(docker_cmd+' &\n') sio.write(docker_cmd+'\n') @@ -472,7 +476,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False, local_dir=local_output_dir, s3_dir=s3_dir_path )) - if num_exps == 1: + if self.num_exps == 1: sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_path}\n".format( s3_path=os.path.join(s3_base_dir, 'stdout.log'), )) From 76263e38c1e5804afce866b7a04bedd72ed07fbf Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Sat, 25 Aug 2018 15:57:37 -0700 Subject: [PATCH 19/34] add mode that generates a script to launch slurm experiments --- doodad/mode.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 8e88aa4..5ac157b 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -1,4 +1,5 @@ import os +import stat import subprocess import tempfile import uuid @@ -675,8 +676,8 @@ def __init__( self.n_tasks = n_tasks self.n_gpus = n_gpus - def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, pre_cmd=None, post_cmd=None): + def create_slurm_command(self, cmd, mount_points=None, + verbose=False, pre_cmd=None, post_cmd=None): if pre_cmd is None: pre_cmd = [] py_path = [] @@ -718,4 +719,32 @@ def launch_command(self, cmd, mount_points=None, dry=False, ) if verbose: print(full_cmd) + return full_cmd + + def launch_command(self, cmd, dry=False, **kwargs): + full_cmd = self.create_slurm_command(cmd, **kwargs) call_and_wait(full_cmd, dry=dry) + + +class ScriptSlurmSingularity(SlurmSingularity): + """ + Create or add to a script to run a bunch of slurm jobs. + """ + TMP_FILE = '/tmp/script_to_scp_over.sh' + + def launch_command( + self, cmd, first_launch_command=False, + dry=False, **kwargs + ): + full_cmd = self.create_slurm_command(cmd, **kwargs) + if first_launch_command: + with open(self.TMP_FILE, "w") as myfile: + myfile.write(full_cmd + '\n') + # make file executable + st = os.stat(self.TMP_FILE) + os.chmod(self.TMP_FILE, st.st_mode | stat.S_IEXEC) + print("Script generated! scp this script over:", self.TMP_FILE) + else: + with open(self.TMP_FILE, "a") as myfile: + myfile.write(full_cmd + '\n') + print("Script updated. scp this script over:", self.TMP_FILE) From adaec17ea34ce5162bd8a80a6bd0297afa39393d Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Mon, 29 Oct 2018 17:42:40 -0700 Subject: [PATCH 20/34] Periodic log sync fix missing self reference also sync user_data.log on shutdown just use same stdout format --- doodad/mode.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doodad/mode.py b/doodad/mode.py index 5ac157b..647e7fc 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -402,6 +402,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): then logger "Running shutdown hook." aws s3 cp --recursive {log_dir} {s3_path} + aws s3 cp /home/ubuntu/user_data.log {s3_path} break else # Spot instance not yet marked for termination. @@ -418,6 +419,16 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): else: raise NotImplementedError() + stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log') + sio.write(""" + while /bin/true; do + aws s3 cp /home/ubuntu/user_data.log {s3_path} + sleep {periodic_sync_interval} + done & echo sync initiated + """.format( + s3_path=stdout_log_s3_path, + periodic_sync_interval=max_sync_interval + )) sio.write("aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region}\n".format( exp_name=exp_name, aws_region=self.region)) From e2ffde6106527b25c5e29bb68ffe9e2d5ce15272 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 29 Oct 2018 18:16:28 -0700 Subject: [PATCH 21/34] move mode-specific arguments to init/state --- doodad/mode.py | 53 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 5ac157b..88bc416 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -594,10 +594,13 @@ def __init__(self): class SingularityMode(LaunchMode): - def __init__(self, image, gpu=False): + def __init__(self, image, gpu=False, pre_cmd=None, + post_cmd=None): super(SingularityMode, self).__init__() self.singularity_image = image self.gpu = gpu + self.pre_cmd = pre_cmd + self.post_cmd = post_cmd def get_singularity_cmd( self, @@ -605,12 +608,10 @@ def get_singularity_cmd( extra_args='', verbose=True, pythonpath=None, - pre_cmd=None, - post_cmd=None, ): cmd_list= CommandBuilder() - if pre_cmd: - cmd_list.extend(pre_cmd) + if self.pre_cmd: + cmd_list.extend(self.pre_cmd) if verbose: if self.gpu: @@ -621,8 +622,8 @@ def get_singularity_cmd( cmd_list.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(pythonpath))) cmd_list.append(main_cmd) - if post_cmd: - cmd_list.extend(post_cmd) + if self.post_cmd: + cmd_list.extend(self.post_cmd) if self.gpu: extra_args += ' --nv ' @@ -636,8 +637,7 @@ def get_singularity_cmd( class LocalSingularity(SingularityMode): - def launch_command(self, cmd, mount_points=None, dry=False, - verbose=False, pre_cmd=None, post_cmd=None): + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): py_path = [] for mount in mount_points: if isinstance(mount, MountLocal): @@ -649,8 +649,6 @@ def launch_command(self, cmd, mount_points=None, dry=False, full_cmd = self.get_singularity_cmd( cmd, pythonpath=py_path, - pre_cmd=pre_cmd, - post_cmd=post_cmd, verbose=verbose, ) if verbose: @@ -676,10 +674,7 @@ def __init__( self.n_tasks = n_tasks self.n_gpus = n_gpus - def create_slurm_command(self, cmd, mount_points=None, - verbose=False, pre_cmd=None, post_cmd=None): - if pre_cmd is None: - pre_cmd = [] + def create_slurm_command(self, cmd, mount_points=None, verbose=False): py_path = [] for mount in mount_points: if isinstance(mount, MountLocal): @@ -691,8 +686,6 @@ def create_slurm_command(self, cmd, mount_points=None, singularity_cmd = self.get_singularity_cmd( cmd, pythonpath=py_path, - pre_cmd=pre_cmd, - post_cmd=post_cmd, verbose=verbose, ) if self.gpu: @@ -721,8 +714,10 @@ def create_slurm_command(self, cmd, mount_points=None, print(full_cmd) return full_cmd - def launch_command(self, cmd, dry=False, **kwargs): - full_cmd = self.create_slurm_command(cmd, **kwargs) + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): + full_cmd = self.create_slurm_command( + cmd, mount_points=mount_points, verbose=verbose, + ) call_and_wait(full_cmd, dry=dry) @@ -732,12 +727,24 @@ class ScriptSlurmSingularity(SlurmSingularity): """ TMP_FILE = '/tmp/script_to_scp_over.sh' + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_first_time = False + + def set_first_time(self, is_first_time): + self.is_first_time = is_first_time + def launch_command( - self, cmd, first_launch_command=False, - dry=False, **kwargs + self, + cmd, + dry=False, + mount_points=None, + verbose=False, ): - full_cmd = self.create_slurm_command(cmd, **kwargs) - if first_launch_command: + full_cmd = self.create_slurm_command( + cmd, mount_points=mount_points, verbose=verbose, + ) + if self.is_first_time: with open(self.TMP_FILE, "w") as myfile: myfile.write(full_cmd + '\n') # make file executable From b8f4e0f0fecf57994cf2cd744649c1edec96188c Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 29 Oct 2018 18:31:58 -0700 Subject: [PATCH 22/34] use s3_path if there is only one exp --- doodad/mode.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doodad/mode.py b/doodad/mode.py index 647e7fc..0e40099 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -419,7 +419,10 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): else: raise NotImplementedError() - stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log') + if self.num_exps > 1: + stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log') + else: + stdout_log_s3_path = os.path.join(s3_path, 'stdout.log') sio.write(""" while /bin/true; do aws s3 cp /home/ubuntu/user_data.log {s3_path} From eb8e11d072700fb7f065dfe057d28f5ab2093a4e Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Mon, 29 Oct 2018 18:55:32 -0700 Subject: [PATCH 23/34] for num_exp=1, save stdout.log to exp-specific dir --- doodad/mode.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 0e40099..2701dc3 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -303,6 +303,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): exp_name = self.s3_log_name exp_prefix = self.s3_log_prefix s3_base_dir = os.path.join(self.aws_s3_path, exp_prefix.replace("_", "-"), exp_name) + stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log') sio = StringIO() sio.write("#!/bin/bash\n") @@ -366,6 +367,8 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): # spot instance ec2_local_dir = mount.mount_point s3_path = os.path.join(s3_base_dir, mount.s3_path) + if self.num_exps == 1: + stdout_log_s3_path = os.path.join(s3_path, 'stdout_$EC2_INSTANCE_ID.log') if not mount.output: raise NotImplementedError() local_output_dir_and_s3_path.append( @@ -402,7 +405,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): then logger "Running shutdown hook." aws s3 cp --recursive {log_dir} {s3_path} - aws s3 cp /home/ubuntu/user_data.log {s3_path} + aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path} break else # Spot instance not yet marked for termination. @@ -415,17 +418,14 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): """.format( log_dir=ec2_local_dir, s3_path=s3_path, + stdout_log_s3_path=stdout_log_s3_path, )) else: raise NotImplementedError() - if self.num_exps > 1: - stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log') - else: - stdout_log_s3_path = os.path.join(s3_path, 'stdout.log') sio.write(""" while /bin/true; do - aws s3 cp /home/ubuntu/user_data.log {s3_path} + aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path} sleep {periodic_sync_interval} done & echo sync initiated """.format( @@ -473,7 +473,9 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): local_dir=local_output_dir, s3_dir=s3_dir_path )) - sio.write("aws s3 cp /home/ubuntu/user_data.log {s3_dir_path}/stdout.log\n".format(s3_dir_path=s3_base_dir)) + sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format( + stdout_log_s3_path, + )) # Wait for last sync if max_sync_interval > 0: From eee12cb52509215bf3986beee7cabe5431dfdbda Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Mon, 8 Oct 2018 12:05:41 -0700 Subject: [PATCH 24/34] WIP --- doodad/gcp/gcp_util.py | 55 +++++ doodad/mode.py | 360 ++++++++++++++--------------- doodad/mount.py | 19 +- scripts/gce_script.sh | 52 ----- scripts/gcp/gcp_shutdown_script.sh | 16 ++ scripts/gcp/gcp_startup_script.sh | 83 +++++++ 6 files changed, 338 insertions(+), 247 deletions(-) create mode 100644 doodad/gcp/gcp_util.py delete mode 100644 scripts/gce_script.sh create mode 100644 scripts/gcp/gcp_shutdown_script.sh create mode 100644 scripts/gcp/gcp_startup_script.sh diff --git a/doodad/gcp/gcp_util.py b/doodad/gcp/gcp_util.py new file mode 100644 index 0000000..940a049 --- /dev/null +++ b/doodad/gcp/gcp_util.py @@ -0,0 +1,55 @@ +import os + +from doodad.utils import hash_file, call_and_wait, CommandBuilder, REPO_DIR + +GCP_STARTUP_SCRIPT_PATH = os.path.join(REPO_DIR, "scripts/gcp/gcp_startup_script.sh") +GCP_SHUTDOWN_SCRIPT_PATH = os.path.join(REPO_DIR, "scripts/gcp/gcp_shutdown_script.sh") + +def upload_file_to_gcp_storage( + bucket_name, + file_name, + remote_filename=None, + dry=False, + check_exists=True +): + from google.cloud import storage + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name) + + if remote_filename is None: + remote_filename = os.path.basename(file_name) + remote_path = 'doodad/mount/' + remote_filename + blob = bucket.blob(remote_path) + if check_exists and blob.exists(storage_client): + print("{remote_path} already exists".format(remote_path=remote_path)) + return remote_path + blob.upload_from_filename(file_name) + return remote_path + +def get_machine_type(zone, instance_type): + return "zones/{zone}/machineTypes/{instance_type}".format( + zone=zone, + instance_type=instance_type, + ) + +def get_gpu_type(project, zone, gpu_model): + """ + Check the available gpu models for each zone + https://cloud.google.com/compute/docs/gpus/ + """ + assert gpu_model in [ + 'nvidia-tesla-p4', + 'nvidia-tesla-k80', + 'nvidia-tesla-v100', + 'nvidia-tesla-p100' + ] + + return ( + "https://www.googleapis.com/compute/v1/" + "projects/{project}/zones/{zone}/acceleratorTypes/{gpu_model}".format( + project=project, + zone=zone, + gpu_model=gpu_model + ) + ) + diff --git a/doodad/mode.py b/doodad/mode.py index 88ba20b..171bc1d 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -5,16 +5,18 @@ import uuid import time import base64 +import json try: from StringIO import StringIO except ImportError: from io import StringIO -from .mount import MountLocal, MountS3 -from .utils import hash_file, call_and_wait, CommandBuilder +from .mount import MountLocal, MountS3, MountGCP +from .utils import hash_file, call_and_wait, CommandBuilder, REPO_DIR from .ec2.aws_util import s3_upload, s3_exists - +from .gcp.gcp_util import GCP_STARTUP_SCRIPT_PATH, GCP_SHUTDOWN_SCRIPT_PATH, \ + upload_file_to_gcp_storage, get_machine_type, get_gpu_type class LaunchMode(object): def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): @@ -216,190 +218,6 @@ def dedent(s): lines = [l.strip() for l in s.split('\n')] return '\n'.join(lines) -class GCPDocker(DockerMode): - def __init__(self, - zone='"us-east1-d"', - gcp_bucket_name=None, - instance_type='n1-standard-1', - disk_size:"Gb"=64, - terminate=True, - image_id=None, - gce_log_prefix='experiment', - gce_log_name=None, - gce_log_path=None, - **kwargs - ): - assert 'CLOUDSDK_CORE_PROJECT' in os.environ.keys() - self.zone = zone - self.gcp_bucket_name = gcp_bucket_name - self.instance_type = instance_type - self.terminate = terminate - self.image_id = image_id - self.disk_size = disk_size - self.machine_type = \ - "zones/{zone}/machineTypes/{instance_type}".format( - zone=self.zone, - instance_type=self.instance_type, - ) - import googleapiclient.discovery - from google.cloud import storage - self.compute = googleapiclient.discovery.build('compute', 'v1') - storage_client = storage.Client() - self.bucket = storage_client.get_bucket(self.gcp_bucket_name) - - self.gce_log_prefix = gce_log_prefix - self.gce_log_name = gce_log_name - self.gce_log_path = gce_log_path or 'doodad/logs' - - def upload_file_to_gc_storage(self, file_name, remote_filename=None, dry=False): - if remote_filename is None: - remote_filename = os.path.basename(file_name) - remote_path = 'doodad/mount/'+remote_filename - blob = self.bucket.blob(remote_path) - blob.upload_from_filename(file_name) - - # def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): - # metadata = { - # 'docker_image': docker_container_name, - # 'bucket_name': 'test-gce-rail', - # 'local_mounts': json.dumps(['local_mount1', 'local_mount2', 'local_mount3']), - # 'gce_mounts': json.dumps(['local_mount1', 'local_mount2', 'local_mount3']), - # 'startup-script': open('/Users/steven/gce_script.sh', 'r').read() - # } - # pass - - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): - if self.gce_log_name is None: - exp_name = "{}-{}".format(self.gce_log_prefix, EC2SpotDocker.make_timekey(self)) - else: - exp_name = self.gce_log_name - exp_prefix = self.gce_log_prefix - gce_base_dir = os.path.join(self.gce_log_path, exp_prefix.replace("_", "-"), exp_name) - - mnt_args = '' - py_path = [] - local_output_dir_and_s3_path = [] - max_sync_interval = 0 - local_files = [] - gce_paths = [] - for mount in mount_points: - print('Handling mount: ', mount) - if isinstance(mount, MountLocal): # TODO: these should be mount_s3 objects - if mount.read_only: - if mount.path_on_remote is None: - with mount.gzip() as gzip_file: - gzip_path = os.path.realpath(gzip_file) - file_hash = hash_file(gzip_path) - gce_path = self.upload_file_to_gc_storage( - file_name=gzip_path, - remote_filename=file_hash+'.tar' - ) - mount.path_on_remote = gce_path - mount.local_file_hash = gzip_path - else: - import pdb; pdb.set_trace() - file_hash = mount.local_file_hash - s3_path = mount.path_on_remote - remote_unpack_name = '/tmp/'+file_hash - mount_point = os.path.join('/mounts', mount.mount_point.replace('~/','')) - mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name, os.path.basename(mount.local_dir)), mount_point) - if mount.pythonpath: - py_path.append(mount_point) - else: - raise ValueError() - elif isinstance(mount, MountS3): - import pdb; pdb.set_trace() - ec2_local_dir = mount.mount_point - gce_path = os.path.join(gce_base_dir, mount.gce_path) - if not mount.output: - raise NotImplementedError() - local_output_dir_and_gce_path.append( - (ec2_local_dir, gce_path) - ) - mnt_args += ' -v %s:%s' % (ec2_local_dir, mount.mount_point) - else: - raise NotImplementedError() - - if self.checkpoint and self.checkpoint.restore: - raise NotImplementedError() - else: - docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) - - metadata = {} - metadata['docker_cmd'] = docker_cmd - metadata['mnt_args'] = mnt_args - metadata['local_mounts'] = local_mounts - metadata['gce_mounts'] = local_output_dir_and_gce_path - metadata['python_path'] = py_path - metadata['use_gpu'] = json.dumps(self.gpu) - - - if self.terminate: - sio.write(""" - EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`" - aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region} - """.format(aws_region=self.region)) - - - def create_instance(self, metadata): - required_metadata_keys = [ - 'local_mounts', - 'gce_mounts', - 'bucket_name', - 'docker_image', - 'startup-script', - 'use_gpu', - ] - for required_key in required_metadata_keys: - assert required_key in metadata.keys() - - image_response = compute.images().getFromFamily( - project='ubuntu-os-cloud', - family='ubuntu-1604-lts' - ).execute() - source_disk_image = image_response['selfLink'] - # modified config template from: - # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/compute/api/create_instance.py - config = { - 'name': name, - 'machineType': self.machine_type, - 'disks': [ - { - 'boot': True, - 'autoDelete': True, - 'initializeParams': { - 'sourceImage': source_disk_image, - 'diskSizeGb': self.disk_size, - } - } - ], - 'networkInterfaces': [{ - 'network': 'global/networks/default', - 'accessConfigs': [ - {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'} - ] - }], - 'serviceAccounts': [{ - 'email': 'default', - 'scopes': [ - 'https://www.googleapis.com/auth/devstorage.read_write', - 'https://www.googleapis.com/auth/logging.write' - ] - }], - 'metadata': { - 'items': [ - {'key': key, 'value': value} - for key, value in metadata.items() - ] - } - } - return compute.instances().insert( - project=project, - zone=zone, - body=config).execute() - - - class EC2SpotDocker(DockerMode): def __init__(self, @@ -772,6 +590,174 @@ def __init__(self, ) +class GCPDocker(DockerMode): + def __init__( + self, + zone="us-east4-a", + gcp_bucket_name=None, + instance_type='n1-standard-8', + image_name=None, + image_project=None, + disk_size:"Gb"=64, + terminate=True, + gcp_log_prefix='experiment', + gcp_log_name=None, + gcp_log_path=None, + gpu_kwargs=None, + preemptible=True, + **kwargs + ): + super(GCPDocker, self).__init__(**kwargs) + assert 'CLOUDSDK_CORE_PROJECT' in os.environ.keys() + self.project = os.environ['CLOUDSDK_CORE_PROJECT'] + self.zone = zone + self.gcp_bucket_name = gcp_bucket_name + self.instance_type = instance_type + self.terminate = terminate + self.disk_size = disk_size + self.image_project = image_project + self.image_name = image_name + self.preemptible = preemptible + + self.gcp_log_prefix = gcp_log_prefix + self.gcp_log_name = gcp_log_name + self.gcp_log_path = gcp_log_path or 'doodad/logs' + if self.gpu: + self.num_gpu = gpu_kwargs['num_gpu'] + self.gpu_model = gpu_kwargs['gpu_model'] + self.gpu_type = get_gpu_type(self.project, self.zone, self.gpu_model) + + import googleapiclient.discovery + self.compute = googleapiclient.discovery.build('compute', 'v1') + + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): + if self.gcp_log_name is None: + exp_name = "{}-{}".format(self.gcp_log_prefix, EC2SpotDocker.make_timekey(self)) + else: + exp_name = self.gcp_log_name + exp_prefix = self.gcp_log_prefix + gcp_base_dir = os.path.join(self.gcp_log_path, exp_prefix.replace("_", "-"), exp_name) + + mnt_args = '' + py_path = [] + gcp_mount_info = [] + max_sync_interval = 0 + local_mounts = [] + for mount in mount_points: + print('Handling mount: ', mount) + if isinstance(mount, MountLocal): # TODO: these should be mount_s3 objects + if mount.read_only: + if mount.path_on_remote is None: + with mount.gzip() as gzip_file: + gzip_path = os.path.realpath(gzip_file) + file_hash = hash_file(gzip_path) + gcp_path = upload_file_to_gcp_storage( + bucket_name=self.gcp_bucket_name, + file_name=gzip_path, + remote_filename=file_hash+'.tar' + ) + mount.path_on_remote = gcp_path + mount.local_file_hash = file_hash + else: + file_hash = mount.local_file_hash + gcp_path = mount.path_on_remote + remote_unpack_name = '/tmp/'+file_hash + mount_point = os.path.join('/mounts', mount.mount_point.replace('~/','')) + mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name, os.path.basename(mount.local_dir)), mount_point) + if mount.pythonpath: + py_path.append(mount_point) + local_mounts.append(file_hash) + else: + raise ValueError() + elif isinstance(mount, MountGCP): + gcp_local_dir = mount.mount_point + gcp_path = os.path.join(gcp_base_dir, mount.gcp_path) + if not mount.output: + raise NotImplementedError() + gcp_mount_info.append( + (gcp_local_dir, gcp_path, mount.include_string, mount.sync_interval) + ) + mnt_args += ' -v %s:%s' % (gcp_local_dir, mount.mount_point) + else: + raise NotImplementedError() + + docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) + + metadata = { + 'bucket_name': self.gcp_bucket_name, + 'docker_cmd': docker_cmd, + 'docker_image': self.docker_image, + 'local_mounts': json.dumps(local_mounts), + 'gcp_mounts': json.dumps(gcp_mount_info), + 'use_gpu': json.dumps(self.gpu), + 'terminate': json.dumps(self.terminate), + 'startup-script': open(GCP_STARTUP_SCRIPT_PATH, "r").read(), + 'shutdown-script': open(GCP_SHUTDOWN_SCRIPT_PATH, "r").read(), + } + unique_prefix = "doodad" + str(uuid.uuid4()).replace("-", "") + # instance name must match regex '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'"> + import re + name_pattern = re.compile('(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)') + name = (exp_prefix + exp_name).replace("-", "").replace("_", "") + if not name_pattern.match(unique_prefix + name): + print(unique_prefix + name, " is not a valid GCP instance name") + name = "" + instance_name = unique_prefix + name + + self.create_instance(metadata, name=instance_name) + if verbose: + print(metadata) + + def create_instance(self, metadata, name): + image_response = self.compute.images().get( + project=self.image_project, + image=self.image_name, + ).execute() + source_disk_image = image_response['selfLink'] + config = { + 'name': name, + 'machineType': get_machine_type(self.zone, self.instance_type), + 'disks': [{ + 'boot': True, + 'autoDelete': True, + 'initializeParams': { + 'sourceImage': source_disk_image, + 'diskSizeGb': self.disk_size, + } + }], + 'networkInterfaces': [{ + 'network': 'global/networks/default', + 'accessConfigs': [ + {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'} + ] + }], + 'serviceAccounts': [{ + 'email': 'default', + 'scopes': ['https://www.googleapis.com/auth/cloud-platform'] + }], + 'metadata': { + 'items': [ + {'key': key, 'value': value} + for key, value in metadata.items() + ] + }, + 'scheduling': { + "onHostMaintenance": "terminate", + "automaticRestart": False, + "preemptible": self.preemptible, + }, + } + if self.gpu: + config["guestAccelerators"] = [{ + "acceleratorType": self.gpu_type, + "acceleratorCount": self.num_gpu, + }] + return self.compute.instances().insert( + project=self.project, + zone=self.zone, + body=config + ).execute() + class CodalabDocker(DockerMode): def __init__(self): super(CodalabDocker, self).__init__() diff --git a/doodad/mount.py b/doodad/mount.py index c5a1b14..1b72591 100644 --- a/doodad/mount.py +++ b/doodad/mount.py @@ -88,21 +88,24 @@ def __init__(self, git_url, git_credentials=None, **kwargs): self.git_credentials = git_credentials raise NotImplementedError() + class MountGCP(Mount): - def __init__(self, gcp_path, s3_bucket=None, sync_interval=15, output=False, + def __init__(self, gcp_path, gcp_bucket_name, sync_interval=15, output=False, include_types=('*.txt', '*.csv', '*.json', '*.gz', '*.tar', '*.log', '*.pkl'), **kwargs): - super(MountS3, self).__init__(**kwargs) - if s3_bucket is None: - # load from config - from doodad.ec2.autoconfig import AUTOCONFIG - s3_bucket = AUTOCONFIG.s3_bucket() - self.s3_bucket = s3_bucket - self.s3_path = s3_path + super(MountGCP, self).__init__(**kwargs) + self.gcp_bucket_name = gcp_bucket_name + self.gcp_path = gcp_path self.output = output self.sync_interval = sync_interval self.sync_on_terminate = True self.include_types = include_types + def __str__(self): + return 'MountGCP@gcp://%s/%s'% (self.gcp_bucket_name, self.gcp_path) + + @property + def include_string(self): + return ' '.join(['--include \'%s\''%type_ for type_ in self.include_types]) class MountS3(Mount): def __init__(self, s3_path, s3_bucket=None, sync_interval=15, output=False, diff --git a/scripts/gce_script.sh b/scripts/gce_script.sh deleted file mode 100644 index 5d6d38b..0000000 --- a/scripts/gce_script.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -install_docker() { - sudo apt-get install --no-install-recommends \ - apt-transport-https \ - curl \ - software-properties-common - curl -fsSL 'https://sks-keyservers.net/pks/lookup?op=get&search=0xee6d536cf7dc86e2d7d56f59a178ac6c6238f52e' | sudo apt-key add - - sudo add-apt-repository \ - "deb https://packages.docker.com/1.12/apt/repo/ \ - ubuntu-$(lsb_release -cs) \ - main" - sudo apt-get update - sudo apt-get -y install docker-engine - sudo usermod -a -G docker ubuntu -} - -truncate -s 0 /home/ubuntu/user_data.log -{ - bucket_name=$(curl http://metadata/computeMetadata/v1/instance/attributes/bucket_name -H "Metadata-Flavor: Google") - docker_image=$(curl http://metadata/computeMetadata/v1/instance/attributes/docker_image -H "Metadata-Flavor: Google") - local_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/local_mounts -H "Metadata-Flavor: Google") - gce_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/gce_mounts -H "Metadata-Flavor: Google") - use_gpu=$(curl http://metadata/computeMetadata/v1/instance/attributes/use_gpu -H "Metadata-Flavor: Google") - - yes | sudo apt-get update - install_docker - sudo apt-get install jq git unzip - die() { status=$1; shift; echo "FATAL: $*"; exit $status; } - service docker start - docker --config /home/ubuntu/.docker pull $docker_image - - num_local_mounts=$(jq length <<< $local_mounts) - for ((i=0;i<$num_local_mounts;i++)); do - local_mount=$(jq .[$i] <<< $local_mounts) - echo "Mounting " $local_mount - gsutil cp gs://$bucket_name/doodad/mount/$local_mount.tar /tmp/$local_mount.tar - mkdir -p /tmp/$local_mount - tar -xvf /tmp/$local_mount.tar -C /tmp/$local_mount - done - - num_gce_mounts=$(jq length <<< $gce_mounts) - for ((i=0;i<$num_gce_mounts;i++)); do - gce_mount_info=$(jq .[$i] <<< $gce_mounts) - # assume gce_mount_info is a (local_path, bucket_path) pair - local_path=$(jq .[0] <<< $gce_mount_info) - gce_bucket_path=$(jq .[1] <<< $gce_mount_info) - echo "Adding periodic sync " $gce_mount - done - - echo $num_mounts -} >> /home/ubuntu/user_data.log 2>&1 diff --git a/scripts/gcp/gcp_shutdown_script.sh b/scripts/gcp/gcp_shutdown_script.sh new file mode 100644 index 0000000..fe0b121 --- /dev/null +++ b/scripts/gcp/gcp_shutdown_script.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +bucket_name=$(curl http://metadata/computeMetadata/v1/instance/attributes/bucket_name -H "Metadata-Flavor: Google") +gcp_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/gcp_mounts -H "Metadata-Flavor: Google") +instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") + +num_gcp_mounts=$(jq length <<< $gcp_mounts) +for ((i=0;i<$num_gcp_mounts;i++)); do + gcp_mount_info=$(jq .[$i] <<< $gcp_mounts) + # assume gcp_mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple + local_path=$(jq .[0] <<< $gcp_mount_info | tr -d '"') + gcp_bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') + gsutil -m rsync -r $local_path gs://$bucket_name/$gcp_bucket_path +done + +gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log diff --git a/scripts/gcp/gcp_startup_script.sh b/scripts/gcp/gcp_startup_script.sh new file mode 100644 index 0000000..16d5d7a --- /dev/null +++ b/scripts/gcp/gcp_startup_script.sh @@ -0,0 +1,83 @@ +#!/bin/bash +install_docker() { + sudo apt-get install -y --no-install-recommends \ + apt-transport-https \ + curl \ + software-properties-common + curl -fsSL 'https://sks-keyservers.net/pks/lookup?op=get&search=0xee6d536cf7dc86e2d7d56f59a178ac6c6238f52e' | sudo apt-key add - + sudo add-apt-repository \ + "deb https://packages.docker.com/1.12/apt/repo/ \ + ubuntu-$(lsb_release -cs) \ + main" + sudo apt-get update + sudo apt-get -y install docker-engine + sudo usermod -a -G docker ubuntu +} + +{ + bucket_name=$(curl http://metadata/computeMetadata/v1/instance/attributes/bucket_name -H "Metadata-Flavor: Google") + docker_cmd=$(curl http://metadata/computeMetadata/v1/instance/attributes/docker_cmd -H "Metadata-Flavor: Google") + docker_image=$(curl http://metadata/computeMetadata/v1/instance/attributes/docker_image -H "Metadata-Flavor: Google") + local_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/local_mounts -H "Metadata-Flavor: Google") + gcp_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/gcp_mounts -H "Metadata-Flavor: Google") + use_gpu=$(curl http://metadata/computeMetadata/v1/instance/attributes/use_gpu -H "Metadata-Flavor: Google") + terminate=$(curl http://metadata/computeMetadata/v1/instance/attributes/terminate -H "Metadata-Flavor: Google") + instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") + echo $bucket_name, $docker_cmd, $docker_image, $local_mounts, $_mounts, $use_gpu, $terminate + + sudo apt-get update + #install_docker + while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do + sleep 1 + done + sudo apt-get install -y jq git unzip + die() { status=$1; shift; echo "FATAL: $*"; exit $status; } + service docker start + docker --config /home/ubuntu/.docker pull $docker_image + + num_local_mounts=$(jq length <<< $local_mounts) + for ((i=0;i<$num_local_mounts;i++)); do + local_mount=$(jq .[$i] <<< $local_mounts | tr -d '"') + echo "Mounting " $local_mount + gsutil cp gs://$bucket_name/doodad/mount/$local_mount.tar /tmp/$local_mount.tar + mkdir -p /tmp/$local_mount + tar -xvf /tmp/$local_mount.tar -C /tmp/$local_mount + done + + num__mounts=$(jq length <<< $gcp_mounts) + for ((i=0;i<$num__mounts;i++)); do + _mount_info=$(jq .[$i] <<< $gcp_mounts) + # assume _mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple + local_path=$(jq .[0] <<< $_mount_info | tr -d '"') + _bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') + include_string=$(jq .[2] <<< $_mount_info | tr -d '"') + periodic_sync_interval=$(jq .[3] <<< $_mount_info | tr -d '"') + while /bin/true; do + gsutil -m rsync -r $local_path gs://$bucket_name/$_bucket_path + sleep $periodic_sync_interval + done & echo sync from $local_path to gs://$bucket_name/$_bucket_path initiated + done + while /bin/true; do + gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log + sleep 300 + done & + + if [ "$use_gpu" = "true" ]; then + for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done + systemctl start nvidia-docker + echo 'Testing nvidia-smi' + nvidia-smi + echo 'Testing nvidia-smi inside docker' + nvidia-docker run --rm $docker_image nvidia-smi + fi + + echo $docker_cmd >> run_docker_command.sh + bash run_docker_command.sh + + if [ "$terminate" = "true" ]; then + echo "Finished experiment. Terminating" + zone=$(curl http://metadata/computeMetadata/v1/instance/zone -H "Metadata-Flavor: Google") + zone="${zone##*/}" + gcloud compute instances delete $instance_name --zone $zone --quiet + fi +} >> /home/ubuntu/user_data.log 2>&1 From 972402e2940d7999af3c2e4ada46338a5ec530ce Mon Sep 17 00:00:00 2001 From: Steven Lin Date: Thu, 1 Nov 2018 14:00:16 -0700 Subject: [PATCH 25/34] Add labels to instances --- doodad/mode.py | 24 +++++++--------- scripts/gcp/gcp_shutdown_script.sh | 8 ++++-- scripts/gcp/gcp_startup_script.sh | 46 +++++++++++++++++++----------- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index f6bfc48..ff31553 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -538,6 +538,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): print(full_script) with open("/tmp/full_ec2_script", "w") as f: f.write(full_script) + instance_args = dict( ImageId=aws_config["image_id"], KeyName=aws_config["key_name"], @@ -630,16 +631,16 @@ def __init__( self, zone="us-east4-a", gcp_bucket_name=None, - instance_type='n1-standard-8', + instance_type='n1-standard-4', image_name=None, image_project=None, disk_size:"Gb"=64, terminate=True, + preemptible=True, gcp_log_prefix='experiment', gcp_log_name=None, gcp_log_path=None, gpu_kwargs=None, - preemptible=True, **kwargs ): super(GCPDocker, self).__init__(**kwargs) @@ -729,21 +730,14 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): 'startup-script': open(GCP_STARTUP_SCRIPT_PATH, "r").read(), 'shutdown-script': open(GCP_SHUTDOWN_SCRIPT_PATH, "r").read(), } - unique_prefix = "doodad" + str(uuid.uuid4()).replace("-", "") # instance name must match regex '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'"> - import re - name_pattern = re.compile('(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)') - name = (exp_prefix + exp_name).replace("-", "").replace("_", "") - if not name_pattern.match(unique_prefix + name): - print(unique_prefix + name, " is not a valid GCP instance name") - name = "" - instance_name = unique_prefix + name - - self.create_instance(metadata, name=instance_name) + unique_name= "doodad" + str(uuid.uuid4()).replace("-", "") + self.create_instance(metadata, unique_name, exp_name, exp_prefix) if verbose: + print(unique_name) print(metadata) - def create_instance(self, metadata, name): + def create_instance(self, metadata, name, exp_name="", exp_prefix=""): image_response = self.compute.images().get( project=self.image_project, image=self.image_name, @@ -781,6 +775,10 @@ def create_instance(self, metadata, name): "automaticRestart": False, "preemptible": self.preemptible, }, + "labels": { + "exp_name": exp_name, + "exp_prefix": exp_prefix, + } } if self.gpu: config["guestAccelerators"] = [{ diff --git a/scripts/gcp/gcp_shutdown_script.sh b/scripts/gcp/gcp_shutdown_script.sh index fe0b121..99cf3ff 100644 --- a/scripts/gcp/gcp_shutdown_script.sh +++ b/scripts/gcp/gcp_shutdown_script.sh @@ -1,7 +1,11 @@ #!/bin/bash +query_metadata() { + attribute_name=$1 + curl http://metadata/computeMetadata/v1/instance/attributes/$attribute_name -H "Metadata-Flavor: Google" +} -bucket_name=$(curl http://metadata/computeMetadata/v1/instance/attributes/bucket_name -H "Metadata-Flavor: Google") -gcp_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/gcp_mounts -H "Metadata-Flavor: Google") +bucket_name=$(query_metadata bucket_name) +gcp_mounts=$(query_metadata gcp_mounts) instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") num_gcp_mounts=$(jq length <<< $gcp_mounts) diff --git a/scripts/gcp/gcp_startup_script.sh b/scripts/gcp/gcp_startup_script.sh index 16d5d7a..9338d87 100644 --- a/scripts/gcp/gcp_startup_script.sh +++ b/scripts/gcp/gcp_startup_script.sh @@ -14,16 +14,28 @@ install_docker() { sudo usermod -a -G docker ubuntu } +query_metadata() { + attribute_name=$1 + curl http://metadata/computeMetadata/v1/instance/attributes/$attribute_name -H "Metadata-Flavor: Google" +} + { - bucket_name=$(curl http://metadata/computeMetadata/v1/instance/attributes/bucket_name -H "Metadata-Flavor: Google") - docker_cmd=$(curl http://metadata/computeMetadata/v1/instance/attributes/docker_cmd -H "Metadata-Flavor: Google") - docker_image=$(curl http://metadata/computeMetadata/v1/instance/attributes/docker_image -H "Metadata-Flavor: Google") - local_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/local_mounts -H "Metadata-Flavor: Google") - gcp_mounts=$(curl http://metadata/computeMetadata/v1/instance/attributes/gcp_mounts -H "Metadata-Flavor: Google") - use_gpu=$(curl http://metadata/computeMetadata/v1/instance/attributes/use_gpu -H "Metadata-Flavor: Google") - terminate=$(curl http://metadata/computeMetadata/v1/instance/attributes/terminate -H "Metadata-Flavor: Google") + bucket_name=$(query_metadata bucket_name) + docker_cmd=$(query_metadata docker_cmd) + docker_image=$(query_metadata docker_image) + local_mounts=$(query_metadata local_mounts) + gcp_mounts=$(query_metadata gcp_mounts) + use_gpu=$(query_metadata use_gpu) + terminate=$(query_metadata terminate) instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") - echo $bucket_name, $docker_cmd, $docker_image, $local_mounts, $_mounts, $use_gpu, $terminate + echo "bucket_name:" $bucket_name + echo "docker_cmd:" $docker_cmd + echo "docker_image:" $docker_image + echo "local_mounts:" $local_mounts + echo "gcp_mounts:" $gcp_mounts + echo "use_gpu:" $use_gpu + echo "terminate:" $terminate + echo "instance_name:" $instance_name sudo apt-get update #install_docker @@ -44,18 +56,18 @@ install_docker() { tar -xvf /tmp/$local_mount.tar -C /tmp/$local_mount done - num__mounts=$(jq length <<< $gcp_mounts) - for ((i=0;i<$num__mounts;i++)); do - _mount_info=$(jq .[$i] <<< $gcp_mounts) + num_gcp_mounts=$(jq length <<< $gcp_mounts) + for ((i=0;i<$num_gcp_mounts;i++)); do + gcp_mount_info=$(jq .[$i] <<< $gcp_mounts) # assume _mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple - local_path=$(jq .[0] <<< $_mount_info | tr -d '"') - _bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') - include_string=$(jq .[2] <<< $_mount_info | tr -d '"') - periodic_sync_interval=$(jq .[3] <<< $_mount_info | tr -d '"') + local_path=$(jq .[0] <<< $gcp_mount_info | tr -d '"') + gcp_bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') + include_string=$(jq .[2] <<< $gcp_mount_info | tr -d '"') + periodic_sync_interval=$(jq .[3] <<< $gcp_mount_info | tr -d '"') while /bin/true; do - gsutil -m rsync -r $local_path gs://$bucket_name/$_bucket_path + gsutil -m rsync -r $local_path gs://$bucket_name/$gcp_bucket_path sleep $periodic_sync_interval - done & echo sync from $local_path to gs://$bucket_name/$_bucket_path initiated + done & echo sync from $local_path to gs://$bucket_name/$gcp_bucket_path initiated done while /bin/true; do gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log From 3bb86f607f5e73b7679a9ee7e26a5cdd6b9aaf42 Mon Sep 17 00:00:00 2001 From: Vitchyr Pong Date: Fri, 2 Nov 2018 18:26:55 -0700 Subject: [PATCH 26/34] fix typo --- doodad/mode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doodad/mode.py b/doodad/mode.py index 6b0a2d3..3730348 100644 --- a/doodad/mode.py +++ b/doodad/mode.py @@ -449,7 +449,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): sleep {periodic_sync_interval} done & echo sync initiated """.format( - s3_path=stdout_log_s3_path, + stdout_log_s3_path=stdout_log_s3_path, periodic_sync_interval=max_sync_interval )) @@ -493,7 +493,7 @@ def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): local_dir=local_output_dir, s3_dir=s3_dir_path )) - + sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format( stdout_log_s3_path, )) From b0e9127ae7d04209d446ce518ae76b4696cc955e Mon Sep 17 00:00:00 2001 From: Justin Fu Date: Fri, 2 Nov 2018 23:22:06 -0700 Subject: [PATCH 27/34] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index eb33a0d..bb90c1e 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,11 @@ python scripts/ec2_setup.py - (Optional) Set up [Docker](https://docs.docker.com/engine/installation/). This is required on the target machine if running in a Docker-enabled mode. +- (Optional) Set up GCP + - https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu + - https://cloud.google.com/compute/docs/tutorials/python-guide + - https://cloud.google.com/storage/docs/reference/libraries#client-libraries-install-python + ## Example From e9e899d9c18cb73ff8a198c4aefa710de2d8b547 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 03:54:34 -0800 Subject: [PATCH 28/34] rename get arg config --- .idea/vcs.xml | 6 ++++++ doodad/arg_parse.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 .idea/vcs.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/doodad/arg_parse.py b/doodad/arg_parse.py index c194814..e0a4c24 100644 --- a/doodad/arg_parse.py +++ b/doodad/arg_parse.py @@ -9,7 +9,7 @@ __ARGS = None -def __get_arg_config(): +def __get_unpickled_arg_config(): """ global __ARGS if __ARGS is not None: @@ -34,7 +34,7 @@ def __get_arg_config(): def get_args(key=None, default=None): - args = __get_arg_config() + args = __get_unpickled_arg_config() if args.args_data: if args.use_cloudpickle: From 3f7c4d49109f655240f50fd355871fd81c265180 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 04:04:27 -0800 Subject: [PATCH 29/34] rm idea vcs xml --- .gitignore | 103 --- LICENSE | 21 - README.md | 44 -- doodad/__init__.py | 2 - doodad/arg_parse.py | 4 +- doodad/easy_sweep/__init__.py | 1 - doodad/easy_sweep/hyper_sweep.py | 106 --- doodad/easy_sweep/launcher.py | 99 --- doodad/ec2/__init__.py | 1 - doodad/ec2/autoconfig.py | 43 - doodad/ec2/aws_util.py | 33 - doodad/ec2/credentials.py | 46 -- doodad/gcp/gcp_util.py | 55 -- doodad/launch_tools.py | 100 --- doodad/mode.py | 960 ----------------------- doodad/mount.py | 130 --- doodad/relaunch.py | 53 -- doodad/ssh/__init__.py | 1 - doodad/ssh/credentials.py | 66 -- doodad/utils.py | 89 --- examples/.gitignore | 1 - examples/docker_checkpoint/app_main.py | 21 - examples/docker_checkpoint/launch.py | 41 - examples/docker_newton_example.py | 38 - examples/ec2_launch/app_main.py | 19 - examples/ec2_launch/ec2_launch_test.py | 59 -- examples/secretlib/secretlib/__init__.py | 2 - hyper_viz/base.py | 94 --- hyper_viz/hyper_viz.py | 12 - hyper_viz/rllab_interface.py | 62 -- requirements.txt | 4 - scripts/gcp/gcp_shutdown_script.sh | 20 - scripts/gcp/gcp_startup_script.sh | 95 --- scripts/pull_s3_logs.py | 22 - scripts/run_experiment_lite_doodad.py | 9 - scripts/setup_ec2.py | 395 ---------- setup.py | 0 37 files changed, 2 insertions(+), 2849 deletions(-) delete mode 100644 .gitignore delete mode 100644 LICENSE delete mode 100644 README.md delete mode 100644 doodad/__init__.py delete mode 100644 doodad/easy_sweep/__init__.py delete mode 100644 doodad/easy_sweep/hyper_sweep.py delete mode 100644 doodad/easy_sweep/launcher.py delete mode 100644 doodad/ec2/__init__.py delete mode 100644 doodad/ec2/autoconfig.py delete mode 100644 doodad/ec2/aws_util.py delete mode 100644 doodad/ec2/credentials.py delete mode 100644 doodad/gcp/gcp_util.py delete mode 100644 doodad/launch_tools.py delete mode 100644 doodad/mode.py delete mode 100644 doodad/mount.py delete mode 100644 doodad/relaunch.py delete mode 100644 doodad/ssh/__init__.py delete mode 100644 doodad/ssh/credentials.py delete mode 100644 doodad/utils.py delete mode 100644 examples/.gitignore delete mode 100644 examples/docker_checkpoint/app_main.py delete mode 100644 examples/docker_checkpoint/launch.py delete mode 100644 examples/docker_newton_example.py delete mode 100644 examples/ec2_launch/app_main.py delete mode 100644 examples/ec2_launch/ec2_launch_test.py delete mode 100644 examples/secretlib/secretlib/__init__.py delete mode 100644 hyper_viz/base.py delete mode 100644 hyper_viz/hyper_viz.py delete mode 100644 hyper_viz/rllab_interface.py delete mode 100644 requirements.txt delete mode 100644 scripts/gcp/gcp_shutdown_script.sh delete mode 100644 scripts/gcp/gcp_startup_script.sh delete mode 100755 scripts/pull_s3_logs.py delete mode 100644 scripts/run_experiment_lite_doodad.py delete mode 100644 scripts/setup_ec2.py delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 7604ea9..0000000 --- a/.gitignore +++ /dev/null @@ -1,103 +0,0 @@ -aws_config/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# dotenv -.env - -# virtualenv -.venv -venv/ -ENV/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 6ec89f7..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 Justin Fu - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index bb90c1e..0000000 --- a/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# doodad - - -A library for launching python programs on different machines. Currently supports running locally and over EC2 and SSH (via Docker) with minimal (if any) modification to your existing program. - -EC2 code is based on [rllab](https://github.com/rll/rllab/)'s code. - - -## Setup - -- Add this repo to your pythonpath. -``` -export PYTHONPATH=$PYTHONPATH:/path/to/this/repo -``` - -- Install dependencies -``` -pip install -r requirements.txt -``` - -- (Optional) Set up EC2 -``` -python scripts/ec2_setup.py -``` - -- (Optional) Set up [Docker](https://docs.docker.com/engine/installation/). This is required on the target machine if running in a Docker-enabled mode. - -- (Optional) Set up GCP - - https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu - - https://cloud.google.com/compute/docs/tutorials/python-guide - - https://cloud.google.com/storage/docs/reference/libraries#client-libraries-install-python - - -## Example - -See [ec2_launch_test.py](https://github.com/justinjfu/doodad/blob/master/examples/ec2_launch/ec2_launch_test.py) for an example on how to run scripts on EC2, over SSH, or locally. - -## Tutorial - -See the [wiki](https://github.com/justinjfu/doodad/wiki/Home) - -## TODOs -- Add support for automatic experiment restarting (will require the user to write a save_state and restore_state function, or use something like CRIU) -- Fix output directories when using docker showing up as root permissions. diff --git a/doodad/__init__.py b/doodad/__init__.py deleted file mode 100644 index 198452b..0000000 --- a/doodad/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .launch_tools import * -from .arg_parse import get_args diff --git a/doodad/arg_parse.py b/doodad/arg_parse.py index c194814..e0a4c24 100644 --- a/doodad/arg_parse.py +++ b/doodad/arg_parse.py @@ -9,7 +9,7 @@ __ARGS = None -def __get_arg_config(): +def __get_unpickled_arg_config(): """ global __ARGS if __ARGS is not None: @@ -34,7 +34,7 @@ def __get_arg_config(): def get_args(key=None, default=None): - args = __get_arg_config() + args = __get_unpickled_arg_config() if args.args_data: if args.use_cloudpickle: diff --git a/doodad/easy_sweep/__init__.py b/doodad/easy_sweep/__init__.py deleted file mode 100644 index 77323fb..0000000 --- a/doodad/easy_sweep/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .launcher import DoodadSweeper diff --git a/doodad/easy_sweep/hyper_sweep.py b/doodad/easy_sweep/hyper_sweep.py deleted file mode 100644 index b19e0b1..0000000 --- a/doodad/easy_sweep/hyper_sweep.py +++ /dev/null @@ -1,106 +0,0 @@ -""" -Usage - -args = { -'param1': [1e-3, 1e-2, 1e-2], -'param2': [1,5,10,20], -} - -run_sweep_parallel(func, args) - -or - -run_sweep_serial(func, args) - -""" -import os -import itertools -import multiprocessing -import random -from datetime import datetime - -import doodad -from doodad.utils import REPO_DIR - - -class Sweeper(object): - def __init__(self, hyper_config, repeat, include_name=False): - self.hyper_config = hyper_config - self.repeat = repeat - self.include_name=include_name - - def __iter__(self): - count = 0 - for _ in range(self.repeat): - for config in itertools.product(*[val for val in self.hyper_config.values()]): - kwargs = {key:config[i] for i, key in enumerate(self.hyper_config.keys())} - if self.include_name: - timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - kwargs['exp_name'] = "%s_%d" % (timestamp, count) - count += 1 - yield kwargs - - -def run_sweep_serial(run_method, params, repeat=1): - sweeper = Sweeper(params, repeat) - for config in sweeper: - run_method(**config) - - -def kwargs_wrapper(args_method): - args, method = args_method - return method(**args) - - -def run_sweep_parallel(run_method, params, repeat=1, num_cpu=multiprocessing.cpu_count()): - sweeper = Sweeper(params, repeat) - pool = multiprocessing.Pool(num_cpu) - exp_args = [] - for config in sweeper: - exp_args.append((config, run_method)) - random.shuffle(exp_args) - pool.map(kwargs_wrapper, exp_args) - - -SCRIPTS_DIR = os.path.join(REPO_DIR, 'scripts') -def run_sweep_doodad(run_method, params, run_mode, mounts, repeat=1, test_one=False): - sweeper = Sweeper(params, repeat) - for config in sweeper: - def run_method_args(): - run_method(**config) - doodad.launch_python( - target = os.path.join(SCRIPTS_DIR, 'run_experiment_lite_doodad.py'), - mode=run_mode, - mount_points=mounts, - use_cloudpickle=True, - args = {'run_method': run_method_args}, - ) - if test_one: - break - - -def run_single_doodad(run_method, kwargs, run_mode, mounts, repeat=1): - """ Run a single function via doodad """ - sweeper = Sweeper(params, repeat) - def run_method_args(): - run_method(**kwargs) - doodad.launch_python( - target = os.path.join(SCRIPTS_DIR, 'run_experiment_lite_doodad.py'), - mode=run_mode, - mount_points=mounts, - use_cloudpickle=True, - args = {'run_method': run_method_args}, - ) - - -if __name__ == "__main__": - def example_run_method(exp_name, param1, param2='a', param3=3, param4=4): - import time - time.sleep(1.0) - print(exp_name, param1, param2, param3, param4) - sweep_op = { - 'param1': [1e-3, 1e-2, 1e-1], - 'param2': [1,5,10,20], - 'param3': [True, False] - } - run_sweep_parallel(example_run_method, sweep_op, repeat=2) diff --git a/doodad/easy_sweep/launcher.py b/doodad/easy_sweep/launcher.py deleted file mode 100644 index 7dce47b..0000000 --- a/doodad/easy_sweep/launcher.py +++ /dev/null @@ -1,99 +0,0 @@ -from datetime import datetime - -import doodad -import doodad.mode -import doodad.mount as mount -from doodad.utils import REPO_DIR -from doodad.easy_sweep.hyper_sweep import run_sweep_doodad, run_sweep_parallel, run_sweep_serial - -INSTANCE_TO_PRICE = { - 'c4.large': 0.03, - 'c4.xlarge': 0.05, - 'c4.2xlarge': 0.08, - 'p2.xlarge': 0.2, -} - -class DoodadSweeper(object): - def __init__(self, - mounts, - docker_img='python:3.5', - docker_output_dir='/data', - local_output_dir='data/docker', - ): - - self.image = docker_img - self.mode_local = doodad.mode.LocalDocker(image=docker_img) - - # always include doodad - mounts.append(mount.MountLocal(local_dir=REPO_DIR, pythonpath=True)) - self.mounts = mounts - self.mount_out_local = mount.MountLocal(local_dir=local_output_dir, mount_point=docker_output_dir, output=True) - self.mount_out_s3 = mount.MountS3(s3_path='exp_logs', mount_point=docker_output_dir, output=True) - - def run_sweep_serial(self, run_method, params, repeat=1): - run_sweep_serial(run_method, params, repeat=repeat) - - def run_sweep_parallel(self, run_method, params, repeat=1): - run_sweep_parallel(run_method, params, repeat=repeat) - - def run_test_docker(self, run_method, params, **kwargs): - run_sweep_doodad(run_method, params, run_mode=self.mode_local, - mounts=self.mounts+[self.mount_out_local], - test_one=True) - - def run_single_docker(self, run_method, kwargs): - run_single_doodad(run_method, kwargs, run_mode=self.mode_local, - mounts=self.mounts+[self.mount_out_local]) - - def run_sweep_ec2(self, run_method, params, bucket_name, - s3_log_name=None, add_date_to_logname=True, - region='us-east-2', instance_type='c4.xlarge', repeat=1): - if s3_log_name is None: - s3_log_name = 'unnamed_experiment' - if add_date_to_logname: - datestamp = datetime.now().strftime('%Y_%m_%d') - s3_log_name = '%s_%s' % (datestamp, s3_log_name) - - mode_ec2 = doodad.mode.EC2AutoconfigDocker( - image=self.image, - region=region, - s3_bucket=bucket_name, - instance_type=instance_type, - spot_price=INSTANCE_TO_PRICE[instance_type], - s3_log_prefix=s3_log_name, - ) - run_sweep_doodad(run_method, params, run_mode=mode_ec2, - mounts=self.mounts+[self.mount_out_s3], repeat=repeat) - - def run_single_ec2(self, run_method, kwargs, bucket_name, - s3_log_name=None, add_date_to_logname=True, - region='us-east-2', instance_type='c4.xlarge'): - if s3_log_name is None: - s3_log_name = 'unnamed_experiment' - if add_date_to_logname: - datestamp = datetime.now().strftime('%Y_%m_%d') - s3_log_name = '%s_%s' % (datestamp, s3_log_name) - - mode_ec2 = doodad.mode.EC2AutoconfigDocker( - image=self.image, - region=region, - s3_bucket=bucket_name, - instance_type=instance_type, - spot_price=INSTANCE_TO_PRICE[instance_type], - s3_log_prefix=s3_log_name, - ) - run_single_doodad(run_method, kwargs, run_mode=mode_ec2, - mounts=self.mounts+[self.mount_out_s3]) - -if __name__ == "__main__": - # test - def example_function(param1=0, param2='c'): - print(param1, param2) - sweep_params = { - 'param1': [0,1,2], - 'param2': ['a','b'] - } - SWEEPER = DoodadSweeper([], docker_img='justinfu/rl_base:0.1') - #SWEEPER.run_sweep_serial(example_function, sweep_params) - SWEEPER.run_sweep_ec2(example_function, sweep_params, bucket_name='doodad') - diff --git a/doodad/ec2/__init__.py b/doodad/ec2/__init__.py deleted file mode 100644 index 98806e2..0000000 --- a/doodad/ec2/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .credentials import AWSCredentials diff --git a/doodad/ec2/autoconfig.py b/doodad/ec2/autoconfig.py deleted file mode 100644 index 442c225..0000000 --- a/doodad/ec2/autoconfig.py +++ /dev/null @@ -1,43 +0,0 @@ -import configparser -import os -import json - -from doodad.utils import REPO_DIR - -class Autoconfig(object): - def __init__(self, filename=None): - if filename is None: - filename = os.path.join(REPO_DIR, 'aws_config', 'config.ini') - config = configparser.ConfigParser() - config.read(filename) - self.config = config - - def s3_bucket(self): - return self.config['default']['s3_bucket_name'] - - def iam_profile_name(self): - return self.config['default']['iam_instance_profile_name'] - - def aws_security_groups(self): - return self.config['default']['aws_security_groups'].split(',') - - def aws_security_group_ids(self): - id_dict = dict(self.config['aws_security_group_ids']) - for k in id_dict: - #json.loads(id_dict[k]) - id_dict[k] = eval(id_dict[k]) #TODO: Get rid of eval - return id_dict - - def aws_access_key(self): - return self.config['default']['aws_access_key'] - - def aws_access_secret(self): - return self.config['default']['aws_access_secret'] - - def aws_image_id(self, region): - return self.config['aws_image_ids'][region] - - def aws_key_name(self, region): - return self.config['aws_key_names'][region] - -AUTOCONFIG = Autoconfig() diff --git a/doodad/ec2/aws_util.py b/doodad/ec2/aws_util.py deleted file mode 100644 index def421c..0000000 --- a/doodad/ec2/aws_util.py +++ /dev/null @@ -1,33 +0,0 @@ -import subprocess - -def s3_exists(bucket, path, region=None): - cmd = 'aws s3 ls s3://%s/%s' % (bucket, path) - if region is not None: - cmd += ' --region %s'%region - try: - result = subprocess.check_output(cmd, shell=True) - except subprocess.CalledProcessError: - return False - - if result: - return True - else: - return False - - -def s3_upload(local_file_name, s3_bucket, s3_path, dry=False, region=None): - remote_path = "s3://%s/%s" % (s3_bucket, s3_path) - if region is None: - upload_cmd = ["aws", "s3", "cp", local_file_name, remote_path] - else: - upload_cmd = ["aws", "s3", "cp", '--region', region, local_file_name, remote_path] - print(' '.join(upload_cmd)) - if not dry: - subprocess.check_call(upload_cmd) - return remote_path - - -if __name__ == "__main__": - print(s3_exists('rail.ex2.a3c', 'jello/mount/tmp04cpn0zj.tar')) - print(s3_exists('rail.ex2.a3c', 'jello/mount/tmp04cpn0zj2.tar')) - diff --git a/doodad/ec2/credentials.py b/doodad/ec2/credentials.py deleted file mode 100644 index fd4059d..0000000 --- a/doodad/ec2/credentials.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import configparser -import io - -class AWSCredentials(object): - """ - Container for AWS credential information - - The from_env or from_config option can be used to avoid having key information inside source code - - Args: - aws_key (str): AWS key - aws_secret (str): AWS secret key - from_env (bool): If True, reads key and secret from environment variables - env_key (str, optional): Environment variable for AWS key. Default AWS_ACCESS_KEY. - env_secret_key (str, optional): Environment variable for AWS secret key. Default AWS_ACCESS_SECRET. - from_config (bool): If True, reads key from config file - config_filename (str, optional): - """ - def __init__(self, aws_key=None, aws_secret=None, - from_env=False, - from_config=False, - config_filename='~/.aws/credentials', - env_secret_key='AWS_ACCESS_SECRET', - env_key='AWS_ACCESS_KEY'): - self.key = aws_key - self.secret = aws_secret - self.from_env=from_env - if from_env: - self.key = os.environ.get(env_key) - self.secret = os.environ.get(env_secret_key) - if from_config: - with open(os.path.expanduser(config_filename)) as f: - sample_config = f.read() - config = configparser.RawConfigParser(allow_no_value=True) - config.read_string(sample_config) - self.key = config.get('default', 'aws_access_key_id') - self.secret = config.get('default', 'aws_secret_access_key') - - @property - def aws_key(self): - return self.key - - @property - def aws_secret_key(self): - return self.secret diff --git a/doodad/gcp/gcp_util.py b/doodad/gcp/gcp_util.py deleted file mode 100644 index 940a049..0000000 --- a/doodad/gcp/gcp_util.py +++ /dev/null @@ -1,55 +0,0 @@ -import os - -from doodad.utils import hash_file, call_and_wait, CommandBuilder, REPO_DIR - -GCP_STARTUP_SCRIPT_PATH = os.path.join(REPO_DIR, "scripts/gcp/gcp_startup_script.sh") -GCP_SHUTDOWN_SCRIPT_PATH = os.path.join(REPO_DIR, "scripts/gcp/gcp_shutdown_script.sh") - -def upload_file_to_gcp_storage( - bucket_name, - file_name, - remote_filename=None, - dry=False, - check_exists=True -): - from google.cloud import storage - storage_client = storage.Client() - bucket = storage_client.get_bucket(bucket_name) - - if remote_filename is None: - remote_filename = os.path.basename(file_name) - remote_path = 'doodad/mount/' + remote_filename - blob = bucket.blob(remote_path) - if check_exists and blob.exists(storage_client): - print("{remote_path} already exists".format(remote_path=remote_path)) - return remote_path - blob.upload_from_filename(file_name) - return remote_path - -def get_machine_type(zone, instance_type): - return "zones/{zone}/machineTypes/{instance_type}".format( - zone=zone, - instance_type=instance_type, - ) - -def get_gpu_type(project, zone, gpu_model): - """ - Check the available gpu models for each zone - https://cloud.google.com/compute/docs/gpus/ - """ - assert gpu_model in [ - 'nvidia-tesla-p4', - 'nvidia-tesla-k80', - 'nvidia-tesla-v100', - 'nvidia-tesla-p100' - ] - - return ( - "https://www.googleapis.com/compute/v1/" - "projects/{project}/zones/{zone}/acceleratorTypes/{gpu_model}".format( - project=project, - zone=zone, - gpu_model=gpu_model - ) - ) - diff --git a/doodad/launch_tools.py b/doodad/launch_tools.py deleted file mode 100644 index 292f630..0000000 --- a/doodad/launch_tools.py +++ /dev/null @@ -1,100 +0,0 @@ -import os - -from .mode import LOCAL, Local -from .arg_parse import encode_args, ARGS_DATA, USE_CLOUDPICKLE, CLOUDPICKLE_VERSION -from .mount import MountLocal - - -def launch_shell( - command, - mode=LOCAL, - dry=False, - mount_points=None, - ): - if mount_points is None: - mount_points = [] - mode.launch_command(command, dry=dry) - - -def launch_python( - target, - python_cmd='python', - mode=LOCAL, - mount_points=None, - args=None, - fake_display=False, - target_mount_dir='target', - use_cloudpickle=False, - target_mount=None, - launch_locally=None, - **launch_command_kwargs -): - """ - - :param target: Path to script to run. - :param python_cmd: - :param mode: - :param mount_points: - :param args: - :param dry: - :param fake_display: - :param target_mount_dir: - :param verbose: - :param use_cloudpickle: - :param target_mount: If set, ignore target and just use this as the target. - :return: - """ - if args is None: - args = {} - if mount_points is None: - mount_points = [] - if launch_locally is None: - launch_locally = isinstance(mode, Local) - - if target_mount is None: - # mount - target_dir = os.path.dirname(target) - if not target_mount_dir: - target_mount_dir = target_dir - target_mount_dir = os.path.join(target_mount_dir, os.path.basename(target_dir)) - if launch_locally: - target_mount = MountLocal(local_dir=target_dir) - else: - target_mount = MountLocal(local_dir=target_dir, mount_point=target_mount_dir) - mount_points = mount_points + [target_mount] - target_full_path = os.path.join(target_mount.mount_dir(), os.path.basename(target)) - - command = make_python_command( - target_full_path, - args=args, - python_cmd=python_cmd, - fake_display=fake_display, - use_cloudpickle=use_cloudpickle, - ) - mode.launch_command(command, mount_points=mount_points, - **launch_command_kwargs) - return target_mount - -HEADLESS = 'xvfb-run -a -s "-ac -screen 0 1400x900x24 +extension RANDR"' -def make_python_command( - target, - python_cmd='python', - args=None, - fake_display=False, - use_cloudpickle=False, -): - - if fake_display: - cmd = '{headless} {python_cmd} {target}'.format(headless=HEADLESS, python_cmd=python_cmd, target=target) - else: - cmd = '%s %s' % (python_cmd, target) - - args_encoded, cp_version = encode_args(args, cloudpickle=use_cloudpickle) - if args: - cmd = '%s=%s %s=%s %s=%s %s' % (ARGS_DATA, args_encoded, - USE_CLOUDPICKLE, str(int(use_cloudpickle)), - CLOUDPICKLE_VERSION, cp_version, - cmd) - - return cmd - diff --git a/doodad/mode.py b/doodad/mode.py deleted file mode 100644 index bc53db1..0000000 --- a/doodad/mode.py +++ /dev/null @@ -1,960 +0,0 @@ -import os -import stat -import subprocess -import tempfile -import uuid -import time -import base64 -import json - -try: - from StringIO import StringIO -except ImportError: - from io import StringIO - -from .mount import MountLocal, MountS3, MountGCP -from .utils import hash_file, call_and_wait, CommandBuilder, REPO_DIR -from .ec2.aws_util import s3_upload, s3_exists -from .gcp.gcp_util import GCP_STARTUP_SCRIPT_PATH, GCP_SHUTDOWN_SCRIPT_PATH, \ - upload_file_to_gcp_storage, get_machine_type, get_gpu_type - -class LaunchMode(object): - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): - raise NotImplementedError() - - -class Local(LaunchMode): - def __init__(self, skip_wait=False): - super(Local, self).__init__() - self.env = {} - self.skip_wait = skip_wait - - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): - if dry: - print(cmd); return - - commands = CommandBuilder() - # chdir to home dir - commands.append('cd %s' % (os.path.expanduser('~'))) - - # do mounting - py_path = [] - cleanup_commands = CommandBuilder() - for mount in mount_points: - print('mounting:', mount) - if isinstance(mount, MountLocal): - if not mount.no_remount: - mount.create_if_nonexistent() - commands.append('ln -s %s %s' % (mount.local_dir, mount.mount_point)) - #subprocess.call(symlink_cmd, shell=True) - if mount.cleanup: - cleanup_commands.append('rm "%s"' % mount.mount_point) - if mount.pythonpath: - py_path.append(mount.mount_point) - else: - raise NotImplementedError() - - # add pythonpath mounts - if py_path: - commands.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(py_path))) - - # Add main command - commands.append(cmd) - - # cleanup - commands.extend(cleanup_commands) - - # Call everything - commands.call_and_wait(verbose=verbose, dry=dry, - skip_wait=self.skip_wait) - -LOCAL = Local() - - -class DockerMode(LaunchMode): - def __init__(self, image='ubuntu:16.04', gpu=False): - super(DockerMode, self).__init__() - self.docker_image = image - self.docker_name = uuid.uuid4() - self.gpu = gpu - - def get_docker_cmd(self, main_cmd, extra_args='', use_tty=True, verbose=True, pythonpath=None, pre_cmd=None, post_cmd=None, - checkpoint=False, no_root=False, use_docker_generated_name=False): - cmd_list= CommandBuilder() - if pre_cmd: - cmd_list.extend(pre_cmd) - - if verbose: - if self.gpu: - cmd_list.append('echo \"Running in docker (gpu)\"') - else: - cmd_list.append('echo \"Running in docker\"') - if pythonpath: - cmd_list.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(pythonpath))) - if no_root: - # This should work if you're running a script - #cmd_list.append('useradd --uid $(id -u) --no-create-home --home-dir / doodaduser') - #cmd_list.append('su - doodaduser /bin/bash {script}') - - # this is a temp workaround - extra_args += ' -u $(id -u)' - - cmd_list.append(main_cmd) - if post_cmd: - cmd_list.extend(post_cmd) - - docker_name = self.docker_name - if docker_name and not use_docker_generated_name: - extra_args += ' --name %s '%docker_name - - if checkpoint: - # set up checkpoint stuff - use_tty = False - extra_args += ' -d ' # detach is optional - - if use_tty: - docker_prefix = 'docker run %s -ti %s /bin/bash -c ' % (extra_args, self.docker_image) - else: - docker_prefix = 'docker run %s %s /bin/bash -c ' % (extra_args, self.docker_image) - if self.gpu: - docker_prefix = 'nvidia-'+docker_prefix - main_cmd = cmd_list.to_string() - full_cmd = docker_prefix + ("\'%s\'" % main_cmd) - return full_cmd - - -class LocalDocker(DockerMode): - def __init__(self, checkpoints=None, skip_wait=False, **kwargs): - super(LocalDocker, self).__init__(**kwargs) - self.checkpoints = checkpoints - self.skip_wait = skip_wait - - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): - mnt_args = '' - py_path = [] - for mount in mount_points: - if isinstance(mount, MountLocal): - #mount_pnt = os.path.expanduser(mount.mount_point) - mount_pnt = mount.mount_dir() - mnt_args += ' -v %s:%s' % (mount.local_dir, mount_pnt) - call_and_wait('mkdir -p %s' % mount.local_dir) - if mount.pythonpath: - py_path.append(mount_pnt) - else: - raise NotImplementedError(type(mount)) - - full_cmd = self.get_docker_cmd(cmd, extra_args=mnt_args, pythonpath=py_path, - checkpoint=self.checkpoints) - call_and_wait(full_cmd, verbose=verbose, dry=dry, - skip_wait=self.skip_wait) - - -class SSHDocker(DockerMode): - TMP_DIR = '~/.remote_tmp' - - def __init__(self, credentials=None, **docker_args): - super(SSHDocker, self).__init__(**docker_args) - self.credentials = credentials - self.run_id = 'run_%s' % uuid.uuid4() - self.tmp_dir = os.path.join(SSHDocker.TMP_DIR, self.run_id) - self.checkpoint = None - - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): - py_path = [] - remote_cmds = CommandBuilder() - remote_cleanup_commands = CommandBuilder() - mnt_args = '' - - tmp_dir_cmd = 'mkdir -p %s' % self.tmp_dir - tmp_dir_cmd = self.credentials.get_ssh_bash_cmd(tmp_dir_cmd) - call_and_wait(tmp_dir_cmd, dry=dry, verbose=verbose) - - # SCP Code over - for mount in mount_points: - if isinstance(mount, MountLocal): - if mount.read_only: - with mount.gzip() as gzip_file: - # scp - base_name = os.path.basename(gzip_file) - #file_hash = hash_file(gzip_path) # TODO: store all code in a special "caches" folder - remote_mnt_dir = os.path.join(self.tmp_dir, os.path.splitext(base_name)[0]) - remote_tar = os.path.join(self.tmp_dir, base_name) - scp_cmd = self.credentials.get_scp_cmd(gzip_file, remote_tar) - call_and_wait(scp_cmd, dry=dry, verbose=verbose) - remote_cmds.append('mkdir -p %s' % remote_mnt_dir) - unzip_cmd = 'tar -xf %s -C %s' % (remote_tar, remote_mnt_dir) - remote_cmds.append(unzip_cmd) - mount_point = mount.mount_dir() - mnt_args += ' -v %s:%s' % (os.path.join(remote_mnt_dir, os.path.basename(mount.mount_point)) ,mount_point) - else: - #remote_cmds.append('mkdir -p %s' % mount.mount_point) - remote_cmds.append('mkdir -p %s' % mount.local_dir_raw) - mnt_args += ' -v %s:%s' % (mount.local_dir_raw, mount.mount_point) - - if mount.pythonpath: - py_path.append(mount_point) - else: - raise NotImplementedError() - - if self.checkpoint and self.checkpoint.restore: - raise NotImplementedError() - else: - docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) - - - remote_cmds.append(docker_cmd) - remote_cmds.extend(remote_cleanup_commands) - - with tempfile.NamedTemporaryFile('w+', suffix='.sh') as ntf: - for cmd in remote_cmds: - if verbose: - ntf.write('echo "%s$ %s"\n' % (self.credentials.user_host, cmd)) - ntf.write(cmd+'\n') - ntf.seek(0) - ssh_cmd = self.credentials.get_ssh_script_cmd(ntf.name) - - call_and_wait(ssh_cmd, dry=dry, verbose=verbose) - - -def dedent(s): - lines = [l.strip() for l in s.split('\n')] - return '\n'.join(lines) - -class EC2SpotDocker(DockerMode): - def __init__(self, - credentials, - region='us-west-1', - s3_bucket_region='us-west-1', - instance_type='m1.small', - spot_price=0.0, - s3_bucket=None, - terminate=True, - image_id=None, - aws_key_name=None, - iam_instance_profile_name='doodad', - s3_log_prefix='experiment', - s3_log_name=None, - security_group_ids=None, - security_groups=None, - aws_s3_path=None, - extra_ec2_instance_kwargs=None, - num_exps=1, - swap_size=4096, - **kwargs - ): - super(EC2SpotDocker, self).__init__(**kwargs) - if security_group_ids is None: - security_group_ids = [] - if security_groups is None: - security_groups = [] - self.credentials = credentials - self.region = region - self.s3_bucket_region = s3_bucket_region - self.spot_price = str(float(spot_price)) - self.instance_type = instance_type - self.terminate = terminate - self.s3_bucket = s3_bucket - self.image_id = image_id - self.aws_key_name = aws_key_name - self.s3_log_prefix = s3_log_prefix - self.s3_log_name = s3_log_name - self.security_group_ids = security_group_ids - self.security_groups = security_groups - self.iam_instance_profile_name = iam_instance_profile_name - self.extra_ec2_instance_kwargs = extra_ec2_instance_kwargs - self.num_exps = num_exps - self.swap_size = swap_size - self.checkpoint = None - - self.s3_mount_path = 's3://%s/doodad/mount' % self.s3_bucket - self.aws_s3_path = aws_s3_path or 's3://%s/doodad/logs' % self.s3_bucket - - def upload_file_to_s3(self, script_content, dry=False): - f = tempfile.NamedTemporaryFile(delete=False) - f.write(script_content.encode()) - f.close() - remote_path = os.path.join(self.s3_mount_path, 'oversize_bash_scripts', str(uuid.uuid4())) - subprocess.check_call(["aws", "s3", "cp", f.name, remote_path, - '--region', self.s3_bucket_region]) - os.unlink(f.name) - return remote_path - - def s3_upload(self, file_name, bucket, remote_filename=None, dry=False, check_exist=True): - if remote_filename is None: - remote_filename = os.path.basename(file_name) - remote_path = 'doodad/mount/'+remote_filename - if check_exist: - if s3_exists(bucket, remote_path, region=self.s3_bucket_region): - print('\t%s exists! ' % os.path.join(bucket, remote_path)) - return 's3://'+os.path.join(bucket, remote_path) - return s3_upload(file_name, bucket, remote_path, dry=dry, - region=self.s3_bucket_region) - - def make_timekey(self): - return '%d'%(int(time.time()*1000)) - - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): - default_config = dict( - image_id=self.image_id, - instance_type=self.instance_type, - key_name=self.aws_key_name, - spot_price=self.spot_price, - iam_instance_profile_name=self.iam_instance_profile_name, - security_groups=self.security_groups, - security_group_ids=self.security_group_ids, - network_interfaces=[], - ) - aws_config = dict(default_config) - if self.s3_log_name is None: - exp_name = "{}-{}".format(self.s3_log_prefix, self.make_timekey()) - else: - exp_name = self.s3_log_name - exp_prefix = self.s3_log_prefix - s3_base_dir = os.path.join(self.aws_s3_path, exp_prefix.replace("_", "-"), exp_name) - stdout_log_s3_path = os.path.join(s3_base_dir, 'stdout_$EC2_INSTANCE_ID.log') - - sio = StringIO() - sio.write("#!/bin/bash\n") - sio.write("truncate -s 0 /home/ubuntu/user_data.log\n") - sio.write("{\n") - sio.write('die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n') - sio.write('EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n') - sio.write(""" - aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region} - """.format(exp_name=exp_name, aws_region=self.region)) - sio.write(""" - aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region} - """.format(exp_prefix=exp_prefix, aws_region=self.region)) - - # Add swap file - if self.gpu: - swap_location = '/mnt/swapfile' - else: - swap_location = '/var/swap.1' - sio.write( - 'sudo dd if=/dev/zero of={swap_location} bs=1M count={swap_size}\n' - .format(swap_location=swap_location, swap_size=self.swap_size)) - sio.write('sudo mkswap {swap_location}\n'.format(swap_location=swap_location)) - sio.write('sudo chmod 600 {swap_location}\n'.format(swap_location=swap_location)) - sio.write('sudo swapon {swap_location}\n'.format(swap_location=swap_location)) - - - sio.write("service docker start\n") - sio.write("docker --config /home/ubuntu/.docker pull {docker_image}\n".format(docker_image=self.docker_image)) - sio.write("export AWS_DEFAULT_REGION={aws_region}\n".format(aws_region=self.s3_bucket_region)) - sio.write(""" - curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip" - unzip awscli-bundle.zip - sudo ./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws - """) - - mnt_args = '' - py_path = [] - local_output_dir_and_s3_path = [] - max_sync_interval = 0 - for mount in mount_points: - print('Handling mount: ', mount) - if isinstance(mount, MountLocal): # TODO: these should be mount_s3 objects - if mount.read_only: - if mount.path_on_remote is None: - with mount.gzip() as gzip_file: - gzip_path = os.path.realpath(gzip_file) - file_hash = hash_file(gzip_path) - s3_path = self.s3_upload(gzip_path, self.s3_bucket, remote_filename=file_hash+'.tar') - mount.path_on_remote = s3_path - mount.local_file_hash = gzip_path - else: - file_hash = mount.local_file_hash - s3_path = mount.path_on_remote - remote_tar_name = '/tmp/'+file_hash+'.tar' - remote_unpack_name = '/tmp/'+file_hash - sio.write("aws s3 cp {s3_path} {remote_tar_name}\n".format(s3_path=s3_path, remote_tar_name=remote_tar_name)) - sio.write("mkdir -p {local_code_path}\n".format(local_code_path=remote_unpack_name)) - sio.write("tar -xvf {remote_tar_name} -C {local_code_path}\n".format( - local_code_path=remote_unpack_name, - remote_tar_name=remote_tar_name)) - mount_point = os.path.join('/mounts', mount.mount_point.replace('~/','')) - mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name, os.path.basename(mount.local_dir)), mount_point) - if mount.pythonpath: - py_path.append(mount_point) - else: - raise ValueError() - elif isinstance(mount, MountS3): - # In theory the ec2_local_dir could be some random directory, - # but we make it the same as the mount directory for - # convenience. - # - # ec2_local_dir: directory visible to ec2 spot instance - # moint_point: directory visible to docker running inside ec2 - # spot instance - ec2_local_dir = mount.mount_point - s3_path = os.path.join(s3_base_dir, mount.s3_path) - if self.num_exps == 1: - stdout_log_s3_path = os.path.join(s3_path, 'stdout_$EC2_INSTANCE_ID.log') - if not mount.output: - raise NotImplementedError() - local_output_dir_and_s3_path.append( - (ec2_local_dir, s3_path) - ) - sio.write("mkdir -p {remote_dir}\n".format( - remote_dir=ec2_local_dir) - ) - mnt_args += ' -v %s:%s' % (ec2_local_dir, mount.mount_point) - - # Sync interval - sio.write(""" - while /bin/true; do - aws s3 sync --exclude '*' {include_string} {log_dir} {s3_path} - sleep {periodic_sync_interval} - done & echo sync initiated - """.format( - include_string=mount.include_string, - log_dir=ec2_local_dir, - s3_path=s3_path, - periodic_sync_interval=mount.sync_interval - )) - max_sync_interval = max(max_sync_interval, mount.sync_interval) - - # Sync on terminate. This catches the case where the spot - # instance gets terminated before the user script ends. - # - # This is hoping that there's at least 3 seconds between when - # the spot instance gets marked for termination and when it - # actually terminates. - sio.write(""" - while /bin/true; do - if [ -z $(curl -Is http://169.254.169.254/latest/meta-data/spot/termination-time | head -1 | grep 404 | cut -d \ -f 2) ] - then - logger "Running shutdown hook." - aws s3 cp --recursive {log_dir} {s3_path} - aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path} - break - else - # Spot instance not yet marked for termination. - # This is hoping that there's at least 3 seconds - # between when the spot instance gets marked for - # termination and when it actually terminates. - sleep 3 - fi - done & echo log sync initiated - """.format( - log_dir=ec2_local_dir, - s3_path=s3_path, - stdout_log_s3_path=stdout_log_s3_path, - )) - else: - raise NotImplementedError() - - sio.write(""" - while /bin/true; do - aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path} - sleep {periodic_sync_interval} - done & echo sync initiated - """.format( - stdout_log_s3_path=stdout_log_s3_path, - periodic_sync_interval=max_sync_interval - )) - - if self.gpu: - #sio.write('echo "LSMOD NVIDIA:"\n') - #sio.write("lsmod | grep nvidia\n") - #sio.write("echo 'Waiting for dpkg lock...'\n") - # wait for lock - #sio.write(""" - # while sudo fuser /var/lib/dpkg/lock >/dev/null 2>&1; do - # sleep 1 - # done - #""") - #sio.write("sudo apt-get install nvidia-modprobe\n") - #sio.write("wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb\n") - #sio.write("sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb\n") - sio.write(""" - for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done - systemctl start nvidia-docker - """) - sio.write("echo 'Testing nvidia-smi'\n") - sio.write("nvidia-smi\n") - sio.write("echo 'Testing nvidia-smi inside docker'\n") - sio.write("nvidia-docker run --rm {docker_image} nvidia-smi\n".format(docker_image=self.docker_image)) - - if self.checkpoint and self.checkpoint.restore: - raise NotImplementedError() - else: - docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True) - assert self.num_exps > 0 - for _ in range(self.num_exps - 1): - sio.write(docker_cmd+' &\n') - sio.write(docker_cmd+'\n') - - # Sync all output mounts to s3 after running the user script - # Ideally the earlier while loop would be sufficient, but it might be - # the case that the earlier while loop isn't fast enough to catch a - # termination. So, we explicitly sync on termination. - for (local_output_dir, s3_dir_path) in local_output_dir_and_s3_path: - sio.write("aws s3 cp --recursive {local_dir} {s3_dir}\n".format( - local_dir=local_output_dir, - s3_dir=s3_dir_path - )) - - sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format( - stdout_log_s3_path, - )) - - # Wait for last sync - if max_sync_interval > 0: - sio.write("sleep {}\n".format(max_sync_interval + 5)) - - if self.terminate: - sio.write(""" - EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`" - aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region} - """.format(aws_region=self.region)) - sio.write("} >> /home/ubuntu/user_data.log 2>&1\n") - - full_script = dedent(sio.getvalue()) - import boto3 - import botocore - ec2 = boto3.client( - "ec2", - region_name=self.region, - aws_access_key_id=self.credentials.aws_key, - aws_secret_access_key=self.credentials.aws_secret_key, - ) - - if len(full_script) > 10000 or len(base64.b64encode(full_script.encode()).decode("utf-8")) > 10000: - s3_path = self.upload_file_to_s3(full_script, dry=dry) - sio = StringIO() - sio.write("#!/bin/bash\n") - sio.write(""" - aws s3 cp {s3_path} /home/ubuntu/remote_script.sh --region {aws_region} && \\ - chmod +x /home/ubuntu/remote_script.sh && \\ - bash /home/ubuntu/remote_script.sh - """.format(s3_path=s3_path, aws_region=self.s3_bucket_region)) - user_data = dedent(sio.getvalue()) - else: - user_data = full_script - - if verbose: - print(full_script) - with open("/tmp/full_ec2_script", "w") as f: - f.write(full_script) - - instance_args = dict( - ImageId=aws_config["image_id"], - KeyName=aws_config["key_name"], - UserData=user_data, - InstanceType=aws_config["instance_type"], - EbsOptimized=False, - SecurityGroups=aws_config["security_groups"], - SecurityGroupIds=aws_config["security_group_ids"], - NetworkInterfaces=aws_config["network_interfaces"], - IamInstanceProfile=dict( - Name=aws_config["iam_instance_profile_name"], - ), - #**config.AWS_EXTRA_CONFIGS, - ) - if self.extra_ec2_instance_kwargs is not None: - instance_args.update(self.extra_ec2_instance_kwargs) - - if verbose: - print("************************************************************") - print('UserData:', instance_args["UserData"]) - print("************************************************************") - instance_args["UserData"] = base64.b64encode(instance_args["UserData"].encode()).decode("utf-8") - spot_args = dict( - DryRun=dry, - InstanceCount=1, - LaunchSpecification=instance_args, - SpotPrice=aws_config["spot_price"], - # ClientToken=params_list[0]["exp_name"], - ) - - import pprint - - if verbose: - pprint.pprint(spot_args) - if not dry: - response = ec2.request_spot_instances(**spot_args) - print('Launched EC2 job - Server response:') - pprint.pprint(response) - print('*****'*5) - spot_request_id = response['SpotInstanceRequests'][ - 0]['SpotInstanceRequestId'] - for _ in range(10): - try: - ec2.create_tags( - Resources=[spot_request_id], - Tags=[ - {'Key': 'Name', 'Value': exp_name} - ], - ) - break - except botocore.exceptions.ClientError: - continue - - -class EC2AutoconfigDocker(EC2SpotDocker): - def __init__(self, - region='us-west-1', - s3_bucket=None, - image_id=None, - aws_key_name=None, - iam_profile=None, - **kwargs - ): - # find config file - from doodad.ec2.autoconfig import AUTOCONFIG - from doodad.ec2.credentials import AWSCredentials - s3_bucket = AUTOCONFIG.s3_bucket() if s3_bucket is None else s3_bucket - image_id = AUTOCONFIG.aws_image_id(region) if image_id is None else image_id - aws_key_name= AUTOCONFIG.aws_key_name(region) if aws_key_name is None else aws_key_name - iam_profile= AUTOCONFIG.iam_profile_name() if iam_profile is None else iam_profile - credentials=AWSCredentials(aws_key=AUTOCONFIG.aws_access_key(), aws_secret=AUTOCONFIG.aws_access_secret()) - security_group_ids = AUTOCONFIG.aws_security_group_ids()[region] - security_groups = AUTOCONFIG.aws_security_groups() - - super(EC2AutoconfigDocker, self).__init__( - s3_bucket=s3_bucket, - image_id=image_id, - aws_key_name=aws_key_name, - iam_instance_profile_name=iam_profile, - credentials=credentials, - region=region, - security_groups=security_groups, - security_group_ids=security_group_ids, - **kwargs - ) - - -class GCPDocker(DockerMode): - def __init__( - self, - zone="us-east4-a", - gcp_bucket_name=None, - instance_type='n1-standard-4', - image_name=None, - image_project=None, - disk_size:"Gb"=64, - terminate=True, - preemptible=True, - gcp_log_prefix='experiment', - gcp_log_name=None, - gcp_log_path=None, - gpu_kwargs=None, - **kwargs - ): - super(GCPDocker, self).__init__(**kwargs) - assert 'CLOUDSDK_CORE_PROJECT' in os.environ.keys() - self.project = os.environ['CLOUDSDK_CORE_PROJECT'] - self.zone = zone - self.gcp_bucket_name = gcp_bucket_name - self.instance_type = instance_type - self.terminate = terminate - self.disk_size = disk_size - self.image_project = image_project - self.image_name = image_name - self.preemptible = preemptible - - self.gcp_log_prefix = gcp_log_prefix - self.gcp_log_name = gcp_log_name - self.gcp_log_path = gcp_log_path or 'doodad/logs' - if self.gpu: - self.num_gpu = gpu_kwargs['num_gpu'] - self.gpu_model = gpu_kwargs['gpu_model'] - self.gpu_type = get_gpu_type(self.project, self.zone, self.gpu_model) - - import googleapiclient.discovery - self.compute = googleapiclient.discovery.build('compute', 'v1') - - def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): - if self.gcp_log_name is None: - exp_name = "{}-{}".format(self.gcp_log_prefix, EC2SpotDocker.make_timekey(self)) - else: - exp_name = self.gcp_log_name - exp_prefix = self.gcp_log_prefix - gcp_base_dir = os.path.join(self.gcp_log_path, exp_prefix.replace("_", "-"), exp_name) - - mnt_args = '' - py_path = [] - gcp_mount_info = [] - max_sync_interval = 0 - local_mounts = [] - for mount in mount_points: - print('Handling mount: ', mount) - if isinstance(mount, MountLocal): # TODO: these should be mount_s3 objects - if mount.read_only: - if mount.path_on_remote is None: - with mount.gzip() as gzip_file: - gzip_path = os.path.realpath(gzip_file) - file_hash = hash_file(gzip_path) - gcp_path = upload_file_to_gcp_storage( - bucket_name=self.gcp_bucket_name, - file_name=gzip_path, - remote_filename=file_hash+'.tar' - ) - mount.path_on_remote = gcp_path - mount.local_file_hash = file_hash - else: - file_hash = mount.local_file_hash - gcp_path = mount.path_on_remote - remote_unpack_name = '/tmp/'+file_hash - mount_point = os.path.join('/mounts', mount.mount_point.replace('~/','')) - mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name, os.path.basename(mount.local_dir)), mount_point) - if mount.pythonpath: - py_path.append(mount_point) - local_mounts.append(file_hash) - else: - raise ValueError() - elif isinstance(mount, MountGCP): - gcp_local_dir = mount.mount_point - gcp_path = os.path.join(gcp_base_dir, mount.gcp_path) - if not mount.output: - raise NotImplementedError() - gcp_mount_info.append( - (gcp_local_dir, gcp_path, mount.include_string, mount.sync_interval) - ) - mnt_args += ' -v %s:%s' % (gcp_local_dir, mount.mount_point) - else: - raise NotImplementedError() - - docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) - - metadata = { - 'bucket_name': self.gcp_bucket_name, - 'docker_cmd': docker_cmd, - 'docker_image': self.docker_image, - 'local_mounts': json.dumps(local_mounts), - 'gcp_mounts': json.dumps(gcp_mount_info), - 'use_gpu': json.dumps(self.gpu), - 'terminate': json.dumps(self.terminate), - 'startup-script': open(GCP_STARTUP_SCRIPT_PATH, "r").read(), - 'shutdown-script': open(GCP_SHUTDOWN_SCRIPT_PATH, "r").read(), - } - # instance name must match regex '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'"> - unique_name= "doodad" + str(uuid.uuid4()).replace("-", "") - self.create_instance(metadata, unique_name, exp_name, exp_prefix) - if verbose: - print(unique_name) - print(metadata) - - def create_instance(self, metadata, name, exp_name="", exp_prefix=""): - image_response = self.compute.images().get( - project=self.image_project, - image=self.image_name, - ).execute() - source_disk_image = image_response['selfLink'] - config = { - 'name': name, - 'machineType': get_machine_type(self.zone, self.instance_type), - 'disks': [{ - 'boot': True, - 'autoDelete': True, - 'initializeParams': { - 'sourceImage': source_disk_image, - 'diskSizeGb': self.disk_size, - } - }], - 'networkInterfaces': [{ - 'network': 'global/networks/default', - 'accessConfigs': [ - {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'} - ] - }], - 'serviceAccounts': [{ - 'email': 'default', - 'scopes': ['https://www.googleapis.com/auth/cloud-platform'] - }], - 'metadata': { - 'items': [ - {'key': key, 'value': value} - for key, value in metadata.items() - ] - }, - 'scheduling': { - "onHostMaintenance": "terminate", - "automaticRestart": False, - "preemptible": self.preemptible, - }, - "labels": { - "exp_name": exp_name, - "exp_prefix": exp_prefix, - } - } - if self.gpu: - config["guestAccelerators"] = [{ - "acceleratorType": self.gpu_type, - "acceleratorCount": self.num_gpu, - }] - return self.compute.instances().insert( - project=self.project, - zone=self.zone, - body=config - ).execute() - -class CodalabDocker(DockerMode): - def __init__(self): - super(CodalabDocker, self).__init__() - raise NotImplementedError() - - -class SingularityMode(LaunchMode): - def __init__(self, image, gpu=False, pre_cmd=None, - post_cmd=None, skip_wait=False): - super(SingularityMode, self).__init__() - self.singularity_image = image - self.gpu = gpu - self.pre_cmd = pre_cmd - self.post_cmd = post_cmd - self.skip_wait = skip_wait - - def get_singularity_cmd( - self, - main_cmd, - extra_args='', - verbose=True, - pythonpath=None, - ): - cmd_list= CommandBuilder() - if self.pre_cmd: - cmd_list.extend(self.pre_cmd) - - if verbose: - if self.gpu: - cmd_list.append('echo \"Running in singularity (gpu)\"') - else: - cmd_list.append('echo \"Running in singularity\"') - if pythonpath: - cmd_list.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(pythonpath))) - - cmd_list.append(main_cmd) - if self.post_cmd: - cmd_list.extend(self.post_cmd) - - if self.gpu: - extra_args += ' --nv ' - singularity_prefix = 'singularity exec %s %s /bin/bash -c ' % ( - extra_args, - self.singularity_image, - ) - main_cmd = cmd_list.to_string() - full_cmd = singularity_prefix + ("\'%s\'" % main_cmd) - return full_cmd - - -class LocalSingularity(SingularityMode): - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): - py_path = [] - for mount in mount_points: - if isinstance(mount, MountLocal): - if mount.pythonpath: - py_path.append(mount.local_dir) - else: - raise NotImplementedError(type(mount)) - - full_cmd = self.get_singularity_cmd( - cmd, - pythonpath=py_path, - verbose=verbose, - ) - call_and_wait(full_cmd, verbose=verbose, dry=dry, - skip_wait=self.skip_wait) - - -class SlurmSingularity(LocalSingularity): - # TODO: set up an auto-config - def __init__( - self, image, account_name, partition, time_in_mins, - qos=None, - nodes=1, - n_tasks=1, - n_gpus=1, - **kwargs - ): - super(SlurmSingularity, self).__init__(image, **kwargs) - self.account_name = account_name - self.partition = partition - self.time_in_mins = time_in_mins - self.nodes = nodes - self.n_tasks = n_tasks - self.n_gpus = n_gpus - - def create_slurm_command(self, cmd, mount_points=None, verbose=False): - py_path = [] - for mount in mount_points: - if isinstance(mount, MountLocal): - if mount.pythonpath: - py_path.append(mount.local_dir) - else: - raise NotImplementedError(type(mount)) - - singularity_cmd = self.get_singularity_cmd( - cmd, - pythonpath=py_path, - verbose=verbose, - ) - if self.gpu: - full_cmd = ( - "sbatch -A {account_name} -p {partition} -t {time}" - " -N {nodes} -n {n_tasks} --cpus-per-task={cpus_per_task}" - " --gres=gpu:{n_gpus} {cmd}".format( - account_name=self.account_name, - partition=self.partition, - time=self.time_in_mins, - nodes=self.nodes, - n_tasks=self.n_tasks, - cpus_per_task=2*self.n_gpus, - n_gpus=self.n_gpus, - cmd=singularity_cmd, - ) - ) - else: - full_cmd = "sbatch -A {account_name} -p {partition} -t {time} {cmd}".format( - account_name=self.account_name, - partition=self.partition, - time=self.time_in_mins, - cmd=singularity_cmd, - ) - if verbose: - print(full_cmd) - - def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): - full_cmd = self.create_slurm_command( - cmd, mount_points=mount_points, verbose=verbose, - ) - call_and_wait(full_cmd, dry=dry, skip_wait=self.skip_wait) - - -class ScriptSlurmSingularity(SlurmSingularity): - """ - Create or add to a script to run a bunch of slurm jobs. - """ - TMP_FILE = '/tmp/script_to_scp_over.sh' - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_first_time = False - - def set_first_time(self, is_first_time): - self.is_first_time = is_first_time - - def launch_command( - self, - cmd, - dry=False, - mount_points=None, - verbose=False, - ): - full_cmd = self.create_slurm_command( - cmd, mount_points=mount_points, verbose=verbose, - ) - if self.is_first_time: - with open(self.TMP_FILE, "w") as myfile: - myfile.write(full_cmd + '\n') - # make file executable - st = os.stat(self.TMP_FILE) - os.chmod(self.TMP_FILE, st.st_mode | stat.S_IEXEC) - print("Script generated! scp this script over:", self.TMP_FILE) - else: - with open(self.TMP_FILE, "a") as myfile: - myfile.write(full_cmd + '\n') - print("Script updated. scp this script over:", self.TMP_FILE) diff --git a/doodad/mount.py b/doodad/mount.py deleted file mode 100644 index 1b72591..0000000 --- a/doodad/mount.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -These objects are pointers to code/data you wish to give access -to a launched job. - -Each object defines a source and a mount point (where the directory will be visible -to the launched process) - -""" -import os -import tarfile -import tempfile -from contextlib import contextmanager - - -class Mount(object): - """ - Args: - mount_point (str): Location of directory visible to the running process - pythonpath (bool): If True, adds this folder to the $PYTHON_PATH environment variable - output (bool): If False, this is a "code" directory. If True, this should be an empty - "output" directory (nothing will be copied to remote) - """ - def __init__(self, mount_point=None, pythonpath=False, output=False): - self.pythonpath = pythonpath - self.read_only = not output - self.set_mount(mount_point) - self.path_on_remote = None - self.local_file_hash = None - - def set_mount(self, mount_point): - if mount_point: - self.mount_point = mount_point - else: - self.mount_point = mount_point - - -class MountLocal(Mount): - def __init__(self, local_dir, mount_point=None, cleanup=True, - filter_ext=('.pyc', '.log', '.git', '.mp4'), - filter_dir=('data',), - **kwargs): - super(MountLocal, self).__init__(mount_point=mount_point, **kwargs) - self.local_dir = os.path.realpath(os.path.expanduser(local_dir)) - self.local_dir_raw = local_dir - self.cleanup = cleanup - self.filter_ext = filter_ext - self.filter_dir = filter_dir - if mount_point is None: - self.set_mount(local_dir) - self.no_remount = True - else: - self.no_remount = False - #print('local_dir %s, mount_point %s(%s)' % (self.local_dir, self.mount_point, mount_point)) - - def create_if_nonexistent(self): - os.makedirs(self.local_dir, exist_ok=True) - - @contextmanager - def gzip(self): - """ - Return filepath to a gzipped version of this directory for uploading - """ - assert self.read_only - def filter_func(tar_info): - filt = any([tar_info.name.endswith(ext) for ext in self.filter_ext]) or any([ tar_info.name.endswith('/'+ext) for ext in self.filter_dir]) - if filt: - return None - return tar_info - with tempfile.NamedTemporaryFile('wb+', suffix='.tar') as tf: - # make a tar.gzip archive of directory - with tarfile.open(fileobj=tf, mode="w") as tar: - #tar.add(self.local_dir, arcname=os.path.splitext(os.path.basename(tf.name))[0], filter=filter_func) - tar.add(self.local_dir, arcname=os.path.basename(self.local_dir), filter=filter_func) - tf.seek(0) - yield tf.name - - def __str__(self): - return 'MountLocal@%s'%self.local_dir - - def mount_dir(self): - return os.path.join('/mounts', self.mount_point.replace('~/','')) - - -class MountGitRepo(Mount): - def __init__(self, git_url, git_credentials=None, **kwargs): - super(MountGitRepo, self).__init__(read_only=True, **kwargs) - self.git_url = git_url - self.git_credentials = git_credentials - raise NotImplementedError() - - -class MountGCP(Mount): - def __init__(self, gcp_path, gcp_bucket_name, sync_interval=15, output=False, - include_types=('*.txt', '*.csv', '*.json', '*.gz', '*.tar', '*.log', '*.pkl'), **kwargs): - super(MountGCP, self).__init__(**kwargs) - self.gcp_bucket_name = gcp_bucket_name - self.gcp_path = gcp_path - self.output = output - self.sync_interval = sync_interval - self.sync_on_terminate = True - self.include_types = include_types - - def __str__(self): - return 'MountGCP@gcp://%s/%s'% (self.gcp_bucket_name, self.gcp_path) - - @property - def include_string(self): - return ' '.join(['--include \'%s\''%type_ for type_ in self.include_types]) - -class MountS3(Mount): - def __init__(self, s3_path, s3_bucket=None, sync_interval=15, output=False, - include_types=('*.txt', '*.csv', '*.json', '*.gz', '*.tar', '*.log', '*.pkl'), **kwargs): - super(MountS3, self).__init__(**kwargs) - if s3_bucket is None: - # load from config - from doodad.ec2.autoconfig import AUTOCONFIG - s3_bucket = AUTOCONFIG.s3_bucket() - self.s3_bucket = s3_bucket - self.s3_path = s3_path - self.output = output - self.sync_interval = sync_interval - self.sync_on_terminate = True - self.include_types = include_types - - def __str__(self): - return 'MountS3@s3://%s/%s'% (self.s3_bucket, self.s3_path) - - @property - def include_string(self): - return ' '.join(['--include \'%s\''%type_ for type_ in self.include_types]) diff --git a/doodad/relaunch.py b/doodad/relaunch.py deleted file mode 100644 index 63dd207..0000000 --- a/doodad/relaunch.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Support for relaunching a run from a checkpoint -if a run terminates prematurely. - -Notes to self for docker checkpoints: - -- Need to enable experimental mode on docker (run ENABLE_CRIU_CMD) -- The docker process cannot be in interactive mode or use tty -- Checkpointing will stop the docker image and save a bunch of files to disk. You need to restart it to keep going. - -""" -import uuid - -INSTALL_CRIU_CMD = 'apt-get install -y criu' -ENABLE_CRIU_CMD = 'echo "{\"experimental\": true}" >> /etc/docker/daemon.json; systemctl restart docker' - -def checkpoint_cmd(docker_name, chk_name, chk_dir='/docker_checkpoints'): - return 'docker checkpoint create --checkpoint-dir=%s %s %s' % (chk_dir, docker_name, chk_name) - -def checkpoint_restore_cmd(docker_name, checkpoint_name, chk_dir='/docker_checkpoints'): - return 'docker start --checkpoint-dir=%s --checkpoint=%s %s' % (chk_dir, checkpoint_name, docker_name) - - -class CheckpointManager(object): - def __init__(self, restore=False, checkpoint_dir='/docker_checkpoints'): - self.checkpoint_name = uuid.uuid4() - self.checkpoint_dir = checkpoint_dir - self.restore = restore - - def checkpoint_and_tar_cmd(self, docker_name, tar_name, restart=True): - cmds = [] - checkpoint_name = self.checkpoint_name - chk_cmd = checkpoint_cmd(docker_name, checkpoint_name, chk_dir=self.checkpoint_dir) - cmds.append(chk_cmd) - - chk_dir = os.path.join(self.checkpoint_dir, checkpoint_name) - tar_cmd = 'tar -cvf %s %s ' % (tar_name, chk_dir) - cmds.append(tar_cmd) - - if restart: - restart_cmd = checkpoint_restore_cmd(docker_name, checkpoint_name, chk_dir=self.checkpoint_dir) - cmds.append(restart_cmd) - return ';'.join(cmds) - - def checkpoint_tar_loop_cmd(self, docker_name, tar_name, wait_interval=1): - chk_tar_cmd = self.checkpoint_and_tar_cmd(docker_name, tar_name) - """ - while /bin/true; do - {chk_tar_cmd} - sleep {wait_interval} - done & echo sync initiated - """.format(wait_interval=wait_interval, chk_tar_cmd=chk_tar_cmd, - diff --git a/doodad/ssh/__init__.py b/doodad/ssh/__init__.py deleted file mode 100644 index 95ed204..0000000 --- a/doodad/ssh/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .credentials import * diff --git a/doodad/ssh/credentials.py b/doodad/ssh/credentials.py deleted file mode 100644 index 73811e7..0000000 --- a/doodad/ssh/credentials.py +++ /dev/null @@ -1,66 +0,0 @@ -import os - -class SSHCredentials(object): - """ - Container for SSH credentials - - Args: - hostname (str): - username (str): - password (str, optional): - Authenticate via plain-text password. This features requires the 'sshpass' program to be installed. - This usage is not suggested due to security reasons. - identity_file (str, optional): - Path to a private key file for SSL public key authentication - """ - def __init__(self, hostname=None, username=None, password=None, identity_file=None): - assert password is not None or identity_file is not None, "One of password or identity_file must be provided" - self.hostname = hostname - self.username = username - self.password = password - self.identity_file = os.path.expanduser(identity_file) - - def get_ssh_cmd_prefix(self): - """ - Return a command prefix - Ex. - 'ssh user@host -i id_file ' - """ - cmd = 'ssh %s@%s' % (self.username, self.hostname) - if self.identity_file: - cmd += ' -i %s' % self.identity_file - elif self.password: - cmd = 'sshpass -p \'%s\' %s' % (self.password, cmd) - print('WARNING: Using password-based ssh is not secure! Please consider using identity files.') - else: - raise NotImplementedError() - return cmd + ' ' - - def get_ssh_bash_cmd(self, cmd): - prefix = self.get_ssh_cmd_prefix() - return prefix + " '%s'"%cmd - - def get_ssh_script_cmd(self, script_name): - cmd = 'ssh %s@%s' % (self.username, self.hostname) - if self.identity_file: - cmd += ' -i %s' % self.identity_file - else: - raise NotImplementedError() - cmd += " 'bash -s' < %s" % script_name - return cmd - - def get_scp_cmd(self, source, destination, recursive=True): - cmd = 'scp' - if recursive: - cmd += ' -r' - if self.identity_file: - cmd += ' -i %s' % self.identity_file - else: - raise NotImplementedError() - cmd += ' %s' % source - cmd += ' %s@%s:%s' % (self.username, self.hostname, destination) - return cmd - - @property - def user_host(self): - return '%s@%s' % (self.username, self.hostname) diff --git a/doodad/utils.py b/doodad/utils.py deleted file mode 100644 index 1989239..0000000 --- a/doodad/utils.py +++ /dev/null @@ -1,89 +0,0 @@ -import hashlib -import os -import subprocess -import contextlib -import tempfile - -THIS_FILE_DIR = os.path.dirname(os.path.realpath(__file__)) -REPO_DIR = os.path.dirname(THIS_FILE_DIR) -EXAMPLES_DIR = os.path.join(REPO_DIR, 'examples') - -HASH_BUF_SIZE = 65536 - -def hash_file(filename): - hasher = hashlib.md5() - with open(filename, 'rb') as f: - while True: - data = f.read(HASH_BUF_SIZE) - if not data: - break - hasher.update(data) - return hasher.hexdigest() - - -def call_and_wait(cmd, verbose=False, dry=False, skip_wait=False): - if dry or verbose: - print(cmd) - if not dry: - p = subprocess.Popen(cmd, shell=True) - if skip_wait: - return - try: - p.wait() - except KeyboardInterrupt: - try: - print("terminating") - p.terminate() - except OSError: - print("os error!") - pass - p.wait() - - -class CommandBuilder(object): - def __init__(self): - self.cmds = [] - - def add_command(self, cmd): - self.cmds.append(cmd) - - def append(self, cmd): - self.add_command(cmd) - - def extend(self, other): - if isinstance(other, CommandBuilder): - self.cmds.extend(other.cmds) - else: - self.cmds.extend(other) - - def to_string(self, separator=';'): - return ';'.join([str(cmd) for cmd in self.cmds]) - - def __str__(self): - return self.to_string() - - def __iter__(self): - for cmd in self.cmds: - yield cmd - - def call_and_wait(self, verbose=False, dry=False, skip_wait=False): - return call_and_wait( - self.to_string(), - verbose=verbose, - dry=dry, - skip_wait=skip_wait, - ) - - @contextlib.contextmanager - def as_script(self, suffix='.sh'): - """ - Usage: - with cmd_builder.as_script() as fname: - # do stuff with fname - """ - with tempfile.NamedTemporaryFile(suffix=suffix, mode='w+') as f: - for cmd in self.cmds: - f.write(cmd+'\n') - f.seek(0) - yield f.name - diff --git a/examples/.gitignore b/examples/.gitignore deleted file mode 100644 index 442d515..0000000 --- a/examples/.gitignore +++ /dev/null @@ -1 +0,0 @@ -tmp_output/ diff --git a/examples/docker_checkpoint/app_main.py b/examples/docker_checkpoint/app_main.py deleted file mode 100644 index 24651a0..0000000 --- a/examples/docker_checkpoint/app_main.py +++ /dev/null @@ -1,21 +0,0 @@ -import os -import time -import subprocess - -import doodad as dd - -print('Launching app_main!') - -# These are arguments passed in from launch_python -args_dict = dd.get_args() -print('My args are:', args_dict) - -k = 0 -while True: - k += 1 - subprocess.call('echo %d' % k, shell=True) - time.sleep(1.0) - -# Test proper mounting -print('Done!') - diff --git a/examples/docker_checkpoint/launch.py b/examples/docker_checkpoint/launch.py deleted file mode 100644 index 8410734..0000000 --- a/examples/docker_checkpoint/launch.py +++ /dev/null @@ -1,41 +0,0 @@ -import os - -import doodad as dd -import doodad.ec2 as ec2 -import doodad.ssh as ssh -import doodad.mount as mount -from doodad.utils import EXAMPLES_DIR, REPO_DIR - - -# Local run -mode_local = dd.mode.Local() - -# Local docker -mode_docker = dd.mode.LocalDocker( - image='python:3.5', -) - -# or this! Run experiment via docker on another machine through SSH -mode_ssh = dd.mode.SSHDocker( - image='python:3.5', - credentials=ssh.SSHCredentials(hostname='my.machine.name', username='my_username', identity_file='~/.ssh/id_rsa'), -) - -MY_RUN_MODE = mode_docker # CHANGE THIS - -# Set up code and output directories -mounts = [ - mount.MountLocal(local_dir=REPO_DIR, pythonpath=True), # Code -] - - -THIS_FILE_DIR = os.path.realpath(os.path.dirname(__file__)) -dd.launch_python( - target=os.path.join(THIS_FILE_DIR, 'app_main.py'), # point to a target script. If running remotely, this will be copied over - mode=MY_RUN_MODE, - mount_points=mounts, - args={ - 'arg1': 50, - } -) - diff --git a/examples/docker_newton_example.py b/examples/docker_newton_example.py deleted file mode 100644 index 20f6bba..0000000 --- a/examples/docker_newton_example.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Example script for using newton machines via docker + rllab -""" - -import doodad as dd -import doodad.ssh as ssh -import doodad.mount as mount - -MY_USERNAME = 'justin' - -# Use local mode to test code -mode_local = dd.mode.LocalDocker( - image='justinfu/rl_base:0.1' -) - -# Use docker mode to launch jobs on newton machine -mode_ssh = dd.mode.SSHDocker( - image='justinfu/rl_base:0.1', - credentials=ssh.SSHCredentials(hostname='newton2.banatao.berkeley.edu', - username='rail', identity_file='path/to/identity'), -) - -# Set up code and output directories -OUTPUT_DIR = '/mount/outputs' # this is the directory visible to the target script inside docker -mounts = [ - mount.MountLocal(local_dir='~/install/rllab', pythonpath=True), # point to your rllab - mount.MountLocal(local_dir='~/install/gym/.mujoco', mount_point='/root/.mujoco'), # point to your mujoco - - # this output directory will be visible on the remote machine - # TODO: this directory will have root permissions. For now you need to scp your data inside your script. - mount.MountLocal(local_dir='~/data/%s' % MY_USERNAME, mount_point=OUTPUT_DIR, output=True), -] - -pd.launch_python( - target='path/to/script.py', # point to a target script (absolute path). - mode=mode_ssh, - mount_points=mounts, -) diff --git a/examples/ec2_launch/app_main.py b/examples/ec2_launch/app_main.py deleted file mode 100644 index 59a3bf1..0000000 --- a/examples/ec2_launch/app_main.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -import doodad as dd - -import secretlib - -print('Launching app_main!') - -# These are arguments passed in from launch_python -args_dict = dd.get_args() -print('My args are:', args_dict) - -# Test proper mounting -out_dir = args_dict['output_dir'] -print('Writing secret (%s) to output dir (%s)' % (secretlib.SECRET, os.path.realpath(out_dir))) -with open( os.path.join(out_dir, 'my_secret.txt'), 'w') as f: - f.write(secretlib.SECRET) -print('Done!') - diff --git a/examples/ec2_launch/ec2_launch_test.py b/examples/ec2_launch/ec2_launch_test.py deleted file mode 100644 index 27d669e..0000000 --- a/examples/ec2_launch/ec2_launch_test.py +++ /dev/null @@ -1,59 +0,0 @@ -import os - -import doodad as dd -import doodad.ec2 as ec2 -import doodad.ssh as ssh -import doodad.mount as mount -from doodad.utils import EXAMPLES_DIR, REPO_DIR - - -# Local docker -mode_docker = dd.mode.LocalDocker( - image='python:3.5', -) - -# or this! Run experiment via docker on another machine through SSH -mode_ssh = dd.mode.SSHDocker( - image='python:3.5', - credentials=ssh.SSHCredentials(hostname='my.machine.name', username='my_username', identity_file='~/.ssh/id_rsa'), -) - -# or use this! -mode_ec2=None -#mode_ec2 = dd.mode.EC2AutoconfigDocker( -# image='python:3.5', -# region='us-west-1', -# instance_type='m3.medium', -# spot_price=0.02, -#) - -MY_RUN_MODE = mode_docker # CHANGE THIS - -# Set up code and output directories -OUTPUT_DIR = '/example/outputs' # this is the directory visible to the target -mounts = [ - mount.MountLocal(local_dir=REPO_DIR, pythonpath=True), # Code - mount.MountLocal(local_dir=os.path.join(EXAMPLES_DIR, 'secretlib'), pythonpath=True), # Code -] - -if MY_RUN_MODE == mode_ec2: - output_mount = mount.MountS3(s3_path='outputs', mount_point=OUTPUT_DIR, output=True) # use this for ec2 -else: - output_mount = mount.MountLocal(local_dir=os.path.join(EXAMPLES_DIR, 'tmp_output'), - mount_point=OUTPUT_DIR, output=True) -mounts.append(output_mount) - -print(mounts) - -THIS_FILE_DIR = os.path.realpath(os.path.dirname(__file__)) -dd.launch_python( - target=os.path.join(THIS_FILE_DIR, 'app_main.py'), # point to a target script. If running remotely, this will be copied over - mode=MY_RUN_MODE, - mount_points=mounts, - args={ - 'arg1': 50, - 'arg2': 25, - 'output_dir': OUTPUT_DIR, - } -) - diff --git a/examples/secretlib/secretlib/__init__.py b/examples/secretlib/secretlib/__init__.py deleted file mode 100644 index aa4938d..0000000 --- a/examples/secretlib/secretlib/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - -SECRET = 'randomlib_secret_message123' diff --git a/hyper_viz/base.py b/hyper_viz/base.py deleted file mode 100644 index d81c0ab..0000000 --- a/hyper_viz/base.py +++ /dev/null @@ -1,94 +0,0 @@ -import numpy as np -from collections import defaultdict -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D - -class Experiment(object): - def __init__(self, params_dict, values, performance=0.0): - self.params = params_dict - self.values = values - self.performance = performance - - def satisfies_param(param_k, param_v): - return self.params[param_k] == param_v - - -def find_unique_params(experiments): - params_dict = defaultdict(set) - for experiment in experiments: - exp_params = experiment.params - for k in exp_params: - v = exp_params[k] - if isinstance(v, set): - raise NotImplementedError() - params_dict[k].add(v) - for k in list(params_dict.keys()): - if len(params_dict[k]) == 1: - del params_dict[k] - return params_dict - - -def make_3d_plot(experiments, xkey, ykey, logx=True, logy=True): - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - z = [exp.performance for exp in experiments] - x = [exp.params[xkey] for exp in experiments] - y = [exp.params[ykey] for exp in experiments] - ax.set_xlabel(xkey) - ax.set_ylabel(ykey) - ax.set_zlabel('Performance') - - if logx: - x = np.log(np.array(x)+1e-5)/np.log(10) - ax.set_xlabel('log '+xkey) - if logy: - y = np.log(np.array(y)+1e-5)/np.log(10) - ax.set_ylabel('log '+ykey) - - ax.scatter(x, y, z) - plt.show() - - -def resize_ticks(data, maxlen): - ldata = len(data) - ticks_per_data = maxlen/ldata - resized = [] - j = 0 - for i in range(maxlen): - if (i%ticks_per_data == 0): - resized.append(data[j]) - j+=1 - else: - resized.append('') - return resized - -def make_2d_plot(experiments, xkey, ykey, title=None): - fig = plt.figure() - ax = fig.add_subplot(111) - z = [exp.performance for exp in experiments] - x = [exp.params[xkey] for exp in experiments] - y = [exp.params[ykey] for exp in experiments] - ax.set_xlabel(xkey) - ax.set_ylabel(ykey) - x_vals_uniq = sorted(list(set([exp.params[xkey] for exp in experiments]))) - y_vals_uniq = sorted(list(set([exp.params[ykey] for exp in experiments]))) - - xticks = ax.get_xticks().tolist() - ax.set_xticklabels(resize_ticks(x_vals_uniq, len(xticks))) - yticks = ax.get_yticks().tolist() - ax.set_yticklabels(resize_ticks(y_vals_uniq, len(yticks))) - - data = np.zeros((len(x_vals_uniq), len(y_vals_uniq))) - for i in range(len(x_vals_uniq)): - for j in range(len(y_vals_uniq)): - exps = [exp for exp in experiments if (exp.params[xkey] == x_vals_uniq[i]) and (exp.params[ykey] == y_vals_uniq[j])] - avg_perf = np.mean([exp.performance for exp in exps]) - data[i,j] = avg_perf - ax.imshow(data.T, cmap='YlOrRd') - for i in range(len(x_vals_uniq)): - for j in range(len(y_vals_uniq)): - ax.annotate('%.2f'%data[i,j], xy=(i-0.3, j)) - - if title is not None: - plt.title(title) - plt.show() diff --git a/hyper_viz/hyper_viz.py b/hyper_viz/hyper_viz.py deleted file mode 100644 index e73c39a..0000000 --- a/hyper_viz/hyper_viz.py +++ /dev/null @@ -1,12 +0,0 @@ -import argparse - -from rllab_interface import get_experiments -from base import * - -if __name__ == "__main__": - #exps = get_experiments('hopper_monotone', perf_key='Returns Average') - exps = get_experiments('walker_09_06_17', perf_key='Returns Average') - env_name = exps[0].params['env_params:gym_name'] - make_2d_plot(exps, xkey='algo_params:n_updates_per_time_step', ykey='algo_params:monotone_constraint_wt', - title=env_name) - diff --git a/hyper_viz/rllab_interface.py b/hyper_viz/rllab_interface.py deleted file mode 100644 index 6c29e32..0000000 --- a/hyper_viz/rllab_interface.py +++ /dev/null @@ -1,62 +0,0 @@ -import csv -import numpy as np -import os -import json -from collections import defaultdict - -from base import Experiment - -N_PERF = 5 - - -def get_experiments(dirname, perf_key='AverageReturn'): - # look recursively for directories containing a params.json file - print('get_experiments looking in %s' % dirname) - lsdir = list(os.listdir(dirname)) - if any(params_file in lsdir for params_file in ['variant.json', 'params.json']): - print('\t Found experiment directory') - params_file_name = 'params.json' - if 'variant.json' in lsdir: - params_file_name = 'variant.json' - return [parse_exp_dir(dirname, params_file_name, perf_key=perf_key)] - else: - exps = [] - for item in lsdir: - full_path = os.path.join(dirname, item) - if os.path.isdir(full_path): - exps.extend(get_experiments(full_path)) - return exps - - -def flatten_kv_dict(orig_dict, join_char=':'): - """ - >>> flatten_kv_dict({'a': {'b': 2, 'c': 3}, 'd': 4}) - """ - flat_dict = {} - for k in orig_dict: - v = orig_dict[k] - if isinstance(v, dict): - flattened_dict = flatten_kv_dict(v, join_char=join_char) - for k_ in flattened_dict: - flat_dict['%s%s%s' % (k, join_char, k_)] = flattened_dict[k_] - else: - flat_dict[k] = v - return flat_dict - - -def parse_exp_dir(dirname, params_file_name, perf_key='AverageReturn'): - with open(os.path.join(dirname, params_file_name)) as f: - params_dict = json.loads(f.read()) - params_dict = flatten_kv_dict(params_dict) - with open(os.path.join(dirname, 'progress.csv')) as f: - readCSVFile = csv.reader(f, delimiter=',') - for i, row in enumerate(readCSVFile): - if i==0: - headers = row - keyValueMap = [[] for _ in range(len(row))] - else: - for j, rowItem in enumerate(row): - keyValueMap[j].append(float(rowItem)) - keyValueMap = dict(zip(headers, keyValueMap)) - return Experiment(params_dict, keyValueMap, performance=np.mean(keyValueMap[perf_key][-N_PERF:])) - diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 87dd31a..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -boto3 -boto -cloudpickle -awscli diff --git a/scripts/gcp/gcp_shutdown_script.sh b/scripts/gcp/gcp_shutdown_script.sh deleted file mode 100644 index 99cf3ff..0000000 --- a/scripts/gcp/gcp_shutdown_script.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -query_metadata() { - attribute_name=$1 - curl http://metadata/computeMetadata/v1/instance/attributes/$attribute_name -H "Metadata-Flavor: Google" -} - -bucket_name=$(query_metadata bucket_name) -gcp_mounts=$(query_metadata gcp_mounts) -instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") - -num_gcp_mounts=$(jq length <<< $gcp_mounts) -for ((i=0;i<$num_gcp_mounts;i++)); do - gcp_mount_info=$(jq .[$i] <<< $gcp_mounts) - # assume gcp_mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple - local_path=$(jq .[0] <<< $gcp_mount_info | tr -d '"') - gcp_bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') - gsutil -m rsync -r $local_path gs://$bucket_name/$gcp_bucket_path -done - -gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log diff --git a/scripts/gcp/gcp_startup_script.sh b/scripts/gcp/gcp_startup_script.sh deleted file mode 100644 index 9338d87..0000000 --- a/scripts/gcp/gcp_startup_script.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -install_docker() { - sudo apt-get install -y --no-install-recommends \ - apt-transport-https \ - curl \ - software-properties-common - curl -fsSL 'https://sks-keyservers.net/pks/lookup?op=get&search=0xee6d536cf7dc86e2d7d56f59a178ac6c6238f52e' | sudo apt-key add - - sudo add-apt-repository \ - "deb https://packages.docker.com/1.12/apt/repo/ \ - ubuntu-$(lsb_release -cs) \ - main" - sudo apt-get update - sudo apt-get -y install docker-engine - sudo usermod -a -G docker ubuntu -} - -query_metadata() { - attribute_name=$1 - curl http://metadata/computeMetadata/v1/instance/attributes/$attribute_name -H "Metadata-Flavor: Google" -} - -{ - bucket_name=$(query_metadata bucket_name) - docker_cmd=$(query_metadata docker_cmd) - docker_image=$(query_metadata docker_image) - local_mounts=$(query_metadata local_mounts) - gcp_mounts=$(query_metadata gcp_mounts) - use_gpu=$(query_metadata use_gpu) - terminate=$(query_metadata terminate) - instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") - echo "bucket_name:" $bucket_name - echo "docker_cmd:" $docker_cmd - echo "docker_image:" $docker_image - echo "local_mounts:" $local_mounts - echo "gcp_mounts:" $gcp_mounts - echo "use_gpu:" $use_gpu - echo "terminate:" $terminate - echo "instance_name:" $instance_name - - sudo apt-get update - #install_docker - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do - sleep 1 - done - sudo apt-get install -y jq git unzip - die() { status=$1; shift; echo "FATAL: $*"; exit $status; } - service docker start - docker --config /home/ubuntu/.docker pull $docker_image - - num_local_mounts=$(jq length <<< $local_mounts) - for ((i=0;i<$num_local_mounts;i++)); do - local_mount=$(jq .[$i] <<< $local_mounts | tr -d '"') - echo "Mounting " $local_mount - gsutil cp gs://$bucket_name/doodad/mount/$local_mount.tar /tmp/$local_mount.tar - mkdir -p /tmp/$local_mount - tar -xvf /tmp/$local_mount.tar -C /tmp/$local_mount - done - - num_gcp_mounts=$(jq length <<< $gcp_mounts) - for ((i=0;i<$num_gcp_mounts;i++)); do - gcp_mount_info=$(jq .[$i] <<< $gcp_mounts) - # assume _mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple - local_path=$(jq .[0] <<< $gcp_mount_info | tr -d '"') - gcp_bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') - include_string=$(jq .[2] <<< $gcp_mount_info | tr -d '"') - periodic_sync_interval=$(jq .[3] <<< $gcp_mount_info | tr -d '"') - while /bin/true; do - gsutil -m rsync -r $local_path gs://$bucket_name/$gcp_bucket_path - sleep $periodic_sync_interval - done & echo sync from $local_path to gs://$bucket_name/$gcp_bucket_path initiated - done - while /bin/true; do - gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log - sleep 300 - done & - - if [ "$use_gpu" = "true" ]; then - for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done - systemctl start nvidia-docker - echo 'Testing nvidia-smi' - nvidia-smi - echo 'Testing nvidia-smi inside docker' - nvidia-docker run --rm $docker_image nvidia-smi - fi - - echo $docker_cmd >> run_docker_command.sh - bash run_docker_command.sh - - if [ "$terminate" = "true" ]; then - echo "Finished experiment. Terminating" - zone=$(curl http://metadata/computeMetadata/v1/instance/zone -H "Metadata-Flavor: Google") - zone="${zone##*/}" - gcloud compute instances delete $instance_name --zone $zone --quiet - fi -} >> /home/ubuntu/user_data.log 2>&1 diff --git a/scripts/pull_s3_logs.py b/scripts/pull_s3_logs.py deleted file mode 100755 index e4dd705..0000000 --- a/scripts/pull_s3_logs.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -import subprocess -import argparse - -def aws_sync(bucket_name, s3_log_dir, target_dir, exclude='*.pkl'): - cmd = 'aws s3 sync s3://%s/doodad/logs/%s %s --exclude %s' % (bucket_name, s3_log_dir, target_dir, exclude) - subprocess.call(cmd, shell=True) - -def main(): - - parser = argparse.ArgumentParser(description='Process some integers.') - parser.add_argument('log_dir', type=str, help='S3 Log dir') - parser.add_argument('-b', '--bucket', type=str, default='doodad', help='S3 Bucket') - parser.add_argument('-e', '--exclude', type=str, default='*.pkl', help='Exclude') - - args = parser.parse_args() - s3_log_dir = args.log_dir - os.makedirs(s3_log_dir, exist_ok=True) - aws_sync(args.bucket, s3_log_dir, s3_log_dir, exclude=args.exclude) - -if __name__ == "__main__": - main() diff --git a/scripts/run_experiment_lite_doodad.py b/scripts/run_experiment_lite_doodad.py deleted file mode 100644 index 9f7d7f4..0000000 --- a/scripts/run_experiment_lite_doodad.py +++ /dev/null @@ -1,9 +0,0 @@ -import doodad -try: - import cloudpickle -except ImportError as e: - raise ImportError("cloudpickle must be installed inside the docker image") -def failure(): - raise ValueError("Must provide run_method via doodad args!") -fn = doodad.get_args('run_method', failure) -fn() diff --git a/scripts/setup_ec2.py b/scripts/setup_ec2.py deleted file mode 100644 index 45ed52e..0000000 --- a/scripts/setup_ec2.py +++ /dev/null @@ -1,395 +0,0 @@ -""" -AWS Setup script - -Based on rllab's setup_ec2 -""" - -import boto3 -import re -import sys -import json -import botocore -import os - -from string import Template -from collections import OrderedDict -from boto.s3.connection import Location - -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -REPO_DIR = os.path.dirname(SCRIPT_DIR) -CONFIG_DIR = os.path.join(REPO_DIR, 'aws_config') - -ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", None) -if ACCESS_KEY is None: - raise ValueError('Please set the $AWS_ACCESS_KEY environment variable') -ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", None) -if ACCESS_SECRET is None: - raise ValueError('Please set the $AWS_ACCESS_SECRET environment variable') -S3_BUCKET_NAME = os.environ.get("DOODAD_S3_BUCKET", None) -if S3_BUCKET_NAME is None: - raise ValueError('Please set the $DOODAD_S3_BUCKET environment variable') -PREFIX = os.environ.get("RLLAB_PREFIX", "") - -SECURITY_GROUP_NAME = PREFIX + "doodad-sg" -INSTANCE_PROFILE_NAME = PREFIX + "doodad" -INSTANCE_ROLE_NAME = PREFIX + "doodad" - -ALL_REGION_AWS_SECURITY_GROUP_IDS = {} -ALL_REGION_AWS_KEY_NAMES = {} - -ALL_SUBNET_INFO = {} - -REGIONS = [ - "ap-northeast-1", - "ap-northeast-2", - "ap-south-1", - "ap-southeast-1", - "ap-southeast-2", - "eu-central-1", - "eu-west-1", - "sa-east-1", - "us-east-1", - "us-east-2", - "us-west-1", - "us-west-2", -] - -INI_FILE_TEMPLATE = Template(""" -[default] -iam_instance_profile_name=$instance_profile_name -aws_security_groups=$security_group_name -s3_bucket_name=$s3_bucket_name -aws_access_key=$aws_access_key -aws_access_secret=$aws_access_secret - -[aws_image_ids] -ap-northeast-1=ami-c42689a5 -ap-northeast-2=ami-865b8fe8 -ap-south-1=ami-ea9feb85 -ap-southeast-1=ami-c74aeaa4 -ap-southeast-2=ami-0792ae64 -eu-central-1=ami-f652a999 -eu-west-1=ami-8c0a5dff -sa-east-1=ami-3f2cb053 -us-east-1=ami-de5171c9 -us-east-2=ami-e0481285 -us-west-1=ami-efb5ff8f -us-west-2=ami-53903033 - -[aws_key_names] -$all_region_aws_key_names - -[aws_security_group_ids] -$all_region_aws_security_group_ids - -[subnet_info] -$all_subnet_info -""") - - -def setup_iam(): - iam_client = boto3.client( - "iam", - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=ACCESS_SECRET, - ) - iam = boto3.resource('iam', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=ACCESS_SECRET) - - # delete existing role if it exists - try: - existing_role = iam.Role(INSTANCE_ROLE_NAME) - existing_role.load() - # if role exists, delete and recreate - response = query_yes_no( - "There is an existing role named %s. Proceed to delete everything and recreate?" % - INSTANCE_ROLE_NAME, - default="no", allow_skip=True) - if response == "skip": - return - elif not response: - sys.exit() - else: - pass - print("Listing instance profiles...") - inst_profiles = existing_role.instance_profiles.all() - for prof in inst_profiles: - for role in prof.roles: - print("Removing role %s from instance profile %s" % (role.name, prof.name)) - prof.remove_role(RoleName=role.name) - print("Deleting instance profile %s" % prof.name) - prof.delete() - for policy in existing_role.policies.all(): - print("Deleting inline policy %s" % policy.name) - policy.delete() - for policy in existing_role.attached_policies.all(): - print("Detaching policy %s" % policy.arn) - existing_role.detach_policy(PolicyArn=policy.arn) - print("Deleting role") - existing_role.delete() - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == 'NoSuchEntity': - pass - else: - raise e - - print("Creating role %s " % INSTANCE_ROLE_NAME) - iam_client.create_role( - Path='/', - RoleName=INSTANCE_ROLE_NAME, - AssumeRolePolicyDocument=json.dumps({'Version': '2012-10-17', 'Statement': [ - {'Action': 'sts:AssumeRole', 'Effect': 'Allow', 'Principal': {'Service': 'ec2.amazonaws.com'}}]}) - ) - - role = iam.Role(INSTANCE_ROLE_NAME) - print("Attaching policies") - role.attach_policy(PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess') - role.attach_policy(PolicyArn='arn:aws:iam::aws:policy/ResourceGroupsandTagEditorFullAccess') - - print("Creating inline policies") - iam_client.put_role_policy( - RoleName=role.name, - PolicyName='CreateTags', - PolicyDocument=json.dumps({ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": ["ec2:CreateTags"], - "Resource": ["*"] - } - ] - }) - ) - iam_client.put_role_policy( - RoleName=role.name, - PolicyName='TerminateInstances', - PolicyDocument=json.dumps({ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "Stmt1458019101000", - "Effect": "Allow", - "Action": [ - "ec2:TerminateInstances" - ], - "Resource": [ - "*" - ] - } - ] - }) - ) - - print("Creating instance profile %s" % INSTANCE_PROFILE_NAME) - iam_client.create_instance_profile( - InstanceProfileName=INSTANCE_PROFILE_NAME, - Path='/' - ) - print("Adding role %s to instance profile %s" % (INSTANCE_ROLE_NAME, INSTANCE_PROFILE_NAME)) - iam_client.add_role_to_instance_profile( - InstanceProfileName=INSTANCE_PROFILE_NAME, - RoleName=INSTANCE_ROLE_NAME - ) - - -def setup_s3(): - print("Creating S3 bucket at s3://%s" % S3_BUCKET_NAME) - s3_client = boto3.client( - "s3", - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=ACCESS_SECRET, - ) - try: - s3_client.create_bucket( - ACL='private', - Bucket=S3_BUCKET_NAME, - CreateBucketConfiguration={ - 'LocationConstraint': 'us-west-1'} - ) - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == 'BucketAlreadyExists': - raise ValueError("Bucket %s already exists. Please reconfigure S3_BUCKET_NAME" % S3_BUCKET_NAME) from e - elif e.response['Error']['Code'] == 'BucketAlreadyOwnedByYou': - print("Bucket already created by you") - else: - raise e - print("S3 bucket created") - - -def setup_ec2(): - for region in REGIONS: - print("Setting up region %s" % region) - - ec2 = boto3.resource( - "ec2", - region_name=region, - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=ACCESS_SECRET, - ) - ec2_client = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=ACCESS_SECRET, - ) - existing_vpcs = list(ec2.vpcs.all()) - assert len(existing_vpcs) >= 1 - vpc = existing_vpcs[0] - print("Creating security group in VPC %s" % str(vpc.id)) - try: - security_group = vpc.create_security_group( - GroupName=SECURITY_GROUP_NAME, Description='Security group for doodad' - ) - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == 'InvalidGroup.Duplicate': - sgs = list(vpc.security_groups.filter(GroupNames=[SECURITY_GROUP_NAME])) - security_group = sgs[0] - else: - raise e - - ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id] - - ec2_client.create_tags(Resources=[security_group.id], Tags=[{'Key': 'Name', 'Value': SECURITY_GROUP_NAME}]) - try: - security_group.authorize_ingress(FromPort=22, ToPort=22, IpProtocol='tcp', CidrIp='0.0.0.0/0') - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == 'InvalidPermission.Duplicate': - pass - else: - raise e - print("Security group created with id %s" % str(security_group.id)) - - key_name = PREFIX + ('doodad-%s' % region) - try: - print("Trying to create key pair with name %s" % key_name) - key_pair = ec2_client.create_key_pair(KeyName=key_name) - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate': - if not query_yes_no("Key pair with name %s exists. Proceed to delete and recreate?" % key_name, "no"): - sys.exit() - print("Deleting existing key pair with name %s" % key_name) - ec2_client.delete_key_pair(KeyName=key_name) - print("Recreating key pair with name %s" % key_name) - key_pair = ec2_client.create_key_pair(KeyName=key_name) - else: - raise e - - key_pair_folder_path = os.path.join(CONFIG_DIR, "private", "key_pairs") - file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name) - - print("Saving keypair file") - os.makedirs(key_pair_folder_path, exist_ok=True) - with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600), 'w') as handle: - handle.write(key_pair['KeyMaterial'] + '\n') - - # adding pem file to ssh - # os.system("ssh-add %s" % file_name) - - ALL_REGION_AWS_KEY_NAMES[region] = key_name - print(ALL_REGION_AWS_KEY_NAMES) - print(ALL_REGION_AWS_SECURITY_GROUP_IDS) - - subnets_info = get_subnets_info(REGIONS) # this could be done at the same time than the above, keep it here for now - for key, value in subnets_info.items(): - ALL_SUBNET_INFO[key] = value - - -def get_subnets_info(regions): - clients = [] - for region in regions: - client = boto3.client( - "ec2", - region_name=region, - aws_access_key_id=ACCESS_KEY, - aws_secret_access_key=ACCESS_SECRET, - ) - client.region = region - clients.append(client) - subnet_info = OrderedDict() - for client in clients: - # first find the group - security_group = client.describe_security_groups()['SecurityGroups'][0]['GroupId'] - subnets = client.describe_subnets()['Subnets'] - for subnet in subnets: - subnet_info[subnet['AvailabilityZone']] = dict(SubnetID=subnet['SubnetId'], Groups=security_group) - return subnet_info - - -def dict_to_ini(data): - s = '' - for key in data: - s += '%s=%s\n' % (key, data[key]) - return s - -def write_config(): - print("Writing config file...") - content = INI_FILE_TEMPLATE.substitute( - #all_region_aws_key_names=json.dumps(ALL_REGION_AWS_KEY_NAMES, indent=4), - #all_subnet_info=json.dumps(ALL_SUBNET_INFO, indent=4), # CF - #all_region_aws_security_group_ids=json.dumps(ALL_REGION_AWS_SECURITY_GROUP_IDS, indent=4), - all_region_aws_key_names=dict_to_ini(ALL_REGION_AWS_KEY_NAMES), - all_subnet_info=dict_to_ini(ALL_SUBNET_INFO), # CF - all_region_aws_security_group_ids=dict_to_ini(ALL_REGION_AWS_SECURITY_GROUP_IDS), - s3_bucket_name=S3_BUCKET_NAME, - security_group_name=SECURITY_GROUP_NAME, - instance_profile_name=INSTANCE_PROFILE_NAME, - instance_role_name=INSTANCE_ROLE_NAME, - aws_access_key=ACCESS_KEY, - aws_access_secret=ACCESS_SECRET, - ) - - config_personal_file = os.path.join(CONFIG_DIR, "config.ini") - if os.path.exists(config_personal_file): - if not query_yes_no("%s exists. Override?" % os.path.basename(config_personal_file), "no"): - sys.exit() - with open(config_personal_file, "wb") as f: - f.write(content.encode("utf-8")) - - -def setup(): - print("Using prefix: %s" % PREFIX) - setup_s3() - setup_iam() - setup_ec2() - write_config() - - -def query_yes_no(question, default="yes", allow_skip=False): - """Ask a yes/no question via raw_input() and return their answer. - - "question" is a string that is presented to the user. - "default" is the presumed answer if the user just hits . - It must be "yes" (the default), "no" or None (meaning - an answer is required of the user). - - The "answer" return value is True for "yes" or False for "no". - """ - valid = {"yes": True, "y": True, "ye": True, - "no": False, "n": False} - if allow_skip: - valid["skip"] = "skip" - if default is None: - prompt = " [y/n] " - elif default == "yes": - prompt = " [Y/n] " - elif default == "no": - prompt = " [y/N] " - else: - raise ValueError("invalid default answer: '%s'" % default) - if allow_skip: - prompt += " or skip" - while True: - sys.stdout.write(question + prompt) - choice = input().lower() - if default is not None and choice == '': - return valid[default] - elif choice in valid: - return valid[choice] - else: - sys.stdout.write("Please respond with 'yes' or 'no' " - "(or 'y' or 'n').\n") - - -if __name__ == "__main__": - setup() - # setup_ec2() diff --git a/setup.py b/setup.py deleted file mode 100644 index e69de29..0000000 From a711b3de15eb4e1aae53f45fcaa48c4d3493a27b Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 04:06:15 -0800 Subject: [PATCH 30/34] removed vcs xml --- .idea/vcs.xml | 6 ----- doodad/arg_parse.py | 66 --------------------------------------------- 2 files changed, 72 deletions(-) delete mode 100644 .idea/vcs.xml delete mode 100644 doodad/arg_parse.py diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/doodad/arg_parse.py b/doodad/arg_parse.py deleted file mode 100644 index e0a4c24..0000000 --- a/doodad/arg_parse.py +++ /dev/null @@ -1,66 +0,0 @@ -import pickle -import base64 -import argparse -import os - -ARGS_DATA = 'DOODAD_ARGS_DATA' -USE_CLOUDPICKLE = 'DOODAD_USE_CLOUDPICKLE' -CLOUDPICKLE_VERSION = 'DOODAD_CLOUDPICKLE_VERSION' - - -__ARGS = None -def __get_unpickled_arg_config(): - """ - global __ARGS - if __ARGS is not None: - return __ARGS - #TODO: use environment variables rather than command-line arguments - parser = argparse.ArgumentParser() - parser.add_argument('--use_cloudpickle', type=bool, default=False) - parser.add_argument('--'+ARGS_DATA, type=str, default='') - parser.add_argument('--output_dir', type=str, default='/tmp/expt/') - args = parser.parse_args() - __ARGS = args - """ - args_data = os.environ.get(ARGS_DATA, {}) - cloudpickle_version = os.environ.get(CLOUDPICKLE_VERSION, 'n/a') - use_cloudpickle = bool(int(os.environ.get(USE_CLOUDPICKLE, '0'))) - - args = lambda : None # hack - use function as namespace - args.args_data = args_data - args.use_cloudpickle = use_cloudpickle - args.cloudpickle_version = cloudpickle_version - return args - - -def get_args(key=None, default=None): - args = __get_unpickled_arg_config() - - if args.args_data: - if args.use_cloudpickle: - import cloudpickle - assert args.cloudpickle_version == cloudpickle.__version__, "Cloudpickle versions do not match! (host) %s vs (remote) %s" % (args.cloudpickle_version, cloudpickle.__version__) - data = cloudpickle.loads(base64.b64decode(args.args_data)) - else: - data = pickle.loads(base64.b64decode(args.args_data)) - else: - data = {} - - if key is not None: - return data.get(key, default) - return data - -def encode_args(call_args, cloudpickle=False): - """ - Encode call_args dictionary as a base64 string - """ - assert isinstance(call_args, dict) - - if cloudpickle: - import cloudpickle - cpickle_version = cloudpickle.__version__ - data = base64.b64encode(cloudpickle.dumps(call_args)).decode("utf-8") - else: - data = base64.b64encode(pickle.dumps(call_args)).decode("utf-8") - cpickle_version = 'n/a' - return data, cpickle_version From 5f9e9bd0ffb4d3f392dee1497c3997dfbe2f8bff Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 04:06:45 -0800 Subject: [PATCH 31/34] get arg config rename --- doodad/arg_parse.py | 66 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 doodad/arg_parse.py diff --git a/doodad/arg_parse.py b/doodad/arg_parse.py new file mode 100644 index 0000000..e0a4c24 --- /dev/null +++ b/doodad/arg_parse.py @@ -0,0 +1,66 @@ +import pickle +import base64 +import argparse +import os + +ARGS_DATA = 'DOODAD_ARGS_DATA' +USE_CLOUDPICKLE = 'DOODAD_USE_CLOUDPICKLE' +CLOUDPICKLE_VERSION = 'DOODAD_CLOUDPICKLE_VERSION' + + +__ARGS = None +def __get_unpickled_arg_config(): + """ + global __ARGS + if __ARGS is not None: + return __ARGS + #TODO: use environment variables rather than command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument('--use_cloudpickle', type=bool, default=False) + parser.add_argument('--'+ARGS_DATA, type=str, default='') + parser.add_argument('--output_dir', type=str, default='/tmp/expt/') + args = parser.parse_args() + __ARGS = args + """ + args_data = os.environ.get(ARGS_DATA, {}) + cloudpickle_version = os.environ.get(CLOUDPICKLE_VERSION, 'n/a') + use_cloudpickle = bool(int(os.environ.get(USE_CLOUDPICKLE, '0'))) + + args = lambda : None # hack - use function as namespace + args.args_data = args_data + args.use_cloudpickle = use_cloudpickle + args.cloudpickle_version = cloudpickle_version + return args + + +def get_args(key=None, default=None): + args = __get_unpickled_arg_config() + + if args.args_data: + if args.use_cloudpickle: + import cloudpickle + assert args.cloudpickle_version == cloudpickle.__version__, "Cloudpickle versions do not match! (host) %s vs (remote) %s" % (args.cloudpickle_version, cloudpickle.__version__) + data = cloudpickle.loads(base64.b64decode(args.args_data)) + else: + data = pickle.loads(base64.b64decode(args.args_data)) + else: + data = {} + + if key is not None: + return data.get(key, default) + return data + +def encode_args(call_args, cloudpickle=False): + """ + Encode call_args dictionary as a base64 string + """ + assert isinstance(call_args, dict) + + if cloudpickle: + import cloudpickle + cpickle_version = cloudpickle.__version__ + data = base64.b64encode(cloudpickle.dumps(call_args)).decode("utf-8") + else: + data = base64.b64encode(pickle.dumps(call_args)).decode("utf-8") + cpickle_version = 'n/a' + return data, cpickle_version From 2d686d9f9db93a297e6a666d20fb43dba9113786 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 04:08:53 -0800 Subject: [PATCH 32/34] reup --- .DS_Store | Bin 0 -> 6148 bytes .gitignore | 104 +++ LICENSE | 21 + README.md | 44 ++ doodad/__init__.py | 2 + doodad/easy_sweep/__init__.py | 1 + doodad/easy_sweep/hyper_sweep.py | 106 +++ doodad/easy_sweep/launcher.py | 99 +++ doodad/ec2/__init__.py | 1 + doodad/ec2/autoconfig.py | 43 + doodad/ec2/aws_util.py | 33 + doodad/ec2/credentials.py | 46 ++ doodad/gcp/gcp_util.py | 55 ++ doodad/launch_tools.py | 100 +++ doodad/mode.py | 961 +++++++++++++++++++++++ doodad/mount.py | 130 +++ doodad/relaunch.py | 53 ++ doodad/ssh/__init__.py | 1 + doodad/ssh/credentials.py | 66 ++ doodad/utils.py | 89 +++ examples/.gitignore | 1 + examples/docker_checkpoint/app_main.py | 21 + examples/docker_checkpoint/launch.py | 41 + examples/docker_newton_example.py | 38 + examples/ec2_launch/app_main.py | 19 + examples/ec2_launch/ec2_launch_test.py | 59 ++ examples/secretlib/secretlib/__init__.py | 2 + hyper_viz/base.py | 94 +++ hyper_viz/hyper_viz.py | 12 + hyper_viz/rllab_interface.py | 62 ++ requirements.txt | 4 + scripts/gcp/gcp_shutdown_script.sh | 20 + scripts/gcp/gcp_startup_script.sh | 95 +++ scripts/pull_s3_logs.py | 22 + scripts/run_experiment_lite_doodad.py | 9 + scripts/setup_ec2.py | 395 ++++++++++ setup.py | 7 + 37 files changed, 2856 insertions(+) create mode 100644 .DS_Store create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 doodad/__init__.py create mode 100644 doodad/easy_sweep/__init__.py create mode 100644 doodad/easy_sweep/hyper_sweep.py create mode 100644 doodad/easy_sweep/launcher.py create mode 100644 doodad/ec2/__init__.py create mode 100644 doodad/ec2/autoconfig.py create mode 100644 doodad/ec2/aws_util.py create mode 100644 doodad/ec2/credentials.py create mode 100644 doodad/gcp/gcp_util.py create mode 100644 doodad/launch_tools.py create mode 100644 doodad/mode.py create mode 100644 doodad/mount.py create mode 100644 doodad/relaunch.py create mode 100644 doodad/ssh/__init__.py create mode 100644 doodad/ssh/credentials.py create mode 100644 doodad/utils.py create mode 100644 examples/.gitignore create mode 100644 examples/docker_checkpoint/app_main.py create mode 100644 examples/docker_checkpoint/launch.py create mode 100644 examples/docker_newton_example.py create mode 100644 examples/ec2_launch/app_main.py create mode 100644 examples/ec2_launch/ec2_launch_test.py create mode 100644 examples/secretlib/secretlib/__init__.py create mode 100644 hyper_viz/base.py create mode 100644 hyper_viz/hyper_viz.py create mode 100644 hyper_viz/rllab_interface.py create mode 100644 requirements.txt create mode 100644 scripts/gcp/gcp_shutdown_script.sh create mode 100644 scripts/gcp/gcp_startup_script.sh create mode 100755 scripts/pull_s3_logs.py create mode 100644 scripts/run_experiment_lite_doodad.py create mode 100644 scripts/setup_ec2.py create mode 100644 setup.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f496074e67a270c2653451917f7ecfa2287bd492 GIT binary patch literal 6148 zcmeHK%}OId5U!3<$Cbq#1ohSzZ#g7n*LV?z7!Tfr5k2SCfP1b-ob^QmZZd_@N2yZZqyq!2>`Cf=Wy#$7KI9zK8ax?vC}rP4o^o6FBH zELuw_KdnDbTcl3nCjGeP_D)nU@LQg-4}}QFw(A`Qt+Y{C-4#*d22raw5q^3cM7~IC zq96IO8t1@G+L)^*Y7{mGgYxf6rN}GW+ruIsRI8;T|5e!<4lQeKv%Ir^*15bITn}&V zHN_17jY?Jwj^F`}nX3)Y%T7~tx;pT-`c_;=)a!mfX#zYy-u+k6Gso))|6Ot2_QbqQ zeucAt--XkQLJ`TYYP2>rIW@!pF+dD_hyk;cSj!*ErX*s382HW%$o+w%O0*4T8r9Z; zjk*FL=FzPLZFP!4/dev/null 2>&1; do + # sleep 1 + # done + #""") + #sio.write("sudo apt-get install nvidia-modprobe\n") + #sio.write("wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb\n") + #sio.write("sudo dpkg -i /tmp/nvidia-docker*.deb && rm /tmp/nvidia-docker*.deb\n") + sio.write(""" + for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done + systemctl start nvidia-docker + """) + sio.write("echo 'Testing nvidia-smi'\n") + sio.write("nvidia-smi\n") + sio.write("echo 'Testing nvidia-smi inside docker'\n") + sio.write("nvidia-docker run --rm {docker_image} nvidia-smi\n".format(docker_image=self.docker_image)) + + if self.checkpoint and self.checkpoint.restore: + raise NotImplementedError() + else: + docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path, use_docker_generated_name=True) + assert self.num_exps > 0 + for _ in range(self.num_exps - 1): + sio.write(docker_cmd+' &\n') + sio.write(docker_cmd+'\n') + + # Sync all output mounts to s3 after running the user script + # Ideally the earlier while loop would be sufficient, but it might be + # the case that the earlier while loop isn't fast enough to catch a + # termination. So, we explicitly sync on termination. + for (local_output_dir, s3_dir_path) in local_output_dir_and_s3_path: + sio.write("aws s3 cp --recursive {local_dir} {s3_dir}\n".format( + local_dir=local_output_dir, + s3_dir=s3_dir_path + )) + + sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format( + stdout_log_s3_path, + )) + + # Wait for last sync + if max_sync_interval > 0: + sio.write("sleep {}\n".format(max_sync_interval + 5)) + + if self.terminate: + sio.write(""" + EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`" + aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region} + """.format(aws_region=self.region)) + sio.write("} >> /home/ubuntu/user_data.log 2>&1\n") + + full_script = dedent(sio.getvalue()) + import boto3 + import botocore + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.credentials.aws_key, + aws_secret_access_key=self.credentials.aws_secret_key, + ) + + if len(full_script) > 10000 or len(base64.b64encode(full_script.encode()).decode("utf-8")) > 10000: + s3_path = self.upload_file_to_s3(full_script, dry=dry) + sio = StringIO() + sio.write("#!/bin/bash\n") + sio.write(""" + aws s3 cp {s3_path} /home/ubuntu/remote_script.sh --region {aws_region} && \\ + chmod +x /home/ubuntu/remote_script.sh && \\ + bash /home/ubuntu/remote_script.sh + """.format(s3_path=s3_path, aws_region=self.s3_bucket_region)) + user_data = dedent(sio.getvalue()) + else: + user_data = full_script + + if verbose: + print(full_script) + with open("/tmp/full_ec2_script", "w") as f: + f.write(full_script) + + instance_args = dict( + ImageId=aws_config["image_id"], + KeyName=aws_config["key_name"], + UserData=user_data, + InstanceType=aws_config["instance_type"], + EbsOptimized=False, + SecurityGroups=aws_config["security_groups"], + SecurityGroupIds=aws_config["security_group_ids"], + NetworkInterfaces=aws_config["network_interfaces"], + IamInstanceProfile=dict( + Name=aws_config["iam_instance_profile_name"], + ), + #**config.AWS_EXTRA_CONFIGS, + ) + if self.extra_ec2_instance_kwargs is not None: + instance_args.update(self.extra_ec2_instance_kwargs) + + if verbose: + print("************************************************************") + print('UserData:', instance_args["UserData"]) + print("************************************************************") + instance_args["UserData"] = base64.b64encode(instance_args["UserData"].encode()).decode("utf-8") + spot_args = dict( + DryRun=dry, + InstanceCount=1, + LaunchSpecification=instance_args, + SpotPrice=aws_config["spot_price"], + # ClientToken=params_list[0]["exp_name"], + ) + + import pprint + + if verbose: + pprint.pprint(spot_args) + if not dry: + response = ec2.request_spot_instances(**spot_args) + print('Launched EC2 job - Server response:') + pprint.pprint(response) + print('*****'*5) + spot_request_id = response['SpotInstanceRequests'][ + 0]['SpotInstanceRequestId'] + for _ in range(10): + try: + ec2.create_tags( + Resources=[spot_request_id], + Tags=[ + {'Key': 'Name', 'Value': exp_name} + ], + ) + break + except botocore.exceptions.ClientError: + continue + + +class EC2AutoconfigDocker(EC2SpotDocker): + def __init__(self, + region='us-west-1', + s3_bucket=None, + image_id=None, + aws_key_name=None, + iam_profile=None, + **kwargs + ): + # find config file + from doodad.ec2.autoconfig import AUTOCONFIG + from doodad.ec2.credentials import AWSCredentials + s3_bucket = AUTOCONFIG.s3_bucket() if s3_bucket is None else s3_bucket + image_id = AUTOCONFIG.aws_image_id(region) if image_id is None else image_id + aws_key_name= AUTOCONFIG.aws_key_name(region) if aws_key_name is None else aws_key_name + iam_profile= AUTOCONFIG.iam_profile_name() if iam_profile is None else iam_profile + credentials=AWSCredentials(aws_key=AUTOCONFIG.aws_access_key(), aws_secret=AUTOCONFIG.aws_access_secret()) + security_group_ids = AUTOCONFIG.aws_security_group_ids()[region] + security_groups = AUTOCONFIG.aws_security_groups() + + super(EC2AutoconfigDocker, self).__init__( + s3_bucket=s3_bucket, + image_id=image_id, + aws_key_name=aws_key_name, + iam_instance_profile_name=iam_profile, + credentials=credentials, + region=region, + security_groups=security_groups, + security_group_ids=security_group_ids, + **kwargs + ) + + +class GCPDocker(DockerMode): + def __init__( + self, + zone="us-east4-a", + gcp_bucket_name=None, + instance_type='n1-standard-4', + image_name=None, + image_project=None, + disk_size:"Gb"=64, + terminate=True, + preemptible=True, + gcp_log_prefix='experiment', + gcp_log_name=None, + gcp_log_path=None, + gpu_kwargs=None, + **kwargs + ): + super(GCPDocker, self).__init__(**kwargs) + assert 'CLOUDSDK_CORE_PROJECT' in os.environ.keys() + self.project = os.environ['CLOUDSDK_CORE_PROJECT'] + self.zone = zone + self.gcp_bucket_name = gcp_bucket_name + self.instance_type = instance_type + self.terminate = terminate + self.disk_size = disk_size + self.image_project = image_project + self.image_name = image_name + self.preemptible = preemptible + + self.gcp_log_prefix = gcp_log_prefix + self.gcp_log_name = gcp_log_name + self.gcp_log_path = gcp_log_path or 'doodad/logs' + if self.gpu: + self.num_gpu = gpu_kwargs['num_gpu'] + self.gpu_model = gpu_kwargs['gpu_model'] + self.gpu_type = get_gpu_type(self.project, self.zone, self.gpu_model) + + import googleapiclient.discovery + self.compute = googleapiclient.discovery.build('compute', 'v1') + + def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False): + if self.gcp_log_name is None: + exp_name = "{}-{}".format(self.gcp_log_prefix, EC2SpotDocker.make_timekey(self)) + else: + exp_name = self.gcp_log_name + exp_prefix = self.gcp_log_prefix + gcp_base_dir = os.path.join(self.gcp_log_path, exp_prefix.replace("_", "-"), exp_name) + + mnt_args = '' + py_path = [] + gcp_mount_info = [] + max_sync_interval = 0 + local_mounts = [] + for mount in mount_points: + print('Handling mount: ', mount) + if isinstance(mount, MountLocal): # TODO: these should be mount_s3 objects + if mount.read_only: + if mount.path_on_remote is None: + with mount.gzip() as gzip_file: + gzip_path = os.path.realpath(gzip_file) + file_hash = hash_file(gzip_path) + gcp_path = upload_file_to_gcp_storage( + bucket_name=self.gcp_bucket_name, + file_name=gzip_path, + remote_filename=file_hash+'.tar' + ) + mount.path_on_remote = gcp_path + mount.local_file_hash = file_hash + else: + file_hash = mount.local_file_hash + gcp_path = mount.path_on_remote + remote_unpack_name = '/tmp/'+file_hash + mount_point = os.path.join('/mounts', mount.mount_point.replace('~/','')) + mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name, os.path.basename(mount.local_dir)), mount_point) + if mount.pythonpath: + py_path.append(mount_point) + local_mounts.append(file_hash) + else: + raise ValueError() + elif isinstance(mount, MountGCP): + gcp_local_dir = mount.mount_point + gcp_path = os.path.join(gcp_base_dir, mount.gcp_path) + if not mount.output: + raise NotImplementedError() + gcp_mount_info.append( + (gcp_local_dir, gcp_path, mount.include_string, mount.sync_interval) + ) + mnt_args += ' -v %s:%s' % (gcp_local_dir, mount.mount_point) + else: + raise NotImplementedError() + + docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False, extra_args=mnt_args, pythonpath=py_path) + + metadata = { + 'bucket_name': self.gcp_bucket_name, + 'docker_cmd': docker_cmd, + 'docker_image': self.docker_image, + 'local_mounts': json.dumps(local_mounts), + 'gcp_mounts': json.dumps(gcp_mount_info), + 'use_gpu': json.dumps(self.gpu), + 'terminate': json.dumps(self.terminate), + 'startup-script': open(GCP_STARTUP_SCRIPT_PATH, "r").read(), + 'shutdown-script': open(GCP_SHUTDOWN_SCRIPT_PATH, "r").read(), + } + # instance name must match regex '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'"> + unique_name= "doodad" + str(uuid.uuid4()).replace("-", "") + self.create_instance(metadata, unique_name, exp_name, exp_prefix) + if verbose: + print(unique_name) + print(metadata) + + def create_instance(self, metadata, name, exp_name="", exp_prefix=""): + image_response = self.compute.images().get( + project=self.image_project, + image=self.image_name, + ).execute() + source_disk_image = image_response['selfLink'] + config = { + 'name': name, + 'machineType': get_machine_type(self.zone, self.instance_type), + 'disks': [{ + 'boot': True, + 'autoDelete': True, + 'initializeParams': { + 'sourceImage': source_disk_image, + 'diskSizeGb': self.disk_size, + } + }], + 'networkInterfaces': [{ + 'network': 'global/networks/default', + 'accessConfigs': [ + {'type': 'ONE_TO_ONE_NAT', 'name': 'External NAT'} + ] + }], + 'serviceAccounts': [{ + 'email': 'default', + 'scopes': ['https://www.googleapis.com/auth/cloud-platform'] + }], + 'metadata': { + 'items': [ + {'key': key, 'value': value} + for key, value in metadata.items() + ] + }, + 'scheduling': { + "onHostMaintenance": "terminate", + "automaticRestart": False, + "preemptible": self.preemptible, + }, + "labels": { + "exp_name": exp_name, + "exp_prefix": exp_prefix, + } + } + if self.gpu: + config["guestAccelerators"] = [{ + "acceleratorType": self.gpu_type, + "acceleratorCount": self.num_gpu, + }] + return self.compute.instances().insert( + project=self.project, + zone=self.zone, + body=config + ).execute() + +class CodalabDocker(DockerMode): + def __init__(self): + super(CodalabDocker, self).__init__() + raise NotImplementedError() + + +class SingularityMode(LaunchMode): + def __init__(self, image, gpu=False, pre_cmd=None, + post_cmd=None, skip_wait=False): + super(SingularityMode, self).__init__() + self.singularity_image = image + self.gpu = gpu + self.pre_cmd = pre_cmd + self.post_cmd = post_cmd + self.skip_wait = skip_wait + + def get_singularity_cmd( + self, + main_cmd, + extra_args='', + verbose=True, + pythonpath=None, + ): + cmd_list= CommandBuilder() + if self.pre_cmd: + cmd_list.extend(self.pre_cmd) + + if verbose: + if self.gpu: + cmd_list.append('echo \"Running in singularity (gpu)\"') + else: + cmd_list.append('echo \"Running in singularity\"') + if pythonpath: + cmd_list.append('export PYTHONPATH=$PYTHONPATH:%s' % (':'.join(pythonpath))) + + cmd_list.append(main_cmd) + if self.post_cmd: + cmd_list.extend(self.post_cmd) + + if self.gpu: + extra_args += ' --nv ' + singularity_prefix = 'singularity exec %s %s /bin/bash -c ' % ( + extra_args, + self.singularity_image, + ) + main_cmd = cmd_list.to_string() + full_cmd = singularity_prefix + ("\'%s\'" % main_cmd) + return full_cmd + + +class LocalSingularity(SingularityMode): + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): + py_path = [] + for mount in mount_points: + if isinstance(mount, MountLocal): + if mount.pythonpath: + py_path.append(mount.local_dir) + else: + raise NotImplementedError(type(mount)) + + full_cmd = self.get_singularity_cmd( + cmd, + pythonpath=py_path, + verbose=verbose, + ) + call_and_wait(full_cmd, verbose=verbose, dry=dry, + skip_wait=self.skip_wait) + + +class SlurmSingularity(LocalSingularity): + # TODO: set up an auto-config + def __init__( + self, image, account_name, partition, time_in_mins, + qos=None, + nodes=1, + n_tasks=1, + n_gpus=1, + **kwargs + ): + super(SlurmSingularity, self).__init__(image, **kwargs) + self.account_name = account_name + self.partition = partition + self.time_in_mins = time_in_mins + self.nodes = nodes + self.n_tasks = n_tasks + self.n_gpus = n_gpus + + def create_slurm_command(self, cmd, mount_points=None, verbose=False): + py_path = [] + for mount in mount_points: + if isinstance(mount, MountLocal): + if mount.pythonpath: + py_path.append(mount.local_dir) + else: + raise NotImplementedError(type(mount)) + + singularity_cmd = self.get_singularity_cmd( + cmd, + pythonpath=py_path, + verbose=verbose, + ) + if self.gpu: + full_cmd = ( + "sbatch -A {account_name} -p {partition} -t {time}" + " -N {nodes} -n {n_tasks} --cpus-per-task={cpus_per_task}" + " --gres=gpu:{n_gpus} {cmd}".format( + account_name=self.account_name, + partition=self.partition, + time=self.time_in_mins, + nodes=self.nodes, + n_tasks=self.n_tasks, + cpus_per_task=2*self.n_gpus, + n_gpus=self.n_gpus, + cmd=singularity_cmd, + ) + ) + else: + full_cmd = "sbatch -A {account_name} -p {partition} -t {time} {cmd}".format( + account_name=self.account_name, + partition=self.partition, + time=self.time_in_mins, + cmd=singularity_cmd, + ) + if verbose: + print(full_cmd) + + def launch_command(self, cmd, mount_points=None, dry=False, verbose=False): + full_cmd = self.create_slurm_command( + cmd, mount_points=mount_points, verbose=verbose, + ) + call_and_wait(full_cmd, dry=dry, skip_wait=self.skip_wait) + + +class ScriptSlurmSingularity(SlurmSingularity): + """ + Create or add to a script to run a bunch of slurm jobs. + """ + TMP_FILE = '/tmp/script_to_scp_over.sh' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_first_time = False + + def set_first_time(self, is_first_time): + self.is_first_time = is_first_time + + def launch_command( + self, + cmd, + dry=False, + mount_points=None, + verbose=False, + ): + full_cmd = self.create_slurm_command( + cmd, mount_points=mount_points, verbose=verbose, + ) + if self.is_first_time: + with open(self.TMP_FILE, "w") as myfile: + myfile.write(full_cmd + '\n') + # make file executable + st = os.stat(self.TMP_FILE) + os.chmod(self.TMP_FILE, st.st_mode | stat.S_IEXEC) + print("Script generated! scp this script over:", self.TMP_FILE) + else: + with open(self.TMP_FILE, "a") as myfile: + myfile.write(full_cmd + '\n') + print("Script updated. scp this script over:", self.TMP_FILE) diff --git a/doodad/mount.py b/doodad/mount.py new file mode 100644 index 0000000..1b72591 --- /dev/null +++ b/doodad/mount.py @@ -0,0 +1,130 @@ +""" +These objects are pointers to code/data you wish to give access +to a launched job. + +Each object defines a source and a mount point (where the directory will be visible +to the launched process) + +""" +import os +import tarfile +import tempfile +from contextlib import contextmanager + + +class Mount(object): + """ + Args: + mount_point (str): Location of directory visible to the running process + pythonpath (bool): If True, adds this folder to the $PYTHON_PATH environment variable + output (bool): If False, this is a "code" directory. If True, this should be an empty + "output" directory (nothing will be copied to remote) + """ + def __init__(self, mount_point=None, pythonpath=False, output=False): + self.pythonpath = pythonpath + self.read_only = not output + self.set_mount(mount_point) + self.path_on_remote = None + self.local_file_hash = None + + def set_mount(self, mount_point): + if mount_point: + self.mount_point = mount_point + else: + self.mount_point = mount_point + + +class MountLocal(Mount): + def __init__(self, local_dir, mount_point=None, cleanup=True, + filter_ext=('.pyc', '.log', '.git', '.mp4'), + filter_dir=('data',), + **kwargs): + super(MountLocal, self).__init__(mount_point=mount_point, **kwargs) + self.local_dir = os.path.realpath(os.path.expanduser(local_dir)) + self.local_dir_raw = local_dir + self.cleanup = cleanup + self.filter_ext = filter_ext + self.filter_dir = filter_dir + if mount_point is None: + self.set_mount(local_dir) + self.no_remount = True + else: + self.no_remount = False + #print('local_dir %s, mount_point %s(%s)' % (self.local_dir, self.mount_point, mount_point)) + + def create_if_nonexistent(self): + os.makedirs(self.local_dir, exist_ok=True) + + @contextmanager + def gzip(self): + """ + Return filepath to a gzipped version of this directory for uploading + """ + assert self.read_only + def filter_func(tar_info): + filt = any([tar_info.name.endswith(ext) for ext in self.filter_ext]) or any([ tar_info.name.endswith('/'+ext) for ext in self.filter_dir]) + if filt: + return None + return tar_info + with tempfile.NamedTemporaryFile('wb+', suffix='.tar') as tf: + # make a tar.gzip archive of directory + with tarfile.open(fileobj=tf, mode="w") as tar: + #tar.add(self.local_dir, arcname=os.path.splitext(os.path.basename(tf.name))[0], filter=filter_func) + tar.add(self.local_dir, arcname=os.path.basename(self.local_dir), filter=filter_func) + tf.seek(0) + yield tf.name + + def __str__(self): + return 'MountLocal@%s'%self.local_dir + + def mount_dir(self): + return os.path.join('/mounts', self.mount_point.replace('~/','')) + + +class MountGitRepo(Mount): + def __init__(self, git_url, git_credentials=None, **kwargs): + super(MountGitRepo, self).__init__(read_only=True, **kwargs) + self.git_url = git_url + self.git_credentials = git_credentials + raise NotImplementedError() + + +class MountGCP(Mount): + def __init__(self, gcp_path, gcp_bucket_name, sync_interval=15, output=False, + include_types=('*.txt', '*.csv', '*.json', '*.gz', '*.tar', '*.log', '*.pkl'), **kwargs): + super(MountGCP, self).__init__(**kwargs) + self.gcp_bucket_name = gcp_bucket_name + self.gcp_path = gcp_path + self.output = output + self.sync_interval = sync_interval + self.sync_on_terminate = True + self.include_types = include_types + + def __str__(self): + return 'MountGCP@gcp://%s/%s'% (self.gcp_bucket_name, self.gcp_path) + + @property + def include_string(self): + return ' '.join(['--include \'%s\''%type_ for type_ in self.include_types]) + +class MountS3(Mount): + def __init__(self, s3_path, s3_bucket=None, sync_interval=15, output=False, + include_types=('*.txt', '*.csv', '*.json', '*.gz', '*.tar', '*.log', '*.pkl'), **kwargs): + super(MountS3, self).__init__(**kwargs) + if s3_bucket is None: + # load from config + from doodad.ec2.autoconfig import AUTOCONFIG + s3_bucket = AUTOCONFIG.s3_bucket() + self.s3_bucket = s3_bucket + self.s3_path = s3_path + self.output = output + self.sync_interval = sync_interval + self.sync_on_terminate = True + self.include_types = include_types + + def __str__(self): + return 'MountS3@s3://%s/%s'% (self.s3_bucket, self.s3_path) + + @property + def include_string(self): + return ' '.join(['--include \'%s\''%type_ for type_ in self.include_types]) diff --git a/doodad/relaunch.py b/doodad/relaunch.py new file mode 100644 index 0000000..63dd207 --- /dev/null +++ b/doodad/relaunch.py @@ -0,0 +1,53 @@ +""" +Support for relaunching a run from a checkpoint +if a run terminates prematurely. + +Notes to self for docker checkpoints: + +- Need to enable experimental mode on docker (run ENABLE_CRIU_CMD) +- The docker process cannot be in interactive mode or use tty +- Checkpointing will stop the docker image and save a bunch of files to disk. You need to restart it to keep going. + +""" +import uuid + +INSTALL_CRIU_CMD = 'apt-get install -y criu' +ENABLE_CRIU_CMD = 'echo "{\"experimental\": true}" >> /etc/docker/daemon.json; systemctl restart docker' + +def checkpoint_cmd(docker_name, chk_name, chk_dir='/docker_checkpoints'): + return 'docker checkpoint create --checkpoint-dir=%s %s %s' % (chk_dir, docker_name, chk_name) + +def checkpoint_restore_cmd(docker_name, checkpoint_name, chk_dir='/docker_checkpoints'): + return 'docker start --checkpoint-dir=%s --checkpoint=%s %s' % (chk_dir, checkpoint_name, docker_name) + + +class CheckpointManager(object): + def __init__(self, restore=False, checkpoint_dir='/docker_checkpoints'): + self.checkpoint_name = uuid.uuid4() + self.checkpoint_dir = checkpoint_dir + self.restore = restore + + def checkpoint_and_tar_cmd(self, docker_name, tar_name, restart=True): + cmds = [] + checkpoint_name = self.checkpoint_name + chk_cmd = checkpoint_cmd(docker_name, checkpoint_name, chk_dir=self.checkpoint_dir) + cmds.append(chk_cmd) + + chk_dir = os.path.join(self.checkpoint_dir, checkpoint_name) + tar_cmd = 'tar -cvf %s %s ' % (tar_name, chk_dir) + cmds.append(tar_cmd) + + if restart: + restart_cmd = checkpoint_restore_cmd(docker_name, checkpoint_name, chk_dir=self.checkpoint_dir) + cmds.append(restart_cmd) + return ';'.join(cmds) + + def checkpoint_tar_loop_cmd(self, docker_name, tar_name, wait_interval=1): + chk_tar_cmd = self.checkpoint_and_tar_cmd(docker_name, tar_name) + """ + while /bin/true; do + {chk_tar_cmd} + sleep {wait_interval} + done & echo sync initiated + """.format(wait_interval=wait_interval, chk_tar_cmd=chk_tar_cmd, + diff --git a/doodad/ssh/__init__.py b/doodad/ssh/__init__.py new file mode 100644 index 0000000..95ed204 --- /dev/null +++ b/doodad/ssh/__init__.py @@ -0,0 +1 @@ +from .credentials import * diff --git a/doodad/ssh/credentials.py b/doodad/ssh/credentials.py new file mode 100644 index 0000000..73811e7 --- /dev/null +++ b/doodad/ssh/credentials.py @@ -0,0 +1,66 @@ +import os + +class SSHCredentials(object): + """ + Container for SSH credentials + + Args: + hostname (str): + username (str): + password (str, optional): + Authenticate via plain-text password. This features requires the 'sshpass' program to be installed. + This usage is not suggested due to security reasons. + identity_file (str, optional): + Path to a private key file for SSL public key authentication + """ + def __init__(self, hostname=None, username=None, password=None, identity_file=None): + assert password is not None or identity_file is not None, "One of password or identity_file must be provided" + self.hostname = hostname + self.username = username + self.password = password + self.identity_file = os.path.expanduser(identity_file) + + def get_ssh_cmd_prefix(self): + """ + Return a command prefix + Ex. + 'ssh user@host -i id_file ' + """ + cmd = 'ssh %s@%s' % (self.username, self.hostname) + if self.identity_file: + cmd += ' -i %s' % self.identity_file + elif self.password: + cmd = 'sshpass -p \'%s\' %s' % (self.password, cmd) + print('WARNING: Using password-based ssh is not secure! Please consider using identity files.') + else: + raise NotImplementedError() + return cmd + ' ' + + def get_ssh_bash_cmd(self, cmd): + prefix = self.get_ssh_cmd_prefix() + return prefix + " '%s'"%cmd + + def get_ssh_script_cmd(self, script_name): + cmd = 'ssh %s@%s' % (self.username, self.hostname) + if self.identity_file: + cmd += ' -i %s' % self.identity_file + else: + raise NotImplementedError() + cmd += " 'bash -s' < %s" % script_name + return cmd + + def get_scp_cmd(self, source, destination, recursive=True): + cmd = 'scp' + if recursive: + cmd += ' -r' + if self.identity_file: + cmd += ' -i %s' % self.identity_file + else: + raise NotImplementedError() + cmd += ' %s' % source + cmd += ' %s@%s:%s' % (self.username, self.hostname, destination) + return cmd + + @property + def user_host(self): + return '%s@%s' % (self.username, self.hostname) diff --git a/doodad/utils.py b/doodad/utils.py new file mode 100644 index 0000000..1989239 --- /dev/null +++ b/doodad/utils.py @@ -0,0 +1,89 @@ +import hashlib +import os +import subprocess +import contextlib +import tempfile + +THIS_FILE_DIR = os.path.dirname(os.path.realpath(__file__)) +REPO_DIR = os.path.dirname(THIS_FILE_DIR) +EXAMPLES_DIR = os.path.join(REPO_DIR, 'examples') + +HASH_BUF_SIZE = 65536 + +def hash_file(filename): + hasher = hashlib.md5() + with open(filename, 'rb') as f: + while True: + data = f.read(HASH_BUF_SIZE) + if not data: + break + hasher.update(data) + return hasher.hexdigest() + + +def call_and_wait(cmd, verbose=False, dry=False, skip_wait=False): + if dry or verbose: + print(cmd) + if not dry: + p = subprocess.Popen(cmd, shell=True) + if skip_wait: + return + try: + p.wait() + except KeyboardInterrupt: + try: + print("terminating") + p.terminate() + except OSError: + print("os error!") + pass + p.wait() + + +class CommandBuilder(object): + def __init__(self): + self.cmds = [] + + def add_command(self, cmd): + self.cmds.append(cmd) + + def append(self, cmd): + self.add_command(cmd) + + def extend(self, other): + if isinstance(other, CommandBuilder): + self.cmds.extend(other.cmds) + else: + self.cmds.extend(other) + + def to_string(self, separator=';'): + return ';'.join([str(cmd) for cmd in self.cmds]) + + def __str__(self): + return self.to_string() + + def __iter__(self): + for cmd in self.cmds: + yield cmd + + def call_and_wait(self, verbose=False, dry=False, skip_wait=False): + return call_and_wait( + self.to_string(), + verbose=verbose, + dry=dry, + skip_wait=skip_wait, + ) + + @contextlib.contextmanager + def as_script(self, suffix='.sh'): + """ + Usage: + with cmd_builder.as_script() as fname: + # do stuff with fname + """ + with tempfile.NamedTemporaryFile(suffix=suffix, mode='w+') as f: + for cmd in self.cmds: + f.write(cmd+'\n') + f.seek(0) + yield f.name + diff --git a/examples/.gitignore b/examples/.gitignore new file mode 100644 index 0000000..442d515 --- /dev/null +++ b/examples/.gitignore @@ -0,0 +1 @@ +tmp_output/ diff --git a/examples/docker_checkpoint/app_main.py b/examples/docker_checkpoint/app_main.py new file mode 100644 index 0000000..24651a0 --- /dev/null +++ b/examples/docker_checkpoint/app_main.py @@ -0,0 +1,21 @@ +import os +import time +import subprocess + +import doodad as dd + +print('Launching app_main!') + +# These are arguments passed in from launch_python +args_dict = dd.get_args() +print('My args are:', args_dict) + +k = 0 +while True: + k += 1 + subprocess.call('echo %d' % k, shell=True) + time.sleep(1.0) + +# Test proper mounting +print('Done!') + diff --git a/examples/docker_checkpoint/launch.py b/examples/docker_checkpoint/launch.py new file mode 100644 index 0000000..8410734 --- /dev/null +++ b/examples/docker_checkpoint/launch.py @@ -0,0 +1,41 @@ +import os + +import doodad as dd +import doodad.ec2 as ec2 +import doodad.ssh as ssh +import doodad.mount as mount +from doodad.utils import EXAMPLES_DIR, REPO_DIR + + +# Local run +mode_local = dd.mode.Local() + +# Local docker +mode_docker = dd.mode.LocalDocker( + image='python:3.5', +) + +# or this! Run experiment via docker on another machine through SSH +mode_ssh = dd.mode.SSHDocker( + image='python:3.5', + credentials=ssh.SSHCredentials(hostname='my.machine.name', username='my_username', identity_file='~/.ssh/id_rsa'), +) + +MY_RUN_MODE = mode_docker # CHANGE THIS + +# Set up code and output directories +mounts = [ + mount.MountLocal(local_dir=REPO_DIR, pythonpath=True), # Code +] + + +THIS_FILE_DIR = os.path.realpath(os.path.dirname(__file__)) +dd.launch_python( + target=os.path.join(THIS_FILE_DIR, 'app_main.py'), # point to a target script. If running remotely, this will be copied over + mode=MY_RUN_MODE, + mount_points=mounts, + args={ + 'arg1': 50, + } +) + diff --git a/examples/docker_newton_example.py b/examples/docker_newton_example.py new file mode 100644 index 0000000..20f6bba --- /dev/null +++ b/examples/docker_newton_example.py @@ -0,0 +1,38 @@ +""" +Example script for using newton machines via docker + rllab +""" + +import doodad as dd +import doodad.ssh as ssh +import doodad.mount as mount + +MY_USERNAME = 'justin' + +# Use local mode to test code +mode_local = dd.mode.LocalDocker( + image='justinfu/rl_base:0.1' +) + +# Use docker mode to launch jobs on newton machine +mode_ssh = dd.mode.SSHDocker( + image='justinfu/rl_base:0.1', + credentials=ssh.SSHCredentials(hostname='newton2.banatao.berkeley.edu', + username='rail', identity_file='path/to/identity'), +) + +# Set up code and output directories +OUTPUT_DIR = '/mount/outputs' # this is the directory visible to the target script inside docker +mounts = [ + mount.MountLocal(local_dir='~/install/rllab', pythonpath=True), # point to your rllab + mount.MountLocal(local_dir='~/install/gym/.mujoco', mount_point='/root/.mujoco'), # point to your mujoco + + # this output directory will be visible on the remote machine + # TODO: this directory will have root permissions. For now you need to scp your data inside your script. + mount.MountLocal(local_dir='~/data/%s' % MY_USERNAME, mount_point=OUTPUT_DIR, output=True), +] + +pd.launch_python( + target='path/to/script.py', # point to a target script (absolute path). + mode=mode_ssh, + mount_points=mounts, +) diff --git a/examples/ec2_launch/app_main.py b/examples/ec2_launch/app_main.py new file mode 100644 index 0000000..59a3bf1 --- /dev/null +++ b/examples/ec2_launch/app_main.py @@ -0,0 +1,19 @@ +import os + +import doodad as dd + +import secretlib + +print('Launching app_main!') + +# These are arguments passed in from launch_python +args_dict = dd.get_args() +print('My args are:', args_dict) + +# Test proper mounting +out_dir = args_dict['output_dir'] +print('Writing secret (%s) to output dir (%s)' % (secretlib.SECRET, os.path.realpath(out_dir))) +with open( os.path.join(out_dir, 'my_secret.txt'), 'w') as f: + f.write(secretlib.SECRET) +print('Done!') + diff --git a/examples/ec2_launch/ec2_launch_test.py b/examples/ec2_launch/ec2_launch_test.py new file mode 100644 index 0000000..27d669e --- /dev/null +++ b/examples/ec2_launch/ec2_launch_test.py @@ -0,0 +1,59 @@ +import os + +import doodad as dd +import doodad.ec2 as ec2 +import doodad.ssh as ssh +import doodad.mount as mount +from doodad.utils import EXAMPLES_DIR, REPO_DIR + + +# Local docker +mode_docker = dd.mode.LocalDocker( + image='python:3.5', +) + +# or this! Run experiment via docker on another machine through SSH +mode_ssh = dd.mode.SSHDocker( + image='python:3.5', + credentials=ssh.SSHCredentials(hostname='my.machine.name', username='my_username', identity_file='~/.ssh/id_rsa'), +) + +# or use this! +mode_ec2=None +#mode_ec2 = dd.mode.EC2AutoconfigDocker( +# image='python:3.5', +# region='us-west-1', +# instance_type='m3.medium', +# spot_price=0.02, +#) + +MY_RUN_MODE = mode_docker # CHANGE THIS + +# Set up code and output directories +OUTPUT_DIR = '/example/outputs' # this is the directory visible to the target +mounts = [ + mount.MountLocal(local_dir=REPO_DIR, pythonpath=True), # Code + mount.MountLocal(local_dir=os.path.join(EXAMPLES_DIR, 'secretlib'), pythonpath=True), # Code +] + +if MY_RUN_MODE == mode_ec2: + output_mount = mount.MountS3(s3_path='outputs', mount_point=OUTPUT_DIR, output=True) # use this for ec2 +else: + output_mount = mount.MountLocal(local_dir=os.path.join(EXAMPLES_DIR, 'tmp_output'), + mount_point=OUTPUT_DIR, output=True) +mounts.append(output_mount) + +print(mounts) + +THIS_FILE_DIR = os.path.realpath(os.path.dirname(__file__)) +dd.launch_python( + target=os.path.join(THIS_FILE_DIR, 'app_main.py'), # point to a target script. If running remotely, this will be copied over + mode=MY_RUN_MODE, + mount_points=mounts, + args={ + 'arg1': 50, + 'arg2': 25, + 'output_dir': OUTPUT_DIR, + } +) + diff --git a/examples/secretlib/secretlib/__init__.py b/examples/secretlib/secretlib/__init__.py new file mode 100644 index 0000000..aa4938d --- /dev/null +++ b/examples/secretlib/secretlib/__init__.py @@ -0,0 +1,2 @@ + +SECRET = 'randomlib_secret_message123' diff --git a/hyper_viz/base.py b/hyper_viz/base.py new file mode 100644 index 0000000..d81c0ab --- /dev/null +++ b/hyper_viz/base.py @@ -0,0 +1,94 @@ +import numpy as np +from collections import defaultdict +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D + +class Experiment(object): + def __init__(self, params_dict, values, performance=0.0): + self.params = params_dict + self.values = values + self.performance = performance + + def satisfies_param(param_k, param_v): + return self.params[param_k] == param_v + + +def find_unique_params(experiments): + params_dict = defaultdict(set) + for experiment in experiments: + exp_params = experiment.params + for k in exp_params: + v = exp_params[k] + if isinstance(v, set): + raise NotImplementedError() + params_dict[k].add(v) + for k in list(params_dict.keys()): + if len(params_dict[k]) == 1: + del params_dict[k] + return params_dict + + +def make_3d_plot(experiments, xkey, ykey, logx=True, logy=True): + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + z = [exp.performance for exp in experiments] + x = [exp.params[xkey] for exp in experiments] + y = [exp.params[ykey] for exp in experiments] + ax.set_xlabel(xkey) + ax.set_ylabel(ykey) + ax.set_zlabel('Performance') + + if logx: + x = np.log(np.array(x)+1e-5)/np.log(10) + ax.set_xlabel('log '+xkey) + if logy: + y = np.log(np.array(y)+1e-5)/np.log(10) + ax.set_ylabel('log '+ykey) + + ax.scatter(x, y, z) + plt.show() + + +def resize_ticks(data, maxlen): + ldata = len(data) + ticks_per_data = maxlen/ldata + resized = [] + j = 0 + for i in range(maxlen): + if (i%ticks_per_data == 0): + resized.append(data[j]) + j+=1 + else: + resized.append('') + return resized + +def make_2d_plot(experiments, xkey, ykey, title=None): + fig = plt.figure() + ax = fig.add_subplot(111) + z = [exp.performance for exp in experiments] + x = [exp.params[xkey] for exp in experiments] + y = [exp.params[ykey] for exp in experiments] + ax.set_xlabel(xkey) + ax.set_ylabel(ykey) + x_vals_uniq = sorted(list(set([exp.params[xkey] for exp in experiments]))) + y_vals_uniq = sorted(list(set([exp.params[ykey] for exp in experiments]))) + + xticks = ax.get_xticks().tolist() + ax.set_xticklabels(resize_ticks(x_vals_uniq, len(xticks))) + yticks = ax.get_yticks().tolist() + ax.set_yticklabels(resize_ticks(y_vals_uniq, len(yticks))) + + data = np.zeros((len(x_vals_uniq), len(y_vals_uniq))) + for i in range(len(x_vals_uniq)): + for j in range(len(y_vals_uniq)): + exps = [exp for exp in experiments if (exp.params[xkey] == x_vals_uniq[i]) and (exp.params[ykey] == y_vals_uniq[j])] + avg_perf = np.mean([exp.performance for exp in exps]) + data[i,j] = avg_perf + ax.imshow(data.T, cmap='YlOrRd') + for i in range(len(x_vals_uniq)): + for j in range(len(y_vals_uniq)): + ax.annotate('%.2f'%data[i,j], xy=(i-0.3, j)) + + if title is not None: + plt.title(title) + plt.show() diff --git a/hyper_viz/hyper_viz.py b/hyper_viz/hyper_viz.py new file mode 100644 index 0000000..e73c39a --- /dev/null +++ b/hyper_viz/hyper_viz.py @@ -0,0 +1,12 @@ +import argparse + +from rllab_interface import get_experiments +from base import * + +if __name__ == "__main__": + #exps = get_experiments('hopper_monotone', perf_key='Returns Average') + exps = get_experiments('walker_09_06_17', perf_key='Returns Average') + env_name = exps[0].params['env_params:gym_name'] + make_2d_plot(exps, xkey='algo_params:n_updates_per_time_step', ykey='algo_params:monotone_constraint_wt', + title=env_name) + diff --git a/hyper_viz/rllab_interface.py b/hyper_viz/rllab_interface.py new file mode 100644 index 0000000..6c29e32 --- /dev/null +++ b/hyper_viz/rllab_interface.py @@ -0,0 +1,62 @@ +import csv +import numpy as np +import os +import json +from collections import defaultdict + +from base import Experiment + +N_PERF = 5 + + +def get_experiments(dirname, perf_key='AverageReturn'): + # look recursively for directories containing a params.json file + print('get_experiments looking in %s' % dirname) + lsdir = list(os.listdir(dirname)) + if any(params_file in lsdir for params_file in ['variant.json', 'params.json']): + print('\t Found experiment directory') + params_file_name = 'params.json' + if 'variant.json' in lsdir: + params_file_name = 'variant.json' + return [parse_exp_dir(dirname, params_file_name, perf_key=perf_key)] + else: + exps = [] + for item in lsdir: + full_path = os.path.join(dirname, item) + if os.path.isdir(full_path): + exps.extend(get_experiments(full_path)) + return exps + + +def flatten_kv_dict(orig_dict, join_char=':'): + """ + >>> flatten_kv_dict({'a': {'b': 2, 'c': 3}, 'd': 4}) + """ + flat_dict = {} + for k in orig_dict: + v = orig_dict[k] + if isinstance(v, dict): + flattened_dict = flatten_kv_dict(v, join_char=join_char) + for k_ in flattened_dict: + flat_dict['%s%s%s' % (k, join_char, k_)] = flattened_dict[k_] + else: + flat_dict[k] = v + return flat_dict + + +def parse_exp_dir(dirname, params_file_name, perf_key='AverageReturn'): + with open(os.path.join(dirname, params_file_name)) as f: + params_dict = json.loads(f.read()) + params_dict = flatten_kv_dict(params_dict) + with open(os.path.join(dirname, 'progress.csv')) as f: + readCSVFile = csv.reader(f, delimiter=',') + for i, row in enumerate(readCSVFile): + if i==0: + headers = row + keyValueMap = [[] for _ in range(len(row))] + else: + for j, rowItem in enumerate(row): + keyValueMap[j].append(float(rowItem)) + keyValueMap = dict(zip(headers, keyValueMap)) + return Experiment(params_dict, keyValueMap, performance=np.mean(keyValueMap[perf_key][-N_PERF:])) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..87dd31a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +boto3 +boto +cloudpickle +awscli diff --git a/scripts/gcp/gcp_shutdown_script.sh b/scripts/gcp/gcp_shutdown_script.sh new file mode 100644 index 0000000..99cf3ff --- /dev/null +++ b/scripts/gcp/gcp_shutdown_script.sh @@ -0,0 +1,20 @@ +#!/bin/bash +query_metadata() { + attribute_name=$1 + curl http://metadata/computeMetadata/v1/instance/attributes/$attribute_name -H "Metadata-Flavor: Google" +} + +bucket_name=$(query_metadata bucket_name) +gcp_mounts=$(query_metadata gcp_mounts) +instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") + +num_gcp_mounts=$(jq length <<< $gcp_mounts) +for ((i=0;i<$num_gcp_mounts;i++)); do + gcp_mount_info=$(jq .[$i] <<< $gcp_mounts) + # assume gcp_mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple + local_path=$(jq .[0] <<< $gcp_mount_info | tr -d '"') + gcp_bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') + gsutil -m rsync -r $local_path gs://$bucket_name/$gcp_bucket_path +done + +gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log diff --git a/scripts/gcp/gcp_startup_script.sh b/scripts/gcp/gcp_startup_script.sh new file mode 100644 index 0000000..9338d87 --- /dev/null +++ b/scripts/gcp/gcp_startup_script.sh @@ -0,0 +1,95 @@ +#!/bin/bash +install_docker() { + sudo apt-get install -y --no-install-recommends \ + apt-transport-https \ + curl \ + software-properties-common + curl -fsSL 'https://sks-keyservers.net/pks/lookup?op=get&search=0xee6d536cf7dc86e2d7d56f59a178ac6c6238f52e' | sudo apt-key add - + sudo add-apt-repository \ + "deb https://packages.docker.com/1.12/apt/repo/ \ + ubuntu-$(lsb_release -cs) \ + main" + sudo apt-get update + sudo apt-get -y install docker-engine + sudo usermod -a -G docker ubuntu +} + +query_metadata() { + attribute_name=$1 + curl http://metadata/computeMetadata/v1/instance/attributes/$attribute_name -H "Metadata-Flavor: Google" +} + +{ + bucket_name=$(query_metadata bucket_name) + docker_cmd=$(query_metadata docker_cmd) + docker_image=$(query_metadata docker_image) + local_mounts=$(query_metadata local_mounts) + gcp_mounts=$(query_metadata gcp_mounts) + use_gpu=$(query_metadata use_gpu) + terminate=$(query_metadata terminate) + instance_name=$(curl http://metadata/computeMetadata/v1/instance/name -H "Metadata-Flavor: Google") + echo "bucket_name:" $bucket_name + echo "docker_cmd:" $docker_cmd + echo "docker_image:" $docker_image + echo "local_mounts:" $local_mounts + echo "gcp_mounts:" $gcp_mounts + echo "use_gpu:" $use_gpu + echo "terminate:" $terminate + echo "instance_name:" $instance_name + + sudo apt-get update + #install_docker + while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do + sleep 1 + done + sudo apt-get install -y jq git unzip + die() { status=$1; shift; echo "FATAL: $*"; exit $status; } + service docker start + docker --config /home/ubuntu/.docker pull $docker_image + + num_local_mounts=$(jq length <<< $local_mounts) + for ((i=0;i<$num_local_mounts;i++)); do + local_mount=$(jq .[$i] <<< $local_mounts | tr -d '"') + echo "Mounting " $local_mount + gsutil cp gs://$bucket_name/doodad/mount/$local_mount.tar /tmp/$local_mount.tar + mkdir -p /tmp/$local_mount + tar -xvf /tmp/$local_mount.tar -C /tmp/$local_mount + done + + num_gcp_mounts=$(jq length <<< $gcp_mounts) + for ((i=0;i<$num_gcp_mounts;i++)); do + gcp_mount_info=$(jq .[$i] <<< $gcp_mounts) + # assume _mount_info is a (local_path, bucket_path, include_string, periodic_sync_interval) tuple + local_path=$(jq .[0] <<< $gcp_mount_info | tr -d '"') + gcp_bucket_path=$(jq .[1] <<< $gcp_mount_info | tr -d '"') + include_string=$(jq .[2] <<< $gcp_mount_info | tr -d '"') + periodic_sync_interval=$(jq .[3] <<< $gcp_mount_info | tr -d '"') + while /bin/true; do + gsutil -m rsync -r $local_path gs://$bucket_name/$gcp_bucket_path + sleep $periodic_sync_interval + done & echo sync from $local_path to gs://$bucket_name/$gcp_bucket_path initiated + done + while /bin/true; do + gsutil cp /home/ubuntu/user_data.log gs://$bucket_name/$gcp_bucket_path/${instance_name}_stdout.log + sleep 300 + done & + + if [ "$use_gpu" = "true" ]; then + for i in {1..800}; do su -c "nvidia-modprobe -u -c=0" ubuntu && break || sleep 3; done + systemctl start nvidia-docker + echo 'Testing nvidia-smi' + nvidia-smi + echo 'Testing nvidia-smi inside docker' + nvidia-docker run --rm $docker_image nvidia-smi + fi + + echo $docker_cmd >> run_docker_command.sh + bash run_docker_command.sh + + if [ "$terminate" = "true" ]; then + echo "Finished experiment. Terminating" + zone=$(curl http://metadata/computeMetadata/v1/instance/zone -H "Metadata-Flavor: Google") + zone="${zone##*/}" + gcloud compute instances delete $instance_name --zone $zone --quiet + fi +} >> /home/ubuntu/user_data.log 2>&1 diff --git a/scripts/pull_s3_logs.py b/scripts/pull_s3_logs.py new file mode 100755 index 0000000..e4dd705 --- /dev/null +++ b/scripts/pull_s3_logs.py @@ -0,0 +1,22 @@ +import os +import subprocess +import argparse + +def aws_sync(bucket_name, s3_log_dir, target_dir, exclude='*.pkl'): + cmd = 'aws s3 sync s3://%s/doodad/logs/%s %s --exclude %s' % (bucket_name, s3_log_dir, target_dir, exclude) + subprocess.call(cmd, shell=True) + +def main(): + + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument('log_dir', type=str, help='S3 Log dir') + parser.add_argument('-b', '--bucket', type=str, default='doodad', help='S3 Bucket') + parser.add_argument('-e', '--exclude', type=str, default='*.pkl', help='Exclude') + + args = parser.parse_args() + s3_log_dir = args.log_dir + os.makedirs(s3_log_dir, exist_ok=True) + aws_sync(args.bucket, s3_log_dir, s3_log_dir, exclude=args.exclude) + +if __name__ == "__main__": + main() diff --git a/scripts/run_experiment_lite_doodad.py b/scripts/run_experiment_lite_doodad.py new file mode 100644 index 0000000..9f7d7f4 --- /dev/null +++ b/scripts/run_experiment_lite_doodad.py @@ -0,0 +1,9 @@ +import doodad +try: + import cloudpickle +except ImportError as e: + raise ImportError("cloudpickle must be installed inside the docker image") +def failure(): + raise ValueError("Must provide run_method via doodad args!") +fn = doodad.get_args('run_method', failure) +fn() diff --git a/scripts/setup_ec2.py b/scripts/setup_ec2.py new file mode 100644 index 0000000..45ed52e --- /dev/null +++ b/scripts/setup_ec2.py @@ -0,0 +1,395 @@ +""" +AWS Setup script + +Based on rllab's setup_ec2 +""" + +import boto3 +import re +import sys +import json +import botocore +import os + +from string import Template +from collections import OrderedDict +from boto.s3.connection import Location + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +REPO_DIR = os.path.dirname(SCRIPT_DIR) +CONFIG_DIR = os.path.join(REPO_DIR, 'aws_config') + +ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", None) +if ACCESS_KEY is None: + raise ValueError('Please set the $AWS_ACCESS_KEY environment variable') +ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", None) +if ACCESS_SECRET is None: + raise ValueError('Please set the $AWS_ACCESS_SECRET environment variable') +S3_BUCKET_NAME = os.environ.get("DOODAD_S3_BUCKET", None) +if S3_BUCKET_NAME is None: + raise ValueError('Please set the $DOODAD_S3_BUCKET environment variable') +PREFIX = os.environ.get("RLLAB_PREFIX", "") + +SECURITY_GROUP_NAME = PREFIX + "doodad-sg" +INSTANCE_PROFILE_NAME = PREFIX + "doodad" +INSTANCE_ROLE_NAME = PREFIX + "doodad" + +ALL_REGION_AWS_SECURITY_GROUP_IDS = {} +ALL_REGION_AWS_KEY_NAMES = {} + +ALL_SUBNET_INFO = {} + +REGIONS = [ + "ap-northeast-1", + "ap-northeast-2", + "ap-south-1", + "ap-southeast-1", + "ap-southeast-2", + "eu-central-1", + "eu-west-1", + "sa-east-1", + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", +] + +INI_FILE_TEMPLATE = Template(""" +[default] +iam_instance_profile_name=$instance_profile_name +aws_security_groups=$security_group_name +s3_bucket_name=$s3_bucket_name +aws_access_key=$aws_access_key +aws_access_secret=$aws_access_secret + +[aws_image_ids] +ap-northeast-1=ami-c42689a5 +ap-northeast-2=ami-865b8fe8 +ap-south-1=ami-ea9feb85 +ap-southeast-1=ami-c74aeaa4 +ap-southeast-2=ami-0792ae64 +eu-central-1=ami-f652a999 +eu-west-1=ami-8c0a5dff +sa-east-1=ami-3f2cb053 +us-east-1=ami-de5171c9 +us-east-2=ami-e0481285 +us-west-1=ami-efb5ff8f +us-west-2=ami-53903033 + +[aws_key_names] +$all_region_aws_key_names + +[aws_security_group_ids] +$all_region_aws_security_group_ids + +[subnet_info] +$all_subnet_info +""") + + +def setup_iam(): + iam_client = boto3.client( + "iam", + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=ACCESS_SECRET, + ) + iam = boto3.resource('iam', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=ACCESS_SECRET) + + # delete existing role if it exists + try: + existing_role = iam.Role(INSTANCE_ROLE_NAME) + existing_role.load() + # if role exists, delete and recreate + response = query_yes_no( + "There is an existing role named %s. Proceed to delete everything and recreate?" % + INSTANCE_ROLE_NAME, + default="no", allow_skip=True) + if response == "skip": + return + elif not response: + sys.exit() + else: + pass + print("Listing instance profiles...") + inst_profiles = existing_role.instance_profiles.all() + for prof in inst_profiles: + for role in prof.roles: + print("Removing role %s from instance profile %s" % (role.name, prof.name)) + prof.remove_role(RoleName=role.name) + print("Deleting instance profile %s" % prof.name) + prof.delete() + for policy in existing_role.policies.all(): + print("Deleting inline policy %s" % policy.name) + policy.delete() + for policy in existing_role.attached_policies.all(): + print("Detaching policy %s" % policy.arn) + existing_role.detach_policy(PolicyArn=policy.arn) + print("Deleting role") + existing_role.delete() + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'NoSuchEntity': + pass + else: + raise e + + print("Creating role %s " % INSTANCE_ROLE_NAME) + iam_client.create_role( + Path='/', + RoleName=INSTANCE_ROLE_NAME, + AssumeRolePolicyDocument=json.dumps({'Version': '2012-10-17', 'Statement': [ + {'Action': 'sts:AssumeRole', 'Effect': 'Allow', 'Principal': {'Service': 'ec2.amazonaws.com'}}]}) + ) + + role = iam.Role(INSTANCE_ROLE_NAME) + print("Attaching policies") + role.attach_policy(PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess') + role.attach_policy(PolicyArn='arn:aws:iam::aws:policy/ResourceGroupsandTagEditorFullAccess') + + print("Creating inline policies") + iam_client.put_role_policy( + RoleName=role.name, + PolicyName='CreateTags', + PolicyDocument=json.dumps({ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["ec2:CreateTags"], + "Resource": ["*"] + } + ] + }) + ) + iam_client.put_role_policy( + RoleName=role.name, + PolicyName='TerminateInstances', + PolicyDocument=json.dumps({ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Stmt1458019101000", + "Effect": "Allow", + "Action": [ + "ec2:TerminateInstances" + ], + "Resource": [ + "*" + ] + } + ] + }) + ) + + print("Creating instance profile %s" % INSTANCE_PROFILE_NAME) + iam_client.create_instance_profile( + InstanceProfileName=INSTANCE_PROFILE_NAME, + Path='/' + ) + print("Adding role %s to instance profile %s" % (INSTANCE_ROLE_NAME, INSTANCE_PROFILE_NAME)) + iam_client.add_role_to_instance_profile( + InstanceProfileName=INSTANCE_PROFILE_NAME, + RoleName=INSTANCE_ROLE_NAME + ) + + +def setup_s3(): + print("Creating S3 bucket at s3://%s" % S3_BUCKET_NAME) + s3_client = boto3.client( + "s3", + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=ACCESS_SECRET, + ) + try: + s3_client.create_bucket( + ACL='private', + Bucket=S3_BUCKET_NAME, + CreateBucketConfiguration={ + 'LocationConstraint': 'us-west-1'} + ) + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'BucketAlreadyExists': + raise ValueError("Bucket %s already exists. Please reconfigure S3_BUCKET_NAME" % S3_BUCKET_NAME) from e + elif e.response['Error']['Code'] == 'BucketAlreadyOwnedByYou': + print("Bucket already created by you") + else: + raise e + print("S3 bucket created") + + +def setup_ec2(): + for region in REGIONS: + print("Setting up region %s" % region) + + ec2 = boto3.resource( + "ec2", + region_name=region, + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=ACCESS_SECRET, + ) + ec2_client = boto3.client( + "ec2", + region_name=region, + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=ACCESS_SECRET, + ) + existing_vpcs = list(ec2.vpcs.all()) + assert len(existing_vpcs) >= 1 + vpc = existing_vpcs[0] + print("Creating security group in VPC %s" % str(vpc.id)) + try: + security_group = vpc.create_security_group( + GroupName=SECURITY_GROUP_NAME, Description='Security group for doodad' + ) + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'InvalidGroup.Duplicate': + sgs = list(vpc.security_groups.filter(GroupNames=[SECURITY_GROUP_NAME])) + security_group = sgs[0] + else: + raise e + + ALL_REGION_AWS_SECURITY_GROUP_IDS[region] = [security_group.id] + + ec2_client.create_tags(Resources=[security_group.id], Tags=[{'Key': 'Name', 'Value': SECURITY_GROUP_NAME}]) + try: + security_group.authorize_ingress(FromPort=22, ToPort=22, IpProtocol='tcp', CidrIp='0.0.0.0/0') + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'InvalidPermission.Duplicate': + pass + else: + raise e + print("Security group created with id %s" % str(security_group.id)) + + key_name = PREFIX + ('doodad-%s' % region) + try: + print("Trying to create key pair with name %s" % key_name) + key_pair = ec2_client.create_key_pair(KeyName=key_name) + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'InvalidKeyPair.Duplicate': + if not query_yes_no("Key pair with name %s exists. Proceed to delete and recreate?" % key_name, "no"): + sys.exit() + print("Deleting existing key pair with name %s" % key_name) + ec2_client.delete_key_pair(KeyName=key_name) + print("Recreating key pair with name %s" % key_name) + key_pair = ec2_client.create_key_pair(KeyName=key_name) + else: + raise e + + key_pair_folder_path = os.path.join(CONFIG_DIR, "private", "key_pairs") + file_name = os.path.join(key_pair_folder_path, "%s.pem" % key_name) + + print("Saving keypair file") + os.makedirs(key_pair_folder_path, exist_ok=True) + with os.fdopen(os.open(file_name, os.O_WRONLY | os.O_CREAT, 0o600), 'w') as handle: + handle.write(key_pair['KeyMaterial'] + '\n') + + # adding pem file to ssh + # os.system("ssh-add %s" % file_name) + + ALL_REGION_AWS_KEY_NAMES[region] = key_name + print(ALL_REGION_AWS_KEY_NAMES) + print(ALL_REGION_AWS_SECURITY_GROUP_IDS) + + subnets_info = get_subnets_info(REGIONS) # this could be done at the same time than the above, keep it here for now + for key, value in subnets_info.items(): + ALL_SUBNET_INFO[key] = value + + +def get_subnets_info(regions): + clients = [] + for region in regions: + client = boto3.client( + "ec2", + region_name=region, + aws_access_key_id=ACCESS_KEY, + aws_secret_access_key=ACCESS_SECRET, + ) + client.region = region + clients.append(client) + subnet_info = OrderedDict() + for client in clients: + # first find the group + security_group = client.describe_security_groups()['SecurityGroups'][0]['GroupId'] + subnets = client.describe_subnets()['Subnets'] + for subnet in subnets: + subnet_info[subnet['AvailabilityZone']] = dict(SubnetID=subnet['SubnetId'], Groups=security_group) + return subnet_info + + +def dict_to_ini(data): + s = '' + for key in data: + s += '%s=%s\n' % (key, data[key]) + return s + +def write_config(): + print("Writing config file...") + content = INI_FILE_TEMPLATE.substitute( + #all_region_aws_key_names=json.dumps(ALL_REGION_AWS_KEY_NAMES, indent=4), + #all_subnet_info=json.dumps(ALL_SUBNET_INFO, indent=4), # CF + #all_region_aws_security_group_ids=json.dumps(ALL_REGION_AWS_SECURITY_GROUP_IDS, indent=4), + all_region_aws_key_names=dict_to_ini(ALL_REGION_AWS_KEY_NAMES), + all_subnet_info=dict_to_ini(ALL_SUBNET_INFO), # CF + all_region_aws_security_group_ids=dict_to_ini(ALL_REGION_AWS_SECURITY_GROUP_IDS), + s3_bucket_name=S3_BUCKET_NAME, + security_group_name=SECURITY_GROUP_NAME, + instance_profile_name=INSTANCE_PROFILE_NAME, + instance_role_name=INSTANCE_ROLE_NAME, + aws_access_key=ACCESS_KEY, + aws_access_secret=ACCESS_SECRET, + ) + + config_personal_file = os.path.join(CONFIG_DIR, "config.ini") + if os.path.exists(config_personal_file): + if not query_yes_no("%s exists. Override?" % os.path.basename(config_personal_file), "no"): + sys.exit() + with open(config_personal_file, "wb") as f: + f.write(content.encode("utf-8")) + + +def setup(): + print("Using prefix: %s" % PREFIX) + setup_s3() + setup_iam() + setup_ec2() + write_config() + + +def query_yes_no(question, default="yes", allow_skip=False): + """Ask a yes/no question via raw_input() and return their answer. + + "question" is a string that is presented to the user. + "default" is the presumed answer if the user just hits . + It must be "yes" (the default), "no" or None (meaning + an answer is required of the user). + + The "answer" return value is True for "yes" or False for "no". + """ + valid = {"yes": True, "y": True, "ye": True, + "no": False, "n": False} + if allow_skip: + valid["skip"] = "skip" + if default is None: + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " + else: + raise ValueError("invalid default answer: '%s'" % default) + if allow_skip: + prompt += " or skip" + while True: + sys.stdout.write(question + prompt) + choice = input().lower() + if default is not None and choice == '': + return valid[default] + elif choice in valid: + return valid[choice] + else: + sys.stdout.write("Please respond with 'yes' or 'no' " + "(or 'y' or 'n').\n") + + +if __name__ == "__main__": + setup() + # setup_ec2() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6914311 --- /dev/null +++ b/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup + +setup(name='doodad', + version='0.0.1', + description="Doodad: easy setup of AWS EC2/S3", + install_requires=[], +) \ No newline at end of file From 99995fd55a528a849f0d7057937d3c4403a62780 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 04:10:36 -0800 Subject: [PATCH 33/34] gitingore --- .gitignore | 104 ----------------------------------------------------- 1 file changed, 104 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c3f3bbf..0000000 --- a/.gitignore +++ /dev/null @@ -1,104 +0,0 @@ -.idea/ -aws_config/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# dotenv -.env - -# virtualenv -.venv -venv/ -ENV/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ From 4e8f37542ac44490c78bfcede81ce0d77e61d42f Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 12 Jan 2019 04:13:31 -0800 Subject: [PATCH 34/34] gitignore --- .gitignogre | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 .gitignogre diff --git a/.gitignogre b/.gitignogre new file mode 100644 index 0000000..c3f3bbf --- /dev/null +++ b/.gitignogre @@ -0,0 +1,104 @@ +.idea/ +aws_config/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/