diff --git a/changes/9596.feature.md b/changes/9596.feature.md new file mode 100644 index 00000000000..e91867f06aa --- /dev/null +++ b/changes/9596.feature.md @@ -0,0 +1 @@ +Dockerize the agent component with DooD (Docker-out-of-Docker) support, including krunner path sharing, REPL port binding fixes, and comprehensive Docker deployment documentation for all six components diff --git a/docker-compose.monorepo.yml b/docker-compose.monorepo.yml index 883973f3f63..8b940723e58 100644 --- a/docker-compose.monorepo.yml +++ b/docker-compose.monorepo.yml @@ -20,6 +20,31 @@ services: - "host.docker.internal:host-gateway" restart: unless-stopped + backend-ai-agent: + image: backend.ai/agent:26.1.0rc1 + container_name: backend-ai-agent + networks: + - half + ports: + - "6001:6001" # RPC (Manager -> Agent) + - "6003:6003" # Metrics/Internal API + # NOTE: Do NOT map service ports (30000-31000) here. + # In DooD mode, session containers bind these ports directly on the host. + # Mapping them to the agent container would cause port conflicts. 
+ volumes: + - ./agent.toml:/etc/backend.ai/agent.toml:ro + - /var/run/docker.sock:/var/run/docker.sock # DooD: host Docker daemon + - /tmp/backend.ai/ipc:/tmp/backend.ai/ipc # IPC with session containers + - /tmp/scratches:/tmp/scratches # Scratch storage (must use host path) + - /tmp/backend-ai-krunner:/tmp/backend-ai-krunner # Shared krunner files for DooD + environment: + - PYTHONUNBUFFERED=1 + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + depends_on: + - backend-ai-manager + backend-ai-webserver: image: backend.ai/webserver:26.1.0rc1 container_name: backend-ai-webserver @@ -35,6 +60,51 @@ services: depends_on: - backend-ai-manager + backend-ai-storage-proxy: + image: backend.ai/storage-proxy:26.1.0rc1 + container_name: backend-ai-storage-proxy + networks: + - half + ports: + - "6021:6021" # Client API + - "6022:6022" # Manager API + volumes: + - ./storage-proxy.toml:/etc/backend.ai/storage-proxy.toml:ro + - ./vfroot:/vfroot + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + + backend-ai-appproxy-coordinator: + image: backend.ai/appproxy-coordinator:26.1.0rc1 + container_name: backend-ai-appproxy-coordinator + networks: + - half + ports: + - "10200:10200" + volumes: + - ./proxy-coordinator.toml:/etc/backend.ai/proxy-coordinator.toml:ro + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + depends_on: + - backend-ai-manager + + backend-ai-appproxy-worker: + image: backend.ai/appproxy-worker:26.1.0rc1 + container_name: backend-ai-appproxy-worker + networks: + - half + ports: + - "10205-10300:10205-10300" # Port proxy range + volumes: + - ./proxy-worker.toml:/etc/backend.ai/proxy-worker.toml:ro + extra_hosts: + - "host.docker.internal:host-gateway" + restart: unless-stopped + depends_on: + - backend-ai-appproxy-coordinator + networks: half: external: true diff --git a/docker/agent-entrypoint.sh b/docker/agent-entrypoint.sh new file mode 100644 index 
00000000000..e245626e7e6 --- /dev/null +++ b/docker/agent-entrypoint.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +# DooD (Docker-out-of-Docker) krunner path setup +# +# In DooD mode, the agent creates session containers via the host Docker daemon. +# When the agent bind-mounts krunner files (runner, kernel, helpers) into session +# containers, Docker resolves the source paths on the HOST filesystem, not inside +# the agent container. To make these paths accessible from the host, we: +# 1. Copy krunner packages to a shared volume path (/tmp/backend-ai-krunner/) +# 2. Replace the originals with symlinks so importlib.resources.files() resolves +# to the shared path (via Path.resolve() in resolve_krunner_filepath) + +KRUNNER_SHARED="/tmp/backend-ai-krunner" + +if [ -d "$KRUNNER_SHARED" ]; then + SITE_PKG=$(python3 -c "import site; print(site.getsitepackages()[0])") + for pkg in runner kernel helpers; do + src="$SITE_PKG/ai/backend/$pkg" + dst="$KRUNNER_SHARED/$pkg" + if [ -d "$src" ] && [ ! -L "$src" ]; then + # Fresh container fs: refresh the shared copy and create symlinks. Remove any stale $dst first; otherwise cp -r would copy INTO the existing directory (nesting $dst/$pkg) and the symlink would serve stale files after an image upgrade. + rm -rf "$dst" && cp -r "$src" "$dst" + rm -rf "$src" + ln -s "$dst" "$src" + elif [ -L "$src" ]; then + # Already symlinked (container restart) + : + fi + done +fi + +exec "$@" diff --git a/docker/backend.ai-agent.dockerfile b/docker/backend.ai-agent.dockerfile new file mode 100644 index 00000000000..b1c74c7afed --- /dev/null +++ b/docker/backend.ai-agent.dockerfile @@ -0,0 +1,37 @@ +ARG PYTHON_VERSION +FROM python:${PYTHON_VERSION} AS builder +ARG PKGVER +COPY dist /dist +COPY requirements.txt /requirements.txt +# Install dependencies from requirements.txt to respect version constraints
RUN pip wheel --wheel-dir=/wheels --no-cache-dir -r /requirements.txt +# Install backend.ai packages from /dist (these are not in requirements.txt or PyPI) +RUN pip wheel --wheel-dir=/wheels --no-cache-dir backend.ai-agent==${PKGVER} --find-links=/dist --no-deps + +FROM python:${PYTHON_VERSION} +COPY --from=builder /wheels /wheels
+COPY dist /dist +# Install all wheels and also look in /dist for backend.ai packages +RUN pip install --no-cache-dir --find-links=/dist /wheels/*.whl + +# Install Docker CLI for DooD (Docker-out-of-Docker) operations +RUN install -m 0755 -d /etc/apt/keyrings \ + && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \ + && chmod a+r /etc/apt/keyrings/docker.asc \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian $(. /etc/os-release && echo "$VERSION_CODENAME") stable" > /etc/apt/sources.list.d/docker.list \ + && apt-get update \ + && apt-get install -y --no-install-recommends docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* + +# Create necessary directories +RUN mkdir -p /tmp/backend.ai/ipc /var/log/backend.ai /etc/backend.ai \ + /var/lib/backend.ai + +# Set working directory +WORKDIR /app + +COPY docker/agent-entrypoint.sh /usr/local/bin/agent-entrypoint.sh +RUN chmod +x /usr/local/bin/agent-entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/agent-entrypoint.sh"] +CMD ["backend.ai", "ag", "start-server", "-f", "/etc/backend.ai/agent.toml"] diff --git a/docs/install/install-docker.rst b/docs/install/install-docker.rst index 284a8df2ebe..43dc09faf31 100644 --- a/docs/install/install-docker.rst +++ b/docs/install/install-docker.rst @@ -1,7 +1,8 @@ Install from Docker Containers ============================== -This guide explains how to run Backend.AI components (manager, webserver) using Docker containers built from the monorepo. +This guide explains how to run Backend.AI components using Docker containers built from the monorepo. +All six components are covered: manager, agent, webserver, storage-proxy, appproxy-coordinator, and appproxy-worker. Prerequisites ------------- @@ -24,10 +25,29 @@ Prerequisites docker exec backendai-backendai-half-etcd-1 etcdctl put /sorna/local/config/redis/addr "host.docker.internal:8110" +4. 
Register storage-proxy volumes in etcd: + + .. code-block:: bash + + docker exec backendai-backendai-half-etcd-1 etcdctl put /sorna/local/volumes '{"default:volume1": {"backend": "vfs", "path": "/vfroot/volume1"}}' + + Adjust the volume name and path as needed. The ``path`` must match what the storage-proxy + container mounts (see the storage-proxy configuration below). + +5. Set AppProxy address in the scaling group: + + .. code-block:: bash + + docker exec backendai-backendai-half-db-1 psql -U postgres -d backend -c \ + "UPDATE scaling_groups SET wsproxy_addr = 'http://backend-ai-appproxy-coordinator:10200' WHERE name = 'default';" + + This tells the manager to reach the AppProxy coordinator via the Docker network name. + The ``advertised_addr`` in the AppProxy coordinator config controls what the browser sees. + Building Docker Images ---------------------- -Build the manager and webserver Docker images: +Build the Docker images: .. code-block:: bash @@ -48,6 +68,34 @@ Build the manager and webserver Docker images: --build-arg PKGVER=${PKGVER} \ . + # Build agent image + docker build -f docker/backend.ai-agent.dockerfile \ + -t backend.ai/agent:${PKGVER} \ + --build-arg PYTHON_VERSION=3.13 \ + --build-arg PKGVER=${PKGVER} \ + . + + # Build storage-proxy image + docker build -f docker/backend.ai-storage-proxy.dockerfile \ + -t backend.ai/storage-proxy:${PKGVER} \ + --build-arg PYTHON_VERSION=3.13 \ + --build-arg PKGVER=${PKGVER} \ + . + + # Build appproxy-coordinator image + docker build -f docker/backend.ai-appproxy-coordinator.dockerfile \ + -t backend.ai/appproxy-coordinator:${PKGVER} \ + --build-arg PYTHON_VERSION=3.13 \ + --build-arg PKGVER=${PKGVER} \ + . + + # Build appproxy-worker image + docker build -f docker/backend.ai-appproxy-worker.dockerfile \ + -t backend.ai/appproxy-worker:${PKGVER} \ + --build-arg PYTHON_VERSION=3.13 \ + --build-arg PKGVER=${PKGVER} \ + . 
+ Configuration Files ------------------- @@ -107,6 +155,71 @@ Create ``manager.toml`` with the following content: log-level = "INFO" endpoint = "http://host.docker.internal:4317" +Agent Configuration +~~~~~~~~~~~~~~~~~~~ + +Create ``agent.toml`` with the following content: + +.. code-block:: toml + + # Backend.AI Agent configuration for Docker containers (DooD mode) + + [etcd] + namespace = "local" + addr = { host = "host.docker.internal", port = 8121 } + user = "" + password = "" + + [agent] + mode = "docker" + rpc-listen-addr = { host = "0.0.0.0", port = 6001 } + advertised-rpc-addr = { host = "backend-ai-agent", port = 6001 } + service-addr = { host = "0.0.0.0", port = 6003 } + agent-sock-port = 6007 + scaling-group = "default" + ipc-base-path = "/tmp/backend.ai/ipc" + var-base-path = "/var/lib/backend.ai" + image-commit-path = "/tmp/backend.ai/commit/" + + [container] + port-range = [30000, 31000] + bind-host = "host.docker.internal" + advertised-host = "host.docker.internal" + sandbox-type = "docker" + scratch-type = "hostdir" + scratch-root = "/tmp/scratches" + scratch-size = "1G" + + [resource] + reserved-cpu = 1 + reserved-mem = "1G" + reserved-disk = "8G" + + [logging] + level = "INFO" + drivers = ["console"] + + [logging.console] + colored = true + format = "verbose" + + [debug] + enabled = true + +The agent runs in DooD (Docker-out-of-Docker) mode, using the host's Docker daemon +via the mounted ``/var/run/docker.sock``. Key configuration points: + +- ``agent.advertised-rpc-addr`` must use the container service name (``backend-ai-agent``) + so the manager can reach the agent via the Docker network +- ``container.bind-host`` and ``container.advertised-host`` must be ``host.docker.internal`` + so session containers (created on the host Docker daemon) can reach the agent +- ``container.scratch-root`` must be a **host-accessible path** (e.g. ``/tmp/scratches``). 
+ In DooD mode, the agent tells Docker to bind-mount scratch directories into session containers, + and Docker resolves these paths on the **host** filesystem, not inside the agent container +- The agent entrypoint script automatically sets up krunner file sharing for DooD. + It copies ``runner``, ``kernel``, and ``helpers`` packages to ``/tmp/backend-ai-krunner/`` + and creates symlinks so Docker can find them on the host + Webserver Configuration ~~~~~~~~~~~~~~~~~~~~~~~ @@ -119,7 +232,7 @@ Create ``webserver.conf`` with the following content: [service] ip = "0.0.0.0" port = 8090 - wsproxy.url = "" + wsproxy.url = "http://localhost:10200" mode = "webui" enable_signup = false allow_anonymous_change_password = false @@ -160,7 +273,6 @@ Create ``webserver.conf`` with the following content: [ui] menu_blocklist = "pipeline" - menu_inactivelist = "statistics" [api] domain = "default" @@ -173,8 +285,7 @@ Create ``webserver.conf`` with the following content: [session] [session.redis] - # Use the halfstack Redis service name for container networking - addr = "backendai-half-redis-node01:8110" + addr = "host.docker.internal:8111" [session.redis.redis_helper_config] socket_timeout = 5.0 @@ -191,12 +302,6 @@ Create ``webserver.conf`` with the following content: colored = true format = "verbose" - [logging.pkg-ns] - "" = "WARNING" - "aiotools" = "INFO" - "aiohttp" = "INFO" - "ai.backend" = "INFO" - [debug] enabled = true @@ -209,62 +314,199 @@ Create ``webserver.conf`` with the following content: enabled = true endpoint = "http://host.docker.internal:4000" -Key configuration points for Docker containers: +- ``wsproxy.url`` must point to the AppProxy coordinator URL **as seen by the browser** + (``http://localhost:10200``), not the container-to-container address + +Storage-Proxy Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Create ``storage-proxy.toml`` with the following content: + +.. 
code-block:: toml + + # Backend.AI Storage Proxy configuration for Docker containers + + [etcd] + namespace = "local" + addr = { host = "host.docker.internal", port = 8121 } + user = "" + password = "" + + [storage-proxy] + node-id = "i-storage-proxy-01" + num-proc = 2 + pid-file = "/var/log/backend.ai/storage-proxy.pid" + event-loop = "uvloop" + scandir-limit = 1000 + max-upload-size = "100g" + secret = "some-secret-shared-with-manager" + + [storage-proxy.client] + service-addr = { host = "0.0.0.0", port = 6021 } + + [storage-proxy.manager] + service-addr = { host = "0.0.0.0", port = 6022 } + ipc-base-path = "/tmp/backend.ai/ipc" + + [volume.volume1] + backend = "vfs" + path = "/vfroot/volume1" + + [logging] + level = "INFO" + drivers = ["console"] + + [logging.console] + colored = true + format = "verbose" + + [debug] + enabled = true + +The ``[volume.volume1]`` section must match the etcd volumes configuration from the prerequisites. + +AppProxy Coordinator Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Create ``proxy-coordinator.toml`` with the following content: + +.. 
code-block:: toml + + # Backend.AI AppProxy Coordinator configuration for Docker containers + + [db] + type = "postgresql" + name = "appproxy" + user = "appproxy" + password = "develove" + pool_size = 8 + max_overflow = 64 + addr = { host = "host.docker.internal", port = 8101 } + + [redis] + addr = { host = "host.docker.internal", port = 8111 } + + [proxy_coordinator] + tls_listen = false + tls_advertised = false + allow_unauthorized_configure_request = true + + [proxy_coordinator.bind_addr] + host = "0.0.0.0" + port = 10200 + + [proxy_coordinator.advertised_addr] + host = "localhost" + port = 10200 -- Use ``host.docker.internal`` to access services on the host machine (etcd, PostgreSQL, OTEL) -- Use container service names for inter-container communication (``backend-ai-manager``, ``backendai-half-redis-node01``) -- The webserver endpoint uses multiple values: first for container-to-container, second for browser access + [secrets] + api_secret = "some_api_secret" + jwt_secret = "some_jwt_secret" + + [permit_hash] + secret = "some_permit_hash_secret" + + [logging] + level = "INFO" + drivers = ["console"] + + [logging.console] + colored = true + format = "verbose" + + [debug] + enabled = true + log-events = true + +- ``proxy_coordinator.advertised_addr`` must use ``localhost`` because this address is sent + to the browser for direct AppProxy connections +- ``proxy_coordinator.bind_addr`` uses ``0.0.0.0`` to accept connections from both the + manager container and the host network + +AppProxy Worker Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Create ``proxy-worker.toml`` with the following content: + +.. 
code-block:: toml + + # Backend.AI AppProxy Worker configuration for Docker containers + + [redis] + addr = { host = "host.docker.internal", port = 8111 } + + [proxy_worker] + coordinator_endpoint = "http://backend-ai-appproxy-coordinator:10200" + authority = "worker-1" + tls_listen = false + tls_advertised = false + frontend_mode = "port" + protocol = "http" + accepted_traffics = ["inference", "interactive"] + api_bind_addr = { host = "0.0.0.0", port = 10201 } + api_advertised_addr = { host = "localhost", port = 10201 } + + [proxy_worker.port_proxy] + bind_host = "0.0.0.0" + advertised_host = "localhost" + bind_port_range = [10205, 10300] + + [secrets] + api_secret = "some_api_secret" + jwt_secret = "some_jwt_secret" + + [permit_hash] + secret = "some_permit_hash_secret" + + [logging] + level = "INFO" + drivers = ["console"] + + [logging.console] + colored = true + format = "verbose" + + [debug] + enabled = true + log-events = true + +- ``coordinator_endpoint`` uses the container name for inter-container communication +- ``api_advertised_addr`` and ``port_proxy.advertised_host`` must use ``localhost`` + because these addresses are sent to the browser + +Address Configuration Summary +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In a Docker environment, each address serves a different audience. Use the correct hostname +for each: + +.. list-table:: + :header-rows: 1 + :widths: 40 30 30 + + * - Purpose + - Hostname + - Why + * - Container-to-container (e.g. manager to agent RPC, worker to coordinator) + - Docker service name (e.g. ``backend-ai-agent``) + - Resolved via Docker DNS on the shared network + * - Container-to-halfstack (e.g. to etcd, PostgreSQL, Redis) + - ``host.docker.internal`` + - Halfstack ports are mapped on the host + * - Browser-facing (e.g. 
AppProxy advertised addresses, ``wsproxy.url``) + - ``localhost`` + - Browser runs on the host machine + * - Scaling group ``wsproxy_addr`` (manager to AppProxy) + - ``backend-ai-appproxy-coordinator`` + - Manager container calls AppProxy coordinator container Running with Docker Compose ---------------------------- -The easiest way to run the components is using docker-compose. - -The ``docker-compose.monorepo.yml`` file includes both manager and webserver: - -.. code-block:: yaml - - version: "3.8" - - services: - backend-ai-manager: - image: backend.ai/manager:26.1.0rc1 - container_name: backend-ai-manager - networks: - - half - ports: - - "8091:8091" - volumes: - - ./fixtures:/app/fixtures:ro - - ./manager.toml:/etc/backend.ai/manager.toml:ro - - ./logs:/var/log/backend.ai - - /tmp/backend.ai/ipc:/tmp/backend.ai/ipc - - /var/run/docker.sock:/var/run/docker.sock - environment: - - PYTHONUNBUFFERED=1 - extra_hosts: - - "host.docker.internal:host-gateway" - restart: unless-stopped - - backend-ai-webserver: - image: backend.ai/webserver:26.1.0rc1 - container_name: backend-ai-webserver - networks: - - half - ports: - - "8090:8090" - volumes: - - ./webserver.conf:/etc/backend.ai/webserver.conf:ro - extra_hosts: - - "host.docker.internal:host-gateway" - restart: unless-stopped - depends_on: - - backend-ai-manager - - networks: - half: - external: true - name: backendai_half +The easiest way to run all components is using docker-compose. + +The ``docker-compose.monorepo.yml`` file includes all six services: manager, agent, +webserver, storage-proxy, appproxy-coordinator, and appproxy-worker. Start the services: @@ -293,7 +535,31 @@ Alternatively, you can run the containers manually: -v /var/run/docker.sock:/var/run/docker.sock \ -e PYTHONUNBUFFERED=1 \ --restart unless-stopped \ - backend.ai/manager:26.1.0rc1 + backend.ai/manager:${PKGVER} + +**Agent (DooD):** + +.. 
code-block:: bash + + docker run -d \ + --name backend-ai-agent \ + --network backendai_half \ + --add-host host.docker.internal:host-gateway \ + -p 6001:6001 -p 6003:6003 \ + -v $(pwd)/agent.toml:/etc/backend.ai/agent.toml:ro \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -v /tmp/backend.ai/ipc:/tmp/backend.ai/ipc \ + -v /tmp/scratches:/tmp/scratches \ + -v /tmp/backend-ai-krunner:/tmp/backend-ai-krunner \ + -e PYTHONUNBUFFERED=1 \ + --restart unless-stopped \ + backend.ai/agent:${PKGVER} + +.. note:: + + Do **not** map ports 30000-31000 to the agent container. In DooD mode, session containers + bind these ports directly on the host Docker daemon. Mapping them to the agent container + would cause port conflicts. **Webserver:** @@ -306,7 +572,47 @@ Alternatively, you can run the containers manually: -p 8090:8090 \ -v $(pwd)/webserver.conf:/etc/backend.ai/webserver.conf:ro \ --restart unless-stopped \ - backend.ai/webserver:26.1.0rc1 + backend.ai/webserver:${PKGVER} + +**Storage-Proxy:** + +.. code-block:: bash + + docker run -d \ + --name backend-ai-storage-proxy \ + --network backendai_half \ + --add-host host.docker.internal:host-gateway \ + -p 6021:6021 -p 6022:6022 \ + -v $(pwd)/storage-proxy.toml:/etc/backend.ai/storage-proxy.toml:ro \ + -v $(pwd)/vfroot:/vfroot \ + --restart unless-stopped \ + backend.ai/storage-proxy:${PKGVER} + +**AppProxy Coordinator:** + +.. code-block:: bash + + docker run -d \ + --name backend-ai-appproxy-coordinator \ + --network backendai_half \ + --add-host host.docker.internal:host-gateway \ + -p 10200:10200 \ + -v $(pwd)/proxy-coordinator.toml:/etc/backend.ai/proxy-coordinator.toml:ro \ + --restart unless-stopped \ + backend.ai/appproxy-coordinator:${PKGVER} + +**AppProxy Worker:** + +.. 
code-block:: bash + + docker run -d \ + --name backend-ai-appproxy-worker \ + --network backendai_half \ + --add-host host.docker.internal:host-gateway \ + -p 10201:10201 -p 10205-10300:10205-10300 \ + -v $(pwd)/proxy-worker.toml:/etc/backend.ai/proxy-worker.toml:ro \ + --restart unless-stopped \ + backend.ai/appproxy-worker:${PKGVER} Accessing the Services ---------------------- @@ -315,20 +621,25 @@ After starting the containers: - Web UI: http://localhost:8090 - Manager API: http://localhost:8091 +- AppProxy: http://localhost:10200 - Default credentials: ``admin@lablup.com`` / ``wJalrXUt`` Troubleshooting --------------- Container fails to start -~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~ Check logs: .. code-block:: bash docker logs backend-ai-manager + docker logs backend-ai-agent docker logs backend-ai-webserver + docker logs backend-ai-storage-proxy + docker logs backend-ai-appproxy-coordinator + docker logs backend-ai-appproxy-worker Connection issues ~~~~~~~~~~~~~~~~~ @@ -339,6 +650,41 @@ Ensure all halfstack services are running: docker compose -f docker-compose.halfstack-ha.yml ps +AppProxy 503 / "Failed to fetch" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If the browser shows "Failed to connect to AppProxy" or "Failed to fetch" when opening +session apps: + +1. Verify the scaling group has the correct ``wsproxy_addr``: + + .. code-block:: bash + + docker exec backendai-backendai-half-db-1 psql -U postgres -d backend -c \ + "SELECT wsproxy_addr FROM scaling_groups WHERE name = 'default';" + + It should show ``http://backend-ai-appproxy-coordinator:10200``. + +2. Verify AppProxy advertised addresses use ``localhost``: + + .. code-block:: bash + + curl -s http://localhost:10200/status | python3 -m json.tool + + The ``advertise_address`` should be ``http://localhost:10200``. + +3. Verify the webserver ``wsproxy.url`` is set to ``http://localhost:10200``. 
+ +DooD agent: session creation fails +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If session containers fail to start with "bind source path does not exist": + +- Ensure ``/tmp/backend-ai-krunner`` is mounted as a volume to the agent container. + The entrypoint script populates this directory with krunner files on first start. +- Ensure ``scratch-root`` in the agent config uses a host-accessible path + (e.g. ``/tmp/scratches``, not ``/app/scratches``). + pycares version mismatch ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -347,7 +693,7 @@ If you encounter DNS resolution errors, verify the pycares version: .. code-block:: bash - docker run --rm backend.ai/webserver:26.1.0rc1 pip list | grep pycares + docker run --rm backend.ai/webserver:${PKGVER} pip list | grep pycares It should show version 4.11.0. @@ -364,5 +710,7 @@ Or manually: .. code-block:: bash - docker stop backend-ai-manager backend-ai-webserver - docker rm backend-ai-manager backend-ai-webserver + docker stop backend-ai-manager backend-ai-agent backend-ai-webserver \ + backend-ai-storage-proxy backend-ai-appproxy-coordinator backend-ai-appproxy-worker + docker rm backend-ai-manager backend-ai-agent backend-ai-webserver \ + backend-ai-storage-proxy backend-ai-appproxy-coordinator backend-ai-appproxy-worker diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index f5900257006..bc1bd502e56 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -1083,8 +1083,15 @@ async def start_container( if sport["name"] in self.protected_services: protected_service_ports.update(sport["container_ports"]) for eport in exposed_ports: - if eport in self.repl_ports or eport in protected_service_ports: # always protected - host_ips.append("127.0.0.1") + if eport in self.repl_ports or eport in protected_service_ports: + # In DooD (Docker-out-of-Docker) mode, the agent runs inside a container + # but creates session containers via the host Docker daemon. 
+ # REPL ports must be accessible from the agent container, so they need to + # bind to the same host as other ports when advertised_host is configured. + if advertised_kernel_host: + host_ips.append(str(container_bind_host)) + else: + host_ips.append("127.0.0.1") else: host_ips.append(str(container_bind_host)) if len(host_ips) != len(host_ports) or len(host_ports) != len(exposed_ports): diff --git a/src/ai/backend/agent/docker/kernel.py b/src/ai/backend/agent/docker/kernel.py index 6b2d96e1419..38e3a9f0510 100644 --- a/src/ai/backend/agent/docker/kernel.py +++ b/src/ai/backend/agent/docker/kernel.py @@ -95,7 +95,7 @@ async def create_code_runner( self.kernel_id, self.session_id, event_producer, - kernel_host="127.0.0.1", # repl ports are always bound to 127.0.0.1 + kernel_host=self.data.get("kernel_host", "127.0.0.1"), repl_in_port=self.data["repl_in_port"], repl_out_port=self.data["repl_out_port"], exec_timeout=0,