Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions .github/workflows/smoke-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Smoke-test workflow: runs the agent smoke suite on pull requests that touch
# droidrun/**, uploads the produced artifacts, and keeps a single PR comment
# (identified by a hidden HTML marker) up to date with the summary.
name: Smoke Tests

on:
  pull_request:
    paths:
      - "droidrun/**"

# Only the newest run per PR branch is kept; older in-flight runs are cancelled.
concurrency:
  group: smoke-${{ github.head_ref }}
  cancel-in-progress: true

jobs:
  smoke:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
      # Needed by the "Comment on PR" step to create/update the summary comment.
      pull-requests: write

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - uses: astral-sh/setup-uv@v4

      - name: Install dependencies
        run: uv sync --all-groups

      - name: Run smoke tests
        env:
          # NOTE: an unset secret expands to an empty string, so consumers must
          # treat "" the same as "not configured".
          MOBILERUN_API_KEY: ${{ secrets.MOBILERUN_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }}
        run: uv run python -m tests.smoke.run --output-dir=artifacts

      # always(): publish whatever was produced even when the tests failed.
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: smoke-test-results
          path: artifacts/

      - name: Comment on PR
        if: always()
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = 'artifacts/summary.md';

            let body;
            if (fs.existsSync(path)) {
              body = fs.readFileSync(path, 'utf8');
              // Add artifact download link
              const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
              body += `\n\n---\n[Download all artifacts](${runUrl}#artifacts)`;
            } else {
              body = '## Smoke Tests\n\n**Failed to generate results.** Check the [workflow run](' +
                `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` +
                ') for details.';
            }

            // Find existing smoke test comment to update.
            // Paginate: a single listComments call only returns the first page,
            // so on PRs with many comments the marker comment would be missed
            // and a duplicate would be posted on every run.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            const marker = '<!-- smoke-test-results -->';
            body = marker + '\n' + body;
            const existing = comments.find(c => c.body?.includes(marker));

            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }
Empty file added tests/__init__.py
Empty file.
Empty file added tests/smoke/__init__.py
Empty file.
126 changes: 126 additions & 0 deletions tests/smoke/agent_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Run DroidAgent from source against a cloud device."""

import logging
import os
from dataclasses import dataclass, field
from uuid import uuid4

from pydantic import BaseModel, Field

from droidrun import DroidAgent, DroidrunConfig, load_llm
from droidrun.agent.common.events import ScreenshotEvent, ToolExecutionEvent
from droidrun.agent.droid.events import ResultEvent
from droidrun.tools.driver.cloud import CloudDriver

from tests.smoke.config import SmokeTestConfig

# Logger named "smoke"; run_agent reports failed runs through it.
logger = logging.getLogger("smoke")

# Model identifier handed to load_llm("GoogleGenAI", ...) for every smoke run.
LLM_MODEL = "gemini-3.1-flash-lite-preview"


class AndroidVersion(BaseModel):
    """Structured output model for extracting Android version.

    ``run_agent`` passes this class to ``DroidAgent`` as ``output_model`` when
    the smoke test has ``output_schema`` set.
    """

    # Version string; the Field description gives example values '14' and '15'.
    android_version: str = Field(description="The Android version number (e.g. '14', '15')")


@dataclass
class RunResult:
    """Container for everything ``run_agent`` collects from one agent run."""

    # Value obtained by awaiting the agent handler; stays None if the run raised first.
    result: ResultEvent | None = None
    # ``screenshot`` payload of each ScreenshotEvent seen on the event stream.
    screenshots: list[bytes] = field(default_factory=list)
    # Each ToolExecutionEvent seen on the event stream, in arrival order.
    tool_events: list[ToolExecutionEvent] = field(default_factory=list)
    # Message describing the exception that aborted the run; None when nothing was raised.
    error: str | None = None
    # Session id generated when Langfuse tracing is enabled; None otherwise.
    langfuse_session_id: str | None = None


async def run_agent(
    test_config: SmokeTestConfig,
    device_id: str,
    api_key: str,
    base_url: str,
    trajectory_dir: str | None = None,
    langfuse_host: str | None = None,
) -> RunResult:
    """Run a single smoke test agent and collect results.

    Args:
        test_config: Matrix entry describing the task, step budget and the
            reasoning/vision/credentials/output-schema toggles.
        device_id: Identifier of the cloud device, forwarded to ``CloudDriver``.
        api_key: API key forwarded to ``CloudDriver``.
        base_url: Base URL forwarded to ``CloudDriver``.
        trajectory_dir: When given, trajectories (including GIFs) are saved
            there; otherwise trajectory saving is switched off.
        langfuse_host: Explicit Langfuse host. Falls back to the
            ``LANGFUSE_HOST`` environment variable and then to the US cloud
            endpoint. Empty values are treated as "not set".

    Returns:
        A ``RunResult``. This function never raises for agent failures: any
        exception is logged with its traceback and recorded in
        ``RunResult.error`` (always a non-empty string in that case).

    Side effects:
        Sets ``DROIDRUN_STREAM_SCREENSHOTS=1`` in the process environment.
    """
    run_result = RunResult()

    # Ensure screenshots are emitted even for non-vision runs
    os.environ["DROIDRUN_STREAM_SCREENSHOTS"] = "1"

    try:
        driver = CloudDriver(
            device_id=device_id,
            api_key=api_key,
            base_url=base_url,
        )

        config = DroidrunConfig()
        config.agent.reasoning = test_config.reasoning
        config.agent.max_steps = test_config.max_steps
        config.agent.streaming = False
        # The same vision toggle is applied to every agent role.
        config.agent.fast_agent.vision = test_config.vision
        config.agent.manager.vision = test_config.vision
        config.agent.executor.vision = test_config.vision
        config.telemetry.enabled = False

        # Trajectory writer
        if trajectory_dir:
            config.logging.save_trajectory = "all"
            config.logging.trajectory_path = trajectory_dir
            config.logging.trajectory_gifs = True
        else:
            config.logging.save_trajectory = "none"

        # Langfuse tracing: enabled only when both keys are present and non-empty.
        langfuse_secret = os.environ.get("LANGFUSE_SECRET_KEY", "")
        langfuse_public = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
        if langfuse_secret and langfuse_public:
            session_id = str(uuid4())
            run_result.langfuse_session_id = session_id
            config.tracing.enabled = True
            config.tracing.provider = "langfuse"
            config.tracing.langfuse_secret_key = langfuse_secret
            config.tracing.langfuse_public_key = langfuse_public
            # Chain with `or` instead of a .get() default: CI exports
            # LANGFUSE_HOST even when the secret is unset, which yields an
            # empty string that must not override the default host.
            config.tracing.langfuse_host = (
                langfuse_host
                or os.environ.get("LANGFUSE_HOST")
                or "https://us.cloud.langfuse.com"
            )
            config.tracing.langfuse_session_id = session_id
            config.tracing.langfuse_user_id = "smoke-test"
        else:
            config.tracing.enabled = False

        llm = load_llm("GoogleGenAI", model=LLM_MODEL)

        # Fixed dummy credential made available to the agent when requested.
        credentials = None
        if test_config.credentials:
            credentials = {"test-account": "smoketest123"}

        output_model = None
        if test_config.output_schema:
            output_model = AndroidVersion

        agent = DroidAgent(
            goal=test_config.task,
            config=config,
            llms=llm,
            driver=driver,
            credentials=credentials,
            output_model=output_model,
            timeout=300,
        )

        # Drain the event stream first, collecting the events the assertions
        # need, then await the handler for the final result.
        handler = agent.run()
        async for event in handler.stream_events():
            if isinstance(event, ScreenshotEvent):
                run_result.screenshots.append(event.screenshot)
            elif isinstance(event, ToolExecutionEvent):
                run_result.tool_events.append(event)

        run_result.result = await handler

    except Exception as e:
        # logger.exception keeps the traceback, which is the only debugging
        # aid available for a failure on a remote CI runner.
        logger.exception("Agent run failed: %s", e)
        # Some exceptions (e.g. a bare TimeoutError) stringify to "", which
        # would make the error look absent to truthiness checks.
        run_result.error = str(e) or repr(e)

    return run_result
84 changes: 84 additions & 0 deletions tests/smoke/assertions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Smoke test assertions."""

import re
import logging

# Logger named "smoke"; run_assertions writes its PASS/FAIL lines to it.
logger = logging.getLogger("smoke")


class SmokeAssertionError(AssertionError):
    """Raised by the ``assert_*`` helpers when a smoke-test check fails.

    Derives from the built-in ``AssertionError`` (and therefore from
    ``Exception``), so handlers written against either the built-in or this
    module's exception keep working.
    """


# Backward-compatible alias. Earlier versions of this module defined their own
# ``AssertionError`` that shadowed the built-in without being related to it;
# the name is kept so existing ``raise AssertionError(...)`` statements and
# ``from ... import AssertionError`` imports are unaffected. The alias must
# come after the class definition so the base class above resolves to the
# built-in.
AssertionError = SmokeAssertionError  # noqa: A001


def assert_result_success(result) -> None:
    """Fail unless the agent's result is flagged as successful.

    Raises:
        AssertionError: ``result.success`` is falsy; the message includes
            ``result.reason``.
    """
    if result.success:
        return
    failure_reason = result.reason
    raise AssertionError(f"Agent reported failure: {failure_reason}")


def assert_structured_output(result) -> None:
    """Check that the result carries a plausible Android version.

    The structured output must exist, expose a non-None ``android_version``
    attribute, and that value's string form must begin with a digit.

    Raises:
        AssertionError: any of the three conditions above is violated.
    """
    structured = result.structured_output
    if structured is None:
        raise AssertionError("No structured output returned")

    android_version = getattr(structured, "android_version", None)
    if android_version is None:
        message = f"structured_output missing 'android_version' field: {structured}"
        raise AssertionError(message)

    starts_with_digits = re.match(r"^\d+", str(android_version)) is not None
    if starts_with_digits:
        return
    raise AssertionError(
        f"android_version doesn't look like a version: '{android_version}'"
    )


def assert_type_secret_called(tool_events: list) -> None:
    """Require at least one successful ``type_secret`` tool execution.

    Raises:
        AssertionError: no event has ``tool_name == "type_secret"`` together
            with a truthy ``success``; the message lists every tool name seen.
    """
    secret_typed = any(
        ev.tool_name == "type_secret" and ev.success for ev in tool_events
    )
    if secret_typed:
        return

    called = [ev.tool_name for ev in tool_events]
    raise AssertionError(
        f"type_secret not found or failed in tool events. Tools called: {called}"
    )


def assert_package_name(ui_state, expected_substring: str) -> None:
    """Require the foreground package name to contain ``expected_substring``.

    The comparison is case-insensitive; a missing package name is treated as
    an empty string.

    Raises:
        AssertionError: the substring does not occur in the package name.
    """
    current_package = ui_state.phone_state.package_name or ""
    if expected_substring.lower() in current_package.lower():
        return
    raise AssertionError(
        f"Expected package containing '{expected_substring}', got '{current_package}'"
    )


# Registry of available assertions, keyed by name. Each value takes the shared
# context dict and forwards the entries the underlying helper needs:
#   "result"           -> assert_result_success / assert_structured_output
#   "tool_events"      -> assert_type_secret_called
#   "ui_state" and "expected_package" -> assert_package_name
# A context lacking one of these keys makes the corresponding lambda raise KeyError.
ASSERTION_MAP = {
    "result_success": lambda ctx: assert_result_success(ctx["result"]),
    "structured_output": lambda ctx: assert_structured_output(ctx["result"]),
    "type_secret_called": lambda ctx: assert_type_secret_called(ctx["tool_events"]),
    "package_name": lambda ctx: assert_package_name(
        ctx["ui_state"], ctx["expected_package"]
    ),
}


def run_assertions(assertion_names: list[str], context: dict) -> list[str]:
    """Run named assertions and return list of failure messages.

    Args:
        assertion_names: Keys into ``ASSERTION_MAP``, run in the given order.
        context: Shared dict from which each assertion pulls its inputs.

    Returns:
        One message per failed assertion (empty list when everything passed).
        Unknown names are reported as failures rather than raising, and every
        assertion is attempted even after an earlier one failed.
    """
    failures: list[str] = []
    for name in assertion_names:
        check = ASSERTION_MAP.get(name)
        if check is None:
            failures.append(f"Unknown assertion: {name}")
            continue
        try:
            check(context)
        except KeyError as e:
            # str(KeyError) is only the quoted key, which makes for a cryptic
            # failure line; say explicitly that a lookup failed.
            detail = f"missing key {e}"
        except Exception as e:
            # Catch-all is deliberate: one broken check (or a custom assertion
            # error) must not abort the remaining assertions.
            detail = str(e) or type(e).__name__
        else:
            logger.info(" PASS: %s", name)
            continue
        logger.error(" FAIL: %s — %s", name, detail)
        failures.append(f"{name}: {detail}")
    return failures
58 changes: 58 additions & 0 deletions tests/smoke/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Smoke test matrix configuration."""

from dataclasses import dataclass, field


@dataclass
class SmokeTestConfig:
    """One entry of the smoke-test matrix."""

    # Identifier of the test case (e.g. "fast-vision").
    name: str
    # Whether the agent runs with reasoning enabled.
    reasoning: bool
    # Whether the agent runs with vision enabled.
    vision: bool
    # Step budget for the agent.
    max_steps: int
    # Natural-language goal given to the agent.
    task: str
    # Whether the run requests structured output.
    output_schema: bool = False
    # Whether the run is supplied with test credentials.
    credentials: bool = False
    # Substring expected in the foreground package name — presumably consumed
    # by the "package_name" assertion; verify against the runner.
    expected_package: str = ""
    # Names of the assertions to evaluate for this test case.
    assertions: list[str] = field(default_factory=list)


# Smoke-test matrix: every combination of reasoning on/off and vision on/off.
# The reasoning variants get a step budget of 30, the fast variants 15.
SMOKE_TESTS: list[SmokeTestConfig] = [
    SmokeTestConfig(
        name="fast-no-vision",
        reasoning=False,
        vision=False,
        max_steps=15,
        task="Go to Settings and find the Android version number",
        expected_package="settings",
        assertions=["result_success", "package_name"],
    ),
    # Only case that requests structured output.
    SmokeTestConfig(
        name="fast-vision",
        reasoning=False,
        vision=True,
        max_steps=15,
        task="Go to Settings and find the Android version number",
        output_schema=True,
        expected_package="settings",
        assertions=["result_success", "structured_output", "package_name"],
    ),
    # Only case that supplies credentials and checks for the type_secret tool.
    # NOTE(review): expected_package is set but "package_name" is not in the
    # assertions list, and "result_success" is not checked either — confirm
    # this is intentional.
    SmokeTestConfig(
        name="reasoning-no-vision",
        reasoning=True,
        vision=False,
        max_steps=30,
        task="Open Chrome, tap the search bar, and use the type_secret tool to type the saved credential into it",
        credentials=True,
        expected_package="chrome",
        assertions=["type_secret_called"],
    ),
    SmokeTestConfig(
        name="reasoning-vision",
        reasoning=True,
        vision=True,
        max_steps=30,
        task="Go to Settings and find the Android version number",
        expected_package="settings",
        assertions=["result_success", "package_name"],
    ),
]
Loading
Loading