diff --git a/apps/app/src/app/constants.ts b/apps/app/src/app/constants.ts
index fa52e09c0b..fc64403092 100644
--- a/apps/app/src/app/constants.ts
+++ b/apps/app/src/app/constants.ts
@@ -156,6 +156,18 @@ export const MCP_QUICK_CONNECT: McpDirectoryInfo[] = [
composerPrompt: "Use the Chrome extension to ",
defaultEnabled: true,
},
+ {
+ id: "handsfree-computer-use",
+ name: "HandsFree Computer Use",
+ serverName: "handsfree-computer-use",
+ description: "Control macOS apps through semantic accessibility refs, background-safe clicks, screenshots, keyboard input, and strict mode.",
+ type: "local",
+ command: ["npx", "-y", "@openwork/handsfree", "mcp"],
+ oauth: false,
+ kind: "extension",
+ iconSrc: "/openwork-mark.svg",
+ composerPrompt: "Use HandsFree Computer Use to ",
+ },
{
id: "openai-image-gen",
name: "OpenAI Image Gen",
diff --git a/apps/app/src/app/lib/desktop.ts b/apps/app/src/app/lib/desktop.ts
index 91ab00a714..10eeac56e3 100644
--- a/apps/app/src/app/lib/desktop.ts
+++ b/apps/app/src/app/lib/desktop.ts
@@ -157,6 +157,7 @@ declare global {
initialDeepLinks?: string[];
platform?: "darwin" | "linux" | "windows";
version?: string;
+ browserCdpPort?: string;
};
};
}
diff --git a/apps/app/src/react-app/design-system/extension-card.tsx b/apps/app/src/react-app/design-system/extension-card.tsx
index 92e1b642ba..d9486e58d6 100644
--- a/apps/app/src/react-app/design-system/extension-card.tsx
+++ b/apps/app/src/react-app/design-system/extension-card.tsx
@@ -108,15 +108,11 @@ export function ExtensionCard(props: ExtensionCardProps) {
{name}
- {connected ? (
-
- Connected
-
- ) : (
+ {!connected ? (
{kindLabel[kind]}
- )}
+ ) : null}
{description}
{!connected && !connecting && actionLabel ? (
diff --git a/apps/app/src/react-app/design-system/extension-detail-modal.tsx b/apps/app/src/react-app/design-system/extension-detail-modal.tsx
index f6823dbf7f..d894a26da9 100644
--- a/apps/app/src/react-app/design-system/extension-detail-modal.tsx
+++ b/apps/app/src/react-app/design-system/extension-detail-modal.tsx
@@ -255,10 +255,10 @@ export function ExtensionDetailModal(props: ExtensionDetailModalProps) {
) : null}
- {kind === "ui-control" ? (
+ {launchCommand ? (
Launch
- {(launchCommand ?? fallbackUiControlCommand).join(" ")}
+ {launchCommand.join(" ")}
) : null}
diff --git a/apps/app/src/react-app/domains/connections/store.ts b/apps/app/src/react-app/domains/connections/store.ts
index f2899bd139..ba4b3f71a7 100644
--- a/apps/app/src/react-app/domains/connections/store.ts
+++ b/apps/app/src/react-app/domains/connections/store.ts
@@ -273,6 +273,16 @@ export function createConnectionsStore(options: {
};
const resolveLocalMcpCommand = async (entry: McpDirectoryInfo) => {
+ if (entry.serverName === "handsfree-computer-use") {
+ try {
+ const command = await (window as any).__OPENWORK_ELECTRON__?.invokeDesktop?.("getHandsFreeMcpCommand");
+ if (Array.isArray(command) && command.every((part) => typeof part === "string") && command.length > 0) {
+ return command;
+ }
+ } catch {
+ // Fall through to the published package command.
+ }
+ }
if (entry.serverName !== "openwork-ui") {
return entry.command;
}
diff --git a/apps/app/src/react-app/domains/session/surface/composer/composer.tsx b/apps/app/src/react-app/domains/session/surface/composer/composer.tsx
index 51aa3ac046..6d790bb958 100644
--- a/apps/app/src/react-app/domains/session/surface/composer/composer.tsx
+++ b/apps/app/src/react-app/domains/session/surface/composer/composer.tsx
@@ -678,7 +678,16 @@ export function ReactSessionComposer(props: ComposerProps) {
};
const applyExtensionSelection = (entry: McpDirectoryInfo) => {
- props.onDraftChange(entry.composerPrompt ?? `Use ${entry.name} to `);
+ // Seed the composer draft for the picked extension, then close the tool menu.
+ // Default draft: the entry's own prompt, or a generic "Use <name> to ".
+ const fallbackPrompt = entry.composerPrompt ?? `Use ${entry.name} to `;
+ // The preload exposes the raw port override as a string. Parse it the same
+ // way the Electron main process does (base-10 integer, must be > 0) so a
+ // malformed value never ends up inside the browser_url handed to the agent.
+ const parsedPort = Number.parseInt(
+   window.__OPENWORK_ELECTRON__?.meta?.browserCdpPort?.trim() ?? "",
+   10,
+ );
+ const port = Number.isFinite(parsedPort) && parsedPort > 0 ? parsedPort : null;
+ props.onDraftChange(
+   entry.id === "openwork-browser" && port !== null
+     ? `Use the OpenWork Browser extension with browser_url "http://127.0.0.1:${port}". Do not use any other browser_url. `
+     : fallbackPrompt,
+ );
setToolMenuOpen(false);
};
diff --git a/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx b/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx
index f4c5e59a83..3e479e2572 100644
--- a/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx
+++ b/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx
@@ -196,6 +196,7 @@ export function McpView(props: McpViewProps) {
const [detailSkillContent, setDetailSkillContent] = useState(null);
const [openworkUiMcpCommand, setOpenworkUiMcpCommand] = useState(null);
const [openworkUiMcpEnvironment, setOpenworkUiMcpEnvironment] = useState | null>(null);
+ const [handsFreeMcpCommand, setHandsFreeMcpCommand] = useState(null);
const [search, setSearch] = useState("");
const [filter, setFilter] = useState("all");
const [, setExtensionStateVersion] = useState(0);
@@ -266,9 +267,14 @@ export function McpView(props: McpViewProps) {
),
));
}
+ const handsFreeCommand = await (window as any).__OPENWORK_ELECTRON__?.invokeDesktop?.("getHandsFreeMcpCommand");
+ if (Array.isArray(handsFreeCommand) && handsFreeCommand.every((part) => typeof part === "string")) {
+ setHandsFreeMcpCommand(handsFreeCommand);
+ }
} catch {
setOpenworkUiMcpCommand(null);
setOpenworkUiMcpEnvironment(null);
+ setHandsFreeMcpCommand(null);
}
})();
}, []);
@@ -341,6 +347,15 @@ export function McpView(props: McpViewProps) {
const isQuickConnectConfigured = (entry: McpDirectoryInfo) =>
props.mcpServers.some((server) => server.name === getMcpIdentityKey(entry));
+ // True for "extension" entries that also carry MCP server details
+ // (a transport type, a non-empty launch command, or a URL).
+ const isMcpBackedExtension = (entry: McpDirectoryInfo) => {
+   if (entry.kind !== "extension") return false;
+   const hasServerDetails = entry.type || entry.command?.length || entry.url;
+   return Boolean(hasServerDetails);
+ };
+
+ // Pick the launch command to display for an entry:
+ //  - "openwork-ui": only the desktop-provided command (undefined until loaded);
+ //  - "handsfree-computer-use": the desktop-provided command, else the entry's own;
+ //  - anything else: the command declared on the entry.
+ const launchCommandForEntry = (entry: McpDirectoryInfo) => {
+   if (entry.serverName === "openwork-ui") return openworkUiMcpCommand ?? undefined;
+   if (entry.serverName === "handsfree-computer-use") {
+     // An empty array is not a usable command; `??` alone would let it through
+     // and hide the entry's published command, so fall back on length as well.
+     return handsFreeMcpCommand?.length ? handsFreeMcpCommand : entry.command;
+   }
+   return entry.command;
+ };
+
const supportsOauth = (entry: McpServerEntry) =>
entry.config.type === "remote" && entry.config.oauth !== false;
@@ -468,7 +483,7 @@ export function McpView(props: McpViewProps) {
busy={props.busy}
connectingName={props.mcpConnectingName}
isConfigured={(entry) =>
- entry.kind === "extension"
+ entry.kind === "extension" && !isMcpBackedExtension(entry)
? (entry.defaultEnabled ? isOpenWorkExtensionEnabled(entry) : props.isExtensionConnected?.(entry) ?? false)
: isQuickConnectConfigured(entry)
}
@@ -575,7 +590,7 @@ export function McpView(props: McpViewProps) {
{detailEntry ? (() => {
const extensionConfigSlot = props.configSlotForEntry?.(detailEntry) ?? null;
const hasConfigSlot = extensionConfigSlot !== null;
- const isConnected = detailEntry.kind === "extension"
+ const isConnected = detailEntry.kind === "extension" && !isMcpBackedExtension(detailEntry)
? (detailEntry.defaultEnabled ? isOpenWorkExtensionEnabled(detailEntry) : props.isExtensionConnected?.(detailEntry) ?? false)
: isQuickConnectConfigured(detailEntry);
return (
@@ -590,19 +605,19 @@ export function McpView(props: McpViewProps) {
kind={detailEntry.kind ?? "mcp"}
connected={isConnected}
connecting={props.mcpConnectingName === detailEntry.name}
- launchCommand={detailEntry.serverName === "openwork-ui" ? openworkUiMcpCommand ?? undefined : undefined}
+ launchCommand={launchCommandForEntry(detailEntry)}
environment={detailEntry.serverName === "openwork-ui" ? openworkUiMcpEnvironment ?? undefined : undefined}
url={typeof detailEntry.url === "string" ? detailEntry.url : undefined}
oauth={detailEntry.oauth}
configSlot={extensionConfigSlot}
- onConnect={detailEntry.defaultEnabled ? () => {
+ onConnect={detailEntry.defaultEnabled && !isMcpBackedExtension(detailEntry) ? () => {
setOpenWorkExtensionEnabled(detailEntry, true);
setDetailEntry(null);
} : hasConfigSlot ? undefined : () => {
props.connectMcp(detailEntry);
setDetailEntry(null);
}}
- onUninstall={detailEntry.defaultEnabled && isConnected ? () => {
+ onUninstall={detailEntry.defaultEnabled && !isMcpBackedExtension(detailEntry) && isConnected ? () => {
setOpenWorkExtensionEnabled(detailEntry, false);
} : isQuickConnectConfigured(detailEntry) ? () => {
const slug = getMcpIdentityKey(detailEntry);
diff --git a/apps/app/src/react-app/domains/settings/shell/settings-page.tsx b/apps/app/src/react-app/domains/settings/shell/settings-page.tsx
index 05246742ec..4780b4aa66 100644
--- a/apps/app/src/react-app/domains/settings/shell/settings-page.tsx
+++ b/apps/app/src/react-app/domains/settings/shell/settings-page.tsx
@@ -3,7 +3,6 @@ import type * as React from "react";
import {
ArrowLeft,
Bug,
- ChevronDown,
CloudCog,
Cog,
Container,
@@ -33,12 +32,6 @@ import {
SidebarMenuButton,
SidebarMenuItem,
} from "@/components/ui/sidebar";
-import {
- DropdownMenu,
- DropdownMenuContent,
- DropdownMenuItem,
- DropdownMenuTrigger,
-} from "@/components/ui/dropdown-menu";
import { t } from "../../../../i18n";
import type { SettingsTab } from "../../../../app/types";
import {
@@ -53,7 +46,6 @@ import {
SettingsPanelToolbarMessage,
SettingsPanelToolbarStatus,
} from "./panel";
-import { WorkspaceIcon } from "../../../design-system/workspace-icon";
export function getSettingsTabIcon(tab: SettingsTab) {
switch (tab) {
@@ -234,31 +226,6 @@ export function SettingsSidebar(props: SettingsSidebarProps) {
{t("dashboard.back_to_app")}
-
-
-
-
- {props.selectedWorkspaceName}
-
-
- }
- />
-
- {props.workspaces.map((workspace) => (
- props.onSelectWorkspace(workspace.id)}
- disabled={workspace.id === props.selectedWorkspaceId}
- >
-
- {workspace.name}
-
- ))}
-
-
-
diff --git a/apps/desktop/electron/main.mjs b/apps/desktop/electron/main.mjs
index 5288287cd2..7fba502019 100644
--- a/apps/desktop/electron/main.mjs
+++ b/apps/desktop/electron/main.mjs
@@ -226,15 +226,15 @@ if (process.platform === "darwin" && APP_ICON_IMAGE && !APP_ICON_IMAGE.isEmpty()
// Expose Chrome DevTools Protocol so the opencode-chrome-devtools plugin can
// drive the built-in browser panel. Use OPENWORK_ELECTRON_REMOTE_DEBUG_PORT to
-// pin a specific port; otherwise pick a default (9223) that stays out of the
-// way of common dev-tools ports (9222 = Chrome, 9229 = Node inspector).
+// pin a specific port. Prod defaults to 9223; dev defaults to 9823 so both
+// apps can run side by side without the dev browser tools attaching to prod.
const explicitCdpPort = Number.parseInt(
process.env.OPENWORK_ELECTRON_REMOTE_DEBUG_PORT?.trim() ?? "",
10,
);
const remoteDebugPort = Number.isFinite(explicitCdpPort) && explicitCdpPort > 0
? explicitCdpPort
- : 9223;
+ : isDevMode ? 9823 : 9223;
app.commandLine.appendSwitch("remote-debugging-port", String(remoteDebugPort));
app.commandLine.appendSwitch("remote-debugging-address", "127.0.0.1");
// Make the port available to the embedded server so it can pass it to OpenCode.
@@ -2362,6 +2362,12 @@ async function handleDesktopInvoke(event, command, ...args) {
}
return ["npx", "-y", "openwork-ui-mcp"];
}
+ case "getHandsFreeMcpCommand": {
+ if (process.env.OPENWORK_DEV_MODE === "1") {
+ return ["node", path.resolve(__dirname, "../../..", "packages/handsfree/bin/openwork-handsfree-computer-use.mjs"), "mcp"];
+ }
+ return ["npx", "-y", "@openwork/handsfree", "mcp"];
+ }
case "getOpenworkUiMcpEnvironment": {
return {
OPENWORK_UI_CONTROL_DISCOVERY: path.join(app.getPath("userData"), "openwork-ui-control.json"),
diff --git a/apps/desktop/electron/preload.mjs b/apps/desktop/electron/preload.mjs
index c8a50f7c4e..9c1c31beb3 100644
--- a/apps/desktop/electron/preload.mjs
+++ b/apps/desktop/electron/preload.mjs
@@ -133,6 +133,7 @@ contextBridge.exposeInMainWorld("__OPENWORK_ELECTRON__", {
initialDeepLinks: [],
platform: normalizePlatform(process.platform),
version: process.versions.electron,
+ browserCdpPort: process.env.OPENWORK_ELECTRON_REMOTE_DEBUG_PORT || undefined,
},
});
diff --git a/apps/server/src/workspace-init.ts b/apps/server/src/workspace-init.ts
index 7e5963adf8..b37c8b8a55 100644
--- a/apps/server/src/workspace-init.ts
+++ b/apps/server/src/workspace-init.ts
@@ -47,6 +47,7 @@ Browser tools (\`browser_navigate\`, \`browser_snapshot\`, \`browser_click\`, \`
- \`browser_url\`: always use \`"http://127.0.0.1:{{BROWSER_CDP_PORT}}"\`.
- Use for general browsing tasks. The user sees what you do in real time.
- Always call \`browser_list\` first to discover available targets, then use the appropriate \`target_id\`.
+- Do not scan common CDP ports or fall back to another port. If this endpoint is unavailable, report that the built-in browser is unavailable.
**Chrome (external browser)**:
- Use when the user needs their real cookies, sign-ins, or extensions.
diff --git a/packages/handsfree/README.md b/packages/handsfree/README.md
new file mode 100644
index 0000000000..aaf7ef90b6
--- /dev/null
+++ b/packages/handsfree/README.md
@@ -0,0 +1,26 @@
+# OpenWork HandsFree Computer Use
+
+Native macOS computer-use runtime imported from the HandsFree prototype.
+
+This package focuses on the reusable control layer:
+
+- Semantic AX snapshots with compact refs like `{e1}`.
+- Strict background mode that avoids foreground cursor/HID fallbacks.
+- Target-window screenshots via `CGWindowListCreateImage(.optionIncludingWindow)`.
+- Background input through `CGEvent.postToPid` with window-addressing fields.
+- Background activation using per-process event taps plus AppKit and center-click primers.
+- Non-UI orchestration modules from the original Electron prototype: realtime tool schemas/instructions and the GPT computer-use loop.
+
+Build the native stdio server:
+
+```bash
+pnpm --filter @openwork/handsfree check:native
+```
+
+Run it as an MCP-compatible adapter:
+
+```bash
+pnpm --filter @openwork/handsfree exec openwork-handsfree-computer-use mcp
+```
+
+The core runtime is intentionally MCP-independent. `ComputerUseRuntime` exposes a small direct surface (`snapshot`, `click`, `typeText`, `pressKey`, `scroll`, `wait`, `setValue`, `performAction`); `MCPServer` is only a thin stdio wrapper.
diff --git a/packages/handsfree/bin/openwork-handsfree-computer-use.mjs b/packages/handsfree/bin/openwork-handsfree-computer-use.mjs
new file mode 100755
index 0000000000..c99fc9a340
--- /dev/null
+++ b/packages/handsfree/bin/openwork-handsfree-computer-use.mjs
@@ -0,0 +1,41 @@
+#!/usr/bin/env node
+
+import { spawn } from "node:child_process";
+import { existsSync } from "node:fs";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const packageRoot = path.resolve(__dirname, "..");
+const swiftPackagePath = path.join(packageRoot, "native", "HandsFree");
+
+const explicitBinary = process.env.HANDSFREE_COMPUTER_USE_BINARY?.trim();
+const candidates = [
+ explicitBinary,
+ path.join(swiftPackagePath, ".build", "release", "HandsFreeComputerUse"),
+ path.join(swiftPackagePath, ".build", "arm64-apple-macosx", "release", "HandsFreeComputerUse"),
+ path.join(swiftPackagePath, ".build", "debug", "HandsFreeComputerUse"),
+ path.join(swiftPackagePath, ".build", "arm64-apple-macosx", "debug", "HandsFreeComputerUse"),
+].filter(Boolean);
+
+const args = process.argv.slice(2);
+const binary = candidates.find((candidate) => existsSync(candidate));
+const command = binary ?? "swift";
+const commandArgs = binary
+ ? args
+ : ["run", "--package-path", swiftPackagePath, "HandsFreeComputerUse", ...args];
+
+const child = spawn(command, commandArgs, {
+ stdio: "inherit",
+ env: process.env,
+});
+
+child.on("exit", (code, signal) => {
+ if (signal) process.kill(process.pid, signal);
+ process.exit(code ?? 0);
+});
+
+child.on("error", (error) => {
+ console.error(`Failed to start HandsFreeComputerUse: ${error.message}`);
+ process.exit(1);
+});
diff --git a/packages/handsfree/native/HandsFree/.gitignore b/packages/handsfree/native/HandsFree/.gitignore
new file mode 100644
index 0000000000..2d9f16e2d2
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/.gitignore
@@ -0,0 +1,2 @@
+.build/
+.swiftpm/
diff --git a/packages/handsfree/native/HandsFree/Package.swift b/packages/handsfree/native/HandsFree/Package.swift
new file mode 100644
index 0000000000..0d507c4598
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Package.swift
@@ -0,0 +1,13 @@
+// swift-tools-version: 5.9
+import PackageDescription
+
+// SwiftPM manifest for the HandsFree native runtime.
+let package = Package(
+ name: "HandsFree",
+ // Minimum deployment target: macOS 14.
+ platforms: [.macOS(.v14)],
+ targets: [
+ // Single executable product; its sources live in Sources/ComputerUse
+ // rather than the default Sources/<target name> directory.
+ .executableTarget(
+ name: "HandsFreeComputerUse",
+ path: "Sources/ComputerUse"
+ ),
+ ]
+)
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/AccessibilityService.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/AccessibilityService.swift
new file mode 100644
index 0000000000..b3d3848fb9
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/AccessibilityService.swift
@@ -0,0 +1,373 @@
+import AppKit
+import ApplicationServices
+
+/// Accessibility (AX) and window-capture helper for the computer-use runtime.
+///
+/// Responsibilities visible in this type:
+/// - resolve an application window to operate on (`resolveTarget`),
+/// - walk that window's accessibility tree into a capped list of semantic
+///   element records and pair it with a screenshot (`snapshot`),
+/// - perform AX actions on previously captured records (`press`, `focus`,
+///   `setValue`, `performAction`).
+///
+/// All stored properties are immutable constants.
+final class AccessibilityService: @unchecked Sendable {
+    /// Maximum pixel width of the encoded screenshot; narrower captures are not upscaled.
+    private let screenshotImageWidth: CGFloat = 768
+    /// Upper bound on the number of elements surfaced per snapshot.
+    private let maxElements = 250
+    /// Deepest tree level that is visited (the window itself is depth 0).
+    private let maxDepth = 22
+
+    /// Raw AX roles that may be surfaced even when they are not interactive,
+    /// provided they carry descriptive text.
+    private let importantRoles: Set<String> = [
+        "AXButton", "AXCheckBox", "AXRadioButton", "AXPopUpButton", "AXMenuButton",
+        "AXComboBox", "AXTextField", "AXTextArea", "AXSearchField", "AXLink",
+        "AXSlider", "AXIncrementor", "AXScrollArea", "AXScrollBar", "AXTabGroup",
+        "AXTab", "AXMenuItem", "AXCell", "AXRow", "AXStaticText", "AXImage",
+        "AXOutline", "AXTable", "AXList", "AXGroup",
+    ]
+
+    /// Resolves the window to operate on.
+    ///
+    /// - Parameter appName: Application name to look up; `nil` or blank selects
+    ///   the frontmost application.
+    /// - Returns: A target describing the app, its first usable AX window and,
+    ///   when found, the matching CoreGraphics window.
+    /// - Throws: `ComputerUseError.accessibilityDenied` when the process is not
+    ///   AX-trusted, the errors of `resolveApp`, or `ComputerUseError.noWindow`
+    ///   when no window larger than 20x20 can be determined.
+    func resolveTarget(appName: String?) throws -> WindowTarget {
+        guard AXIsProcessTrusted() else { throw ComputerUseError.accessibilityDenied }
+
+        let app = try resolveApp(appName: appName)
+        let pid = app.processIdentifier
+        let axApp = AXUIElementCreateApplication(pid)
+        let axWindow = firstAXWindow(axApp: axApp)
+        let title = axWindow.flatMap { axString($0, kAXTitleAttribute) }
+        let info = firstCGWindowInfo(pid: pid, title: title)
+        // Prefer the AX frame; fall back to the CoreGraphics bounds.
+        let bounds = axWindow.flatMap(axFrame) ?? info?.bounds
+
+        guard let bounds, bounds.width > 20, bounds.height > 20 else {
+            throw ComputerUseError.noWindow(app.localizedName ?? appName ?? "frontmost app")
+        }
+
+        return WindowTarget(
+            appName: app.localizedName ?? "Unknown",
+            pid: pid,
+            windowNumber: info?.number,
+            windowTitle: title ?? info?.title,
+            bounds: bounds,
+            isFrontmost: NSWorkspace.shared.frontmostApplication?.processIdentifier == pid,
+            axWindow: axWindow
+        )
+    }
+
+    /// Captures the semantic element list and a JPEG screenshot for `target`.
+    ///
+    /// `strictMode` and `backgroundActivated` are copied into the result unchanged.
+    /// - Throws: `ComputerUseError.screenshotFailed` when the window cannot be captured or encoded.
+    func snapshot(target: WindowTarget, strictMode: Bool, backgroundActivated: Bool) async throws -> AppSnapshot {
+        // A target without an AX window still yields a screenshot, just no records.
+        let records = target.axWindow.map(semanticRecords(window:)) ?? []
+        let (data, meta) = try captureScreenshot(target: target)
+
+        return AppSnapshot(
+            appName: target.appName,
+            pid: target.pid,
+            windowNumber: target.windowNumber,
+            windowTitle: target.windowTitle,
+            screenshotData: data,
+            screenshotMimeType: "image/jpeg",
+            screenshotMeta: meta,
+            records: records,
+            strictMode: strictMode,
+            backgroundActivated: backgroundActivated
+        )
+    }
+
+    /// Performs `AXPress` on the element; returns whether the AX call succeeded.
+    func press(record: AXElementRecord) -> Bool {
+        AXUIElementPerformAction(record.element, kAXPressAction as CFString) == .success
+    }
+
+    /// Sets the element's focused attribute to true; returns whether the AX call succeeded.
+    func focus(record: AXElementRecord) -> Bool {
+        AXUIElementSetAttributeValue(record.element, kAXFocusedAttribute as CFString, true as CFBoolean) == .success
+    }
+
+    /// Writes `value` into the element's value attribute.
+    ///
+    /// - Returns: `false` when the attribute is not settable or the write fails.
+    func setValue(record: AXElementRecord, value: String) -> Bool {
+        var settable = DarwinBoolean(false)
+        guard AXUIElementIsAttributeSettable(record.element, kAXValueAttribute as CFString, &settable) == .success,
+              settable.boolValue else {
+            return false
+        }
+        return AXUIElementSetAttributeValue(record.element, kAXValueAttribute as CFString, value as CFString) == .success
+    }
+
+    /// Performs an arbitrary named AX action; returns whether the AX call succeeded.
+    func performAction(record: AXElementRecord, action: String) -> Bool {
+        AXUIElementPerformAction(record.element, action as CFString) == .success
+    }
+
+    /// Finds the running application for `appName`.
+    ///
+    /// A `nil` or blank name selects the frontmost application. Otherwise only
+    /// apps with the `.regular` activation policy are searched, case-insensitively:
+    /// an exact name match wins over a substring match.
+    /// - Throws: `ComputerUseError.noFrontmostApplication` or `ComputerUseError.appNotFound`.
+    private func resolveApp(appName: String?) throws -> NSRunningApplication {
+        guard let rawName = appName?.trimmingCharacters(in: .whitespacesAndNewlines), !rawName.isEmpty else {
+            guard let frontmost = NSWorkspace.shared.frontmostApplication else {
+                throw ComputerUseError.noFrontmostApplication
+            }
+            return frontmost
+        }
+
+        let needle = rawName.lowercased()
+        let regularApps = NSWorkspace.shared.runningApplications.filter { $0.activationPolicy == .regular }
+        if let exact = regularApps.first(where: { $0.localizedName?.lowercased() == needle }) {
+            return exact
+        }
+        if let contains = regularApps.first(where: { $0.localizedName?.lowercased().contains(needle) == true }) {
+            return contains
+        }
+        throw ComputerUseError.appNotFound(rawName)
+    }
+
+    /// Returns the first AX window of the app whose frame is larger than 20x20, if any.
+    private func firstAXWindow(axApp: AXUIElement) -> AXUIElement? {
+        var windowValue: CFTypeRef?
+        guard AXUIElementCopyAttributeValue(axApp, kAXWindowsAttribute as CFString, &windowValue) == .success,
+              let windows = windowValue as? [AXUIElement] else {
+            return nil
+        }
+        return windows.first { window in
+            guard let frame = axFrame(window) else { return false }
+            return frame.width > 20 && frame.height > 20
+        }
+    }
+
+    /// Walks the window's accessibility tree and returns the surfaced elements in visit order.
+    private func semanticRecords(window: AXUIElement) -> [AXElementRecord] {
+        var records: [AXElementRecord] = []
+        collect(element: window, depth: 0, records: &records)
+        return records
+    }
+
+    /// Pre-order traversal step: optionally records `element`, then recurses into its children.
+    ///
+    /// Stops once `maxDepth` is exceeded or `maxElements` records were collected.
+    /// Record ids are 1-based positions in `records`, and the ref is `{e<id>}`.
+    private func collect(element: AXUIElement, depth: Int, records: inout [AXElementRecord]) {
+        guard depth <= maxDepth, records.count < maxElements else { return }
+
+        let rawRole = axString(element, kAXRoleAttribute) ?? "AXUnknown"
+        let role = normalizedRole(rawRole)
+        let value = axString(element, kAXValueAttribute).map { String($0.prefix(120)) }
+        // nil when the element exposes no descriptive text at all. The role name
+        // is substituted only for display below, so it cannot make an unlabeled
+        // element look like it has semantic text during filtering.
+        let descriptiveLabel = semanticLabel(element: element, value: value)
+        let actions = axActions(element)
+        let frame = axFrame(element)
+        let capabilities = capabilitiesFor(element: element, rawRole: rawRole, actions: actions)
+        let shouldSurface = shouldSurfaceElement(
+            rawRole: rawRole,
+            label: descriptiveLabel ?? "",
+            value: value,
+            frame: frame,
+            capabilities: capabilities
+        )
+
+        if shouldSurface, let frame {
+            let id = records.count + 1
+            let semantic = SemanticAXElement(
+                id: id,
+                ref: "{e\(id)}",
+                role: role,
+                label: descriptiveLabel ?? role,
+                value: value,
+                frame: ElementFrame(
+                    x: Int(frame.origin.x),
+                    y: Int(frame.origin.y),
+                    width: Int(frame.width),
+                    height: Int(frame.height)
+                ),
+                state: stateFor(element: element, rawRole: rawRole),
+                capabilities: capabilities
+            )
+            records.append(AXElementRecord(element: element, semantic: semantic))
+        }
+
+        for child in axChildren(element) {
+            collect(element: child, depth: depth + 1, records: &records)
+            if records.count >= maxElements { break }
+        }
+    }
+
+    /// Decides whether an element is worth surfacing.
+    ///
+    /// Requires a frame larger than 1x1. Interactive elements always qualify.
+    /// Non-interactive ones must have an important role and descriptive text;
+    /// `AXGroup` additionally must be smaller than 900x700.
+    /// - Parameter label: Descriptive label, or an empty string when the element has none.
+    private func shouldSurfaceElement(rawRole: String, label: String, value: String?, frame: CGRect?, capabilities: AXElementCapabilities) -> Bool {
+        guard let frame, frame.width > 1, frame.height > 1 else { return false }
+        let hasSemanticText = !label.isEmpty || value?.isEmpty == false
+        let interactive = capabilities.canPress || capabilities.canFocus || capabilities.canScroll || capabilities.canAdjust || capabilities.canSetValue
+        if interactive { return true }
+        if !importantRoles.contains(rawRole) { return false }
+        if rawRole == "AXGroup" { return hasSemanticText && frame.width < 900 && frame.height < 700 }
+        return hasSemanticText
+    }
+
+    /// Returns the first non-blank text among title, description, help,
+    /// identifier and `value` (trimmed, at most 120 characters), or `nil` when
+    /// the element exposes none of them.
+    private func semanticLabel(element: AXUIElement, value: String?) -> String? {
+        let candidates = [
+            axString(element, kAXTitleAttribute),
+            axString(element, kAXDescriptionAttribute),
+            axString(element, kAXHelpAttribute),
+            axString(element, kAXIdentifierAttribute),
+            value,
+        ]
+        for candidate in candidates {
+            if let trimmed = candidate?.trimmingCharacters(in: .whitespacesAndNewlines), !trimmed.isEmpty {
+                return String(trimmed.prefix(120))
+            }
+        }
+        return nil
+    }
+
+    /// Derives capability flags from settable attributes, advertised actions and the raw role.
+    private func capabilitiesFor(element: AXUIElement, rawRole: String, actions: [String]) -> AXElementCapabilities {
+        var valueSettable = DarwinBoolean(false)
+        let canSetValue = AXUIElementIsAttributeSettable(element, kAXValueAttribute as CFString, &valueSettable) == .success && valueSettable.boolValue
+
+        var focusSettable = DarwinBoolean(false)
+        let canFocus = AXUIElementIsAttributeSettable(element, kAXFocusedAttribute as CFString, &focusSettable) == .success && focusSettable.boolValue
+
+        // Role-based fallbacks cover elements that do not list the matching action.
+        let canAdjust = actions.contains(kAXIncrementAction) || actions.contains(kAXDecrementAction) || rawRole == "AXSlider" || rawRole == "AXIncrementor"
+        let canScroll = actions.contains("AXScrollToVisible") || rawRole == "AXScrollArea" || rawRole == "AXScrollBar"
+        let canPress = actions.contains(kAXPressAction) || ["AXButton", "AXCheckBox", "AXRadioButton", "AXLink", "AXMenuItem", "AXPopUpButton", "AXMenuButton", "AXCell"].contains(rawRole)
+
+        return AXElementCapabilities(
+            canPress: canPress,
+            canFocus: canFocus,
+            canScroll: canScroll,
+            canAdjust: canAdjust,
+            canSetValue: canSetValue,
+            actions: actions
+        )
+    }
+
+    /// Reads the boolean state attributes; `checked` is only populated for
+    /// checkboxes and radio buttons (from their value attribute).
+    private func stateFor(element: AXUIElement, rawRole: String) -> AXElementState {
+        AXElementState(
+            enabled: axBool(element, kAXEnabledAttribute),
+            focused: axBool(element, kAXFocusedAttribute),
+            selected: axBool(element, kAXSelectedAttribute),
+            expanded: axBool(element, kAXExpandedAttribute),
+            checked: rawRole == "AXCheckBox" || rawRole == "AXRadioButton" ? axBool(element, kAXValueAttribute) : nil
+        )
+    }
+
+    /// Returns the element's AX children, or an empty array when unavailable.
+    private func axChildren(_ element: AXUIElement) -> [AXUIElement] {
+        var value: CFTypeRef?
+        guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success,
+              let children = value as? [AXUIElement] else {
+            return []
+        }
+        return children
+    }
+
+    /// Returns the action names the element advertises, or an empty array when unavailable.
+    private func axActions(_ element: AXUIElement) -> [String] {
+        var actionNames: CFArray?
+        guard AXUIElementCopyActionNames(element, &actionNames) == .success,
+              let names = actionNames as? [String] else {
+            return []
+        }
+        return names
+    }
+
+    /// Reads an attribute as text: non-empty strings and attributed strings are
+    /// returned as-is, numbers via `stringValue`; anything else yields `nil`.
+    private func axString(_ element: AXUIElement, _ attribute: String) -> String? {
+        var value: CFTypeRef?
+        guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
+              let value else {
+            return nil
+        }
+        if let string = value as? String, !string.isEmpty { return string }
+        if let attributed = value as? NSAttributedString, !attributed.string.isEmpty { return attributed.string }
+        if let number = value as? NSNumber { return number.stringValue }
+        return nil
+    }
+
+    /// Reads an attribute as a boolean (`Bool` or numeric); `nil` when missing or of another type.
+    private func axBool(_ element: AXUIElement, _ attribute: String) -> Bool? {
+        var value: CFTypeRef?
+        guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
+              let value else {
+            return nil
+        }
+        if let bool = value as? Bool { return bool }
+        if let number = value as? NSNumber { return number.boolValue }
+        return nil
+    }
+
+    /// Combines the position and size attributes into a rect; `nil` if either is unavailable.
+    private func axFrame(_ element: AXUIElement) -> CGRect? {
+        guard let position = axPoint(element, kAXPositionAttribute), let size = axSize(element, kAXSizeAttribute) else {
+            return nil
+        }
+        return CGRect(origin: position, size: size)
+    }
+
+    /// Reads an `AXValue` attribute holding a `CGPoint`.
+    private func axPoint(_ element: AXUIElement, _ attribute: String) -> CGPoint? {
+        var value: CFTypeRef?
+        guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
+              let value,
+              CFGetTypeID(value) == AXValueGetTypeID() else {
+            return nil
+        }
+        // Safe: the CF type id was verified just above.
+        let axValue = value as! AXValue
+        guard AXValueGetType(axValue) == .cgPoint else { return nil }
+        var point = CGPoint.zero
+        guard AXValueGetValue(axValue, .cgPoint, &point) else { return nil }
+        return point
+    }
+
+    /// Reads an `AXValue` attribute holding a `CGSize`.
+    private func axSize(_ element: AXUIElement, _ attribute: String) -> CGSize? {
+        var value: CFTypeRef?
+        guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
+              let value,
+              CFGetTypeID(value) == AXValueGetTypeID() else {
+            return nil
+        }
+        // Safe: the CF type id was verified just above.
+        let axValue = value as! AXValue
+        guard AXValueGetType(axValue) == .cgSize else { return nil }
+        var size = CGSize.zero
+        guard AXValueGetValue(axValue, .cgSize, &size) else { return nil }
+        return size
+    }
+
+    /// Strips the leading "AX" from a role name (for example "AXButton" becomes "Button").
+    private func normalizedRole(_ rawRole: String) -> String {
+        rawRole.hasPrefix("AX") ? String(rawRole.dropFirst(2)) : rawRole
+    }
+
+    /// Looks up a CoreGraphics window for `pid`.
+    ///
+    /// Only on-screen, layer-0 windows larger than 20x20 are considered. A
+    /// window whose name equals `title` is preferred; otherwise the first
+    /// candidate in the returned list order is used.
+    private func firstCGWindowInfo(pid: pid_t, title: String?) -> (number: Int, title: String?, bounds: CGRect)? {
+        guard let list = CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else {
+            return nil
+        }
+
+        let candidates = list.compactMap { info -> (number: Int, title: String?, bounds: CGRect)? in
+            guard let ownerPID = info[kCGWindowOwnerPID as String] as? Int32,
+                  ownerPID == pid,
+                  let layer = info[kCGWindowLayer as String] as? Int,
+                  layer == 0,
+                  let number = info[kCGWindowNumber as String] as? Int,
+                  let boundsDict = info[kCGWindowBounds as String] as? [String: Any] else {
+                return nil
+            }
+            let bounds = CGRect(
+                x: cgFloat(boundsDict["X"]),
+                y: cgFloat(boundsDict["Y"]),
+                width: cgFloat(boundsDict["Width"]),
+                height: cgFloat(boundsDict["Height"])
+            )
+            guard bounds.width > 20, bounds.height > 20 else { return nil }
+            return (number, info[kCGWindowName as String] as? String, bounds)
+        }
+
+        if let title, let exact = candidates.first(where: { $0.title == title }) {
+            return exact
+        }
+        return candidates.first
+    }
+
+    /// Captures the target window and encodes it as a JPEG at most
+    /// `screenshotImageWidth` pixels wide, preserving the aspect ratio.
+    ///
+    /// With a known window number only that window is captured; otherwise the
+    /// on-screen content inside `target.bounds` is captured.
+    /// - Returns: The JPEG data and metadata whose `imageWidth`/`imageHeight`
+    ///   are the pixel dimensions of the encoded image.
+    /// - Throws: `ComputerUseError.screenshotFailed` when capture, scaling or encoding fails.
+    private func captureScreenshot(target: WindowTarget) throws -> (Data, ScreenshotMetadata) {
+        // NOTE(review): CGWindowListCreateImage is deprecated as of macOS 14 in
+        // favor of ScreenCaptureKit; confirm it still builds with the SDK in use.
+        let cgImage: CGImage?
+        if let windowNumber = target.windowNumber {
+            cgImage = CGWindowListCreateImage(
+                CGRect.null,
+                .optionIncludingWindow,
+                CGWindowID(windowNumber),
+                [.bestResolution, .boundsIgnoreFraming]
+            )
+        } else {
+            cgImage = CGWindowListCreateImage(target.bounds, .optionOnScreenOnly, kCGNullWindowID, [.bestResolution])
+        }
+
+        // A zero-sized capture would make the scale factor below divide by zero.
+        guard let cgImage, cgImage.width > 0, cgImage.height > 0 else {
+            throw ComputerUseError.screenshotFailed
+        }
+
+        let rawWidth = CGFloat(cgImage.width)
+        let rawHeight = CGFloat(cgImage.height)
+        let pixelWidth = max(1, Int(min(screenshotImageWidth, rawWidth)))
+        let pixelHeight = max(1, Int(rawHeight * (CGFloat(pixelWidth) / rawWidth)))
+
+        // Scale into a bitmap context with explicit pixel dimensions. Drawing
+        // through NSImage.lockFocus lets the display's backing scale decide the
+        // output size, so the encoded image could disagree with the metadata.
+        guard let context = CGContext(
+            data: nil,
+            width: pixelWidth,
+            height: pixelHeight,
+            bitsPerComponent: 8,
+            bytesPerRow: 0,
+            space: CGColorSpaceCreateDeviceRGB(),
+            bitmapInfo: CGImageAlphaInfo.noneSkipLast.rawValue
+        ) else {
+            throw ComputerUseError.screenshotFailed
+        }
+        let canvas = CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight)
+        // JPEG has no alpha channel; give transparent regions a defined background.
+        context.setFillColor(CGColor(gray: 1, alpha: 1))
+        context.fill(canvas)
+        context.interpolationQuality = .high
+        context.draw(cgImage, in: canvas)
+
+        guard let scaled = context.makeImage(),
+              let jpeg = NSBitmapImageRep(cgImage: scaled)
+                  .representation(using: .jpeg, properties: [.compressionFactor: 0.45]) else {
+            throw ComputerUseError.screenshotFailed
+        }
+
+        return (
+            jpeg,
+            ScreenshotMetadata(
+                imageWidth: pixelWidth,
+                imageHeight: pixelHeight,
+                capturedBounds: target.bounds
+            )
+        )
+    }
+
+    /// Converts a loosely typed numeric dictionary value to `CGFloat`; non-numeric values become 0.
+    private func cgFloat(_ value: Any?) -> CGFloat {
+        if let value = value as? CGFloat { return value }
+        if let value = value as? Double { return CGFloat(value) }
+        if let value = value as? Int { return CGFloat(value) }
+        if let value = value as? NSNumber { return CGFloat(truncating: value) }
+        return 0
+    }
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundActivationSession.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundActivationSession.swift
new file mode 100644
index 0000000000..76c02f7d5b
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundActivationSession.swift
@@ -0,0 +1,164 @@
+import AppKit
+import CoreGraphics
+
+/// Owns per-process CGEvent taps for a (previous frontmost app, target app) pair and posts
+/// synthetic `appKitDefined` events at the target window.
+///
+/// The taps are installed on a dedicated thread's run loop. The tap on `previousPID` is created
+/// with `suppressFocusMessages: true`, so events whose raw type is 13, 19 or 20 are dropped for
+/// that process; the tap on `targetPID` passes every event through.
+/// NOTE(review): the meaning of event subtypes 1/2 and raw types 13/19/20 is not defined in this
+/// file — presumably focus gain/loss messages; confirm.
+final class BackgroundActivationSession: @unchecked Sendable {
+    /// Per-tap state handed to the C callback through `userInfo`.
+    private final class TapContext {
+        let suppressFocusMessages: Bool
+        /// Set once the tap exists so the callback can re-enable it after a timeout.
+        var tap: CFMachPort?
+
+        init(suppressFocusMessages: Bool) {
+            self.suppressFocusMessages = suppressFocusMessages
+        }
+    }
+
+    /// Keeps the tap, its run loop source and its callback context alive together.
+    private struct TapRef {
+        let tap: CFMachPort
+        let source: CFRunLoopSource
+        let context: TapContext
+    }
+
+    private let previousPID: pid_t
+    private let targetPID: pid_t
+    private var taps: [TapRef] = []
+    private var runLoop: CFRunLoop?
+    private var thread: Thread?
+    private var startupError: Error?
+    private var started = false
+    private var lastTarget: WindowTarget?
+
+    init(previousPID: pid_t, targetPID: pid_t) {
+        self.previousPID = previousPID
+        self.targetPID = targetPID
+    }
+
+    deinit {
+        stop()
+    }
+
+    /// Spawns the tap thread and blocks until the taps are installed (or installation failed).
+    ///
+    /// Calling `start()` on an already started session is a no-op.
+    /// - Throws: The error raised while installing the taps.
+    func start() throws {
+        guard !started else { return }
+
+        // Forget the outcome of any earlier failed attempt so a retry is judged on its own.
+        startupError = nil
+
+        let ready = DispatchSemaphore(value: 0)
+        let thread = Thread { [weak self] in
+            guard let self else {
+                ready.signal()
+                return
+            }
+            self.runLoop = CFRunLoopGetCurrent()
+            do {
+                try self.installTapsOnCurrentRunLoop()
+            } catch {
+                self.startupError = error
+            }
+            ready.signal()
+            if self.startupError == nil {
+                CFRunLoopRun()
+            }
+        }
+        thread.name = "OpenWorkBackgroundActivationSession"
+        self.thread = thread
+        thread.start()
+        ready.wait()
+
+        if let startupError {
+            // The tap thread exits without running its loop; drop whatever was partially installed.
+            invalidateTaps()
+            runLoop = nil
+            self.thread = nil
+            throw startupError
+        }
+        started = true
+    }
+
+    /// Posts an `appKitDefined` event (subtype 1) to the target window, then a background click
+    /// at its center, and remembers the target for `stop()`.
+    ///
+    /// - Throws: `ComputerUseError.strictModeViolation` when the target has no CG window number;
+    ///   rethrows errors from the click dispatcher and from `Task.sleep`.
+    func activate(target: WindowTarget) async throws {
+        guard let windowNumber = target.windowNumber else {
+            throw ComputerUseError.strictModeViolation("target window has no CG window number")
+        }
+
+        postAppKitDefined(subtype: 1, target: target, windowNumber: windowNumber)
+        try await Task.sleep(nanoseconds: 25_000_000)
+        try await BackgroundInputDispatcher.click(pid: target.pid, windowNumber: windowNumber, point: target.center)
+        try await Task.sleep(nanoseconds: 80_000_000)
+        lastTarget = target
+    }
+
+    /// Posts the subtype-2 `appKitDefined` event to the last activated window (if any), removes
+    /// all taps and stops the tap thread's run loop. Safe to call more than once.
+    func stop() {
+        if let target = lastTarget, let windowNumber = target.windowNumber {
+            postAppKitDefined(subtype: 2, target: target, windowNumber: windowNumber)
+        }
+        // Cleared so a second stop() (e.g. from deinit after an explicit stop) does not re-post.
+        lastTarget = nil
+        invalidateTaps()
+        if let runLoop {
+            CFRunLoopStop(runLoop)
+        }
+        runLoop = nil
+        thread = nil
+        started = false
+    }
+
+    /// Invalidates every installed tap and releases the associated contexts.
+    private func invalidateTaps() {
+        for tapRef in taps {
+            CFMachPortInvalidate(tapRef.tap)
+        }
+        taps.removeAll()
+    }
+
+    private func installTapsOnCurrentRunLoop() throws {
+        if previousPID != targetPID {
+            taps.append(try installTap(pid: previousPID, suppressFocusMessages: true))
+        }
+        taps.append(try installTap(pid: targetPID, suppressFocusMessages: false))
+    }
+
+    /// Creates an active per-process tap for all event types and adds it to the current run loop.
+    private func installTap(pid: pid_t, suppressFocusMessages: Bool) throws -> TapRef {
+        let context = TapContext(suppressFocusMessages: suppressFocusMessages)
+        // Unretained: the returned TapRef keeps `context` alive for the lifetime of the tap.
+        let userInfo = Unmanaged.passUnretained(context).toOpaque()
+        guard let tap = CGEvent.tapCreateForPid(
+            pid: pid,
+            place: .headInsertEventTap,
+            options: .defaultTap,
+            eventsOfInterest: CGEventMask.max,
+            callback: BackgroundActivationSession.eventTapCallback,
+            userInfo: userInfo
+        ) else {
+            throw ComputerUseError.strictModeViolation("could not install per-process event tap for pid \(pid)")
+        }
+        context.tap = tap
+
+        guard let source = CFMachPortCreateRunLoopSource(kCFAllocatorDefault, tap, 0) else {
+            CFMachPortInvalidate(tap)
+            throw ComputerUseError.strictModeViolation("could not create run loop source for pid \(pid) event tap")
+        }
+
+        CFRunLoopAddSource(CFRunLoopGetCurrent(), source, .commonModes)
+        CGEvent.tapEnable(tap: tap, enable: true)
+        return TapRef(tap: tap, source: source, context: context)
+    }
+
+    /// Builds an `appKitDefined` event addressed to `windowNumber` and posts it to the target
+    /// process. Silently does nothing if the event cannot be created.
+    private func postAppKitDefined(subtype: Int16, target: WindowTarget, windowNumber: Int) {
+        guard let event = NSEvent.otherEvent(
+            with: .appKitDefined,
+            location: target.center,
+            modifierFlags: [],
+            timestamp: ProcessInfo.processInfo.systemUptime,
+            windowNumber: windowNumber,
+            context: nil,
+            subtype: subtype,
+            data1: 0,
+            data2: 0
+        )?.cgEvent else {
+            return
+        }
+
+        BackgroundInputDispatcher.address(event, pid: target.pid, windowNumber: windowNumber)
+        event.postToPid(target.pid)
+    }
+
+    /// Tap callback: returns `nil` (drops the event) for focus messages on suppressing taps and
+    /// passes every other event through unchanged.
+    private static let eventTapCallback: CGEventTapCallBack = { _, type, event, userInfo in
+        if type == .tapDisabledByTimeout || type == .tapDisabledByUserInput {
+            // A tap disabled by timeout stays disabled until re-enabled; turn it back on so
+            // suppression keeps working for the rest of the session.
+            if type == .tapDisabledByTimeout, let userInfo {
+                let context = Unmanaged<TapContext>.fromOpaque(userInfo).takeUnretainedValue()
+                if let tap = context.tap {
+                    CGEvent.tapEnable(tap: tap, enable: true)
+                }
+            }
+            return Unmanaged.passUnretained(event)
+        }
+        guard let userInfo else {
+            return Unmanaged.passUnretained(event)
+        }
+        let context = Unmanaged<TapContext>.fromOpaque(userInfo).takeUnretainedValue()
+        if context.suppressFocusMessages && BackgroundActivationSession.isFocusMessage(type) {
+            return nil
+        }
+        return Unmanaged.passUnretained(event)
+    }
+
+    /// True for raw event types 13, 19 and 20.
+    private static func isFocusMessage(_ type: CGEventType) -> Bool {
+        let raw = Int(type.rawValue)
+        return raw == 13 || raw == 19 || raw == 20
+    }
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundInputDispatcher.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundInputDispatcher.swift
new file mode 100644
index 0000000000..1e9f8829b6
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundInputDispatcher.swift
@@ -0,0 +1,140 @@
+import Foundation
+import CoreGraphics
+
+enum BackgroundInputDispatcher {
+ // Raw CGEventField numbers (51 and 58) that are referenced by number rather than by a named
+ // CGEventField constant.
+ // NOTE(review): presumably undocumented fields used to address an event to a specific window —
+ // confirm against the `address(_:pid:windowNumber:)` helper.
+ private static let privateWindowField = CGEventField(rawValue: 51)
+ private static let privateRouteField = CGEventField(rawValue: 58)
+
+ /// Posts a left click (or double click) directly to process `pid` at `point`.
+ ///
+ /// Each click is a mouse-down, a 30 ms pause, then a mouse-up; a double click waits a further
+ /// 50 ms between the two clicks. Both events are passed through `address` before posting.
+ ///
+ /// - Throws: `ComputerUseError.eventSourceFailed` / `.eventCreationFailed` when CoreGraphics
+ ///   cannot create the source or events; rethrows cancellation from `Task.sleep`.
+ static func click(pid: pid_t, windowNumber: Int, point: CGPoint, doubleClick: Bool = false) async throws {
+     guard let eventSource = CGEventSource(stateID: .combinedSessionState) else {
+         throw ComputerUseError.eventSourceFailed
+     }
+
+     let totalClicks = doubleClick ? 2 : 1
+     var clickState = 1
+     while clickState <= totalClicks {
+         let press = CGEvent(mouseEventSource: eventSource, mouseType: .leftMouseDown, mouseCursorPosition: point, mouseButton: .left)
+         let release = CGEvent(mouseEventSource: eventSource, mouseType: .leftMouseUp, mouseCursorPosition: point, mouseButton: .left)
+         guard let press, let release else {
+             throw ComputerUseError.eventCreationFailed
+         }
+         let state = Int64(clickState)
+
+         address(press, pid: pid, windowNumber: windowNumber)
+         press.setIntegerValueField(.mouseEventClickState, value: state)
+         press.setDoubleValueField(.mouseEventPressure, value: 1)
+         press.postToPid(pid)
+
+         try await Task.sleep(nanoseconds: 30_000_000)
+
+         address(release, pid: pid, windowNumber: windowNumber)
+         release.setIntegerValueField(.mouseEventClickState, value: state)
+         release.setDoubleValueField(.mouseEventPressure, value: 0)
+         release.postToPid(pid)
+
+         clickState += 1
+         // Pause only between the clicks of a multi-click, never after the last one.
+         if clickState <= totalClicks {
+             try await Task.sleep(nanoseconds: 50_000_000)
+         }
+     }
+ }
+
+ /// Posts a two-axis, line-based scroll wheel event directly to process `pid`.
+ ///
+ /// `deltaY` is sent as wheel 1 and `deltaX` as wheel 2; the event location is set to `point`
+ /// and the event is passed through `address` before posting.
+ ///
+ /// - Throws: `ComputerUseError.eventSourceFailed` / `.eventCreationFailed` when CoreGraphics
+ ///   cannot create the source or the event.
+ static func scroll(pid: pid_t, windowNumber: Int, point: CGPoint, deltaX: Int32, deltaY: Int32) throws {
+     guard let scrollSource = CGEventSource(stateID: .combinedSessionState) else {
+         throw ComputerUseError.eventSourceFailed
+     }
+     let wheelEvent = CGEvent(scrollWheelEvent2Source: scrollSource, units: .line, wheelCount: 2, wheel1: deltaY, wheel2: deltaX, wheel3: 0)
+     guard let wheelEvent else {
+         throw ComputerUseError.eventCreationFailed
+     }
+
+     wheelEvent.location = point
+     address(wheelEvent, pid: pid, windowNumber: windowNumber)
+     wheelEvent.postToPid(pid)
+ }
+
+ static func typeText(pid: pid_t, text: String) throws {
+ guard let source = CGEventSource(stateID: .combinedSessionState) else {
+ throw ComputerUseError.eventSourceFailed
+ }
+
+ let units = Array(text.utf16)
+ let chunkSize = 20
+ for start in stride(from: 0, to: units.count, by: chunkSize) {
+ let end = min(start + chunkSize, units.count)
+ let chunk = Array(units[start.. (flags: CGEventFlags, keyCode: CGKeyCode) {
+ let parts = combo.lowercased().split(separator: "+").map(String.init)
+ var flags: CGEventFlags = []
+ var keyName = ""
+
+ for part in parts {
+ switch part {
+ case "command", "cmd", "meta": flags.insert(.maskCommand)
+ case "shift": flags.insert(.maskShift)
+ case "control", "ctrl": flags.insert(.maskControl)
+ case "option", "alt": flags.insert(.maskAlternate)
+ default: keyName = part
+ }
+ }
+
+ guard let keyCode = keyCodes[keyName] else {
+ throw ComputerUseError.unknownKey(keyName)
+ }
+ return (flags, keyCode)
+ }
+
+ // Lookup table from key name to CGKeyCode, consulted by the combo parser above.
+ // Names are lowercase because the parser lowercases the combo before the lookup; several keys
+ // have aliases ("return"/"enter", "delete"/"backspace", "escape"/"esc").
+ // NOTE(review): the values appear to be the macOS virtual key codes for a US ANSI keyboard
+ // layout — confirm if other layouts must be supported.
+ private static let keyCodes: [String: CGKeyCode] = [
+ "return": 0x24, "enter": 0x24, "tab": 0x30, "space": 0x31,
+ "delete": 0x33, "backspace": 0x33, "escape": 0x35, "esc": 0x35,
+ "up": 0x7E, "down": 0x7D, "left": 0x7B, "right": 0x7C,
+ "home": 0x73, "end": 0x77, "pageup": 0x74, "pagedown": 0x79,
+ "a": 0x00, "b": 0x0B, "c": 0x08, "d": 0x02, "e": 0x0E,
+ "f": 0x03, "g": 0x05, "h": 0x04, "i": 0x22, "j": 0x26,
+ "k": 0x28, "l": 0x25, "m": 0x2E, "n": 0x2D, "o": 0x1F,
+ "p": 0x23, "q": 0x0C, "r": 0x0F, "s": 0x01, "t": 0x11,
+ "u": 0x20, "v": 0x09, "w": 0x0D, "x": 0x07, "y": 0x10, "z": 0x06,
+ "0": 0x1D, "1": 0x12, "2": 0x13, "3": 0x14, "4": 0x15,
+ "5": 0x17, "6": 0x16, "7": 0x1A, "8": 0x1C, "9": 0x19,
+ "f1": 0x7A, "f2": 0x78, "f3": 0x63, "f4": 0x76,
+ "f5": 0x60, "f6": 0x61, "f7": 0x62, "f8": 0x64,
+ "f9": 0x65, "f10": 0x6D, "f11": 0x67, "f12": 0x6F,
+ "-": 0x1B, "=": 0x18, "[": 0x21, "]": 0x1E,
+ "\\": 0x2A, ";": 0x29, "'": 0x27, ",": 0x2B,
+ ".": 0x2F, "/": 0x2C, "`": 0x32,
+ ]
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/ComputerUseRuntime.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/ComputerUseRuntime.swift
new file mode 100644
index 0000000000..c8e867a422
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/ComputerUseRuntime.swift
@@ -0,0 +1,227 @@
+import AppKit
+import Foundation
+
+/// Serializes computer-use actions against the most recent snapshot.
+///
+/// In strict mode actions go through Accessibility calls or `BackgroundInputDispatcher`
+/// (`postToPid`); with strict mode off they fall back to `InputService` foreground HID events.
+/// Every action except `snapshot`, `wait` and `setStrictMode` requires a prior snapshot.
+actor ComputerUseRuntime {
+    private let accessibility = AccessibilityService()
+    private let foregroundInput = InputService()
+    private var lastSnapshot: AppSnapshot?
+    private var strictMode = true
+    private var activationSession: BackgroundActivationSession?
+    /// "previousPID:targetPID" of the app pair the current session was started for.
+    private var activationKey: String?
+    /// "pid:windowNumber" of the window most recently activated in the current session.
+    private var activatedWindowKey: String?
+
+    /// Sets the default strict mode; disabling it also tears down any background activation.
+    func setStrictMode(_ enabled: Bool) -> ActionMetadata {
+        strictMode = enabled
+        if !enabled {
+            resetBackgroundActivation()
+        }
+        return ActionMetadata(
+            ok: true,
+            path: .none,
+            strictMode: enabled,
+            backgroundSafe: enabled,
+            fallbackUsed: false,
+            message: enabled ? "Strict background mode enabled." : "Strict background mode disabled. Foreground fallback is allowed."
+        )
+    }
+
+    /// Resolves the target app, background-activates it when strict and not frontmost, and
+    /// stores the resulting snapshot for subsequent actions.
+    ///
+    /// - Parameters:
+    ///   - appName: App to target, passed to `resolveTarget`.
+    ///   - requestedStrict: Per-call override; `nil` uses the runtime default.
+    func snapshot(appName: String?, strict requestedStrict: Bool?) async throws -> AppSnapshot {
+        let effectiveStrict = requestedStrict ?? strictMode
+        if !effectiveStrict {
+            resetBackgroundActivation()
+        }
+
+        var target = try accessibility.resolveTarget(appName: appName)
+        let backgroundActivated: Bool
+        if effectiveStrict, !target.isFrontmost {
+            backgroundActivated = try await ensureBackgroundActivation(target: target)
+            // Re-resolve after activation so the snapshot uses the post-activation target.
+            target = try accessibility.resolveTarget(appName: appName)
+        } else {
+            backgroundActivated = false
+        }
+
+        let snapshot = try await accessibility.snapshot(
+            target: target,
+            strictMode: effectiveStrict,
+            backgroundActivated: backgroundActivated
+        )
+        lastSnapshot = snapshot
+        return snapshot
+    }
+
+    /// Clicks an element (by `ref` or `index`) or a screenshot coordinate.
+    ///
+    /// For elements, AX press is tried first, then AX focus, then a synthesized click at the
+    /// element's frame center. Coordinates are used only when no element matched.
+    /// - Throws: `ComputerUseError.noSnapshot` without a snapshot, `.invalidElement` when neither
+    ///   an element nor coordinates resolve, plus errors from the click dispatchers.
+    func click(ref: String?, index: Int?, imageX: Double?, imageY: Double?, clickCount: Int, strict requestedStrict: Bool?) async throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+
+        if let record = findRecord(ref: ref, index: index, in: snapshot) {
+            if record.semantic.capabilities.canPress, accessibility.press(record: record) {
+                return ActionMetadata(ok: true, path: .accessibility, strictMode: effectiveStrict, backgroundSafe: true, fallbackUsed: false, message: "Pressed \(record.semantic.ref) via AXPress.")
+            }
+            if record.semantic.capabilities.canFocus, accessibility.focus(record: record) {
+                return ActionMetadata(ok: true, path: .accessibility, strictMode: effectiveStrict, backgroundSafe: true, fallbackUsed: false, message: "Focused \(record.semantic.ref) via AX.")
+            }
+            return try await clickPoint(record.semantic.frame.center, clickCount: clickCount, strict: effectiveStrict, fallbackUsed: true)
+        }
+
+        if let imageX, let imageY {
+            let point = snapshot.screenshotMeta.toScreen(imageX: imageX, imageY: imageY)
+            return try await clickPoint(point, clickCount: clickCount, strict: effectiveStrict, fallbackUsed: false)
+        }
+
+        throw ComputerUseError.invalidElement(ref ?? index.map(String.init) ?? "")
+    }
+
+    /// Types `text` into the snapshot's process (strict) or via foreground HID events.
+    func typeText(_ text: String, strict requestedStrict: Bool?) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+        if effectiveStrict {
+            try BackgroundInputDispatcher.typeText(pid: snapshot.pid, text: text)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: false, message: "Typed text with postToPid.")
+        }
+
+        try foregroundInput.typeText(text)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Typed text with foreground HID fallback.")
+    }
+
+    /// Presses a key combo in the snapshot's process (strict) or via foreground HID events.
+    func pressKey(_ combo: String, strict requestedStrict: Bool?) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+        if effectiveStrict {
+            try BackgroundInputDispatcher.pressKey(pid: snapshot.pid, combo: combo)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: false, message: "Pressed key with postToPid.")
+        }
+
+        try foregroundInput.pressKey(combo)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Pressed key with foreground HID fallback.")
+    }
+
+    /// Scrolls by roughly `pages * 5` lines (at least one) at the given screenshot coordinate,
+    /// or at the center of the captured bounds when no coordinate is supplied.
+    ///
+    /// - Throws: `ComputerUseError.strictModeViolation` in strict mode when the snapshot has no
+    ///   CG window number.
+    func scroll(direction: String?, pages: Double, imageX: Double?, imageY: Double?, strict requestedStrict: Bool?) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+        // `Int32(Double)` traps on NaN, infinity and out-of-range values, and `pages` comes
+        // straight from client JSON — clamp into the representable range before converting.
+        let requestedLines = pages * 5
+        let amount: Int32 = requestedLines.isNaN ? 1 : Int32(min(max(requestedLines, 1), Double(Int32.max)))
+        let deltas = scrollDeltas(direction: direction, amount: amount)
+        let point: CGPoint = {
+            if let imageX, let imageY {
+                return snapshot.screenshotMeta.toScreen(imageX: imageX, imageY: imageY)
+            }
+            return CGPoint(x: snapshot.screenshotMeta.capturedBounds.midX, y: snapshot.screenshotMeta.capturedBounds.midY)
+        }()
+
+        if effectiveStrict {
+            guard let windowNumber = snapshot.windowNumber else {
+                throw ComputerUseError.strictModeViolation("background scroll requires a CG window number")
+            }
+            try BackgroundInputDispatcher.scroll(pid: snapshot.pid, windowNumber: windowNumber, point: point, deltaX: deltas.x, deltaY: deltas.y)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: false, message: "Scrolled with postToPid.")
+        }
+
+        try foregroundInput.scroll(point: point, deltaX: deltas.x, deltaY: deltas.y)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Scrolled with foreground HID fallback.")
+    }
+
+    /// Sets an element's value through Accessibility; `ok` is false when the set fails.
+    func setValue(ref: String?, index: Int?, value: String) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        guard let record = findRecord(ref: ref, index: index, in: snapshot) else {
+            throw ComputerUseError.invalidElement(ref ?? index.map(String.init) ?? "")
+        }
+        let ok = accessibility.setValue(record: record, value: value)
+        return ActionMetadata(ok: ok, path: .accessibility, strictMode: snapshot.strictMode, backgroundSafe: true, fallbackUsed: false, message: ok ? "Set \(record.semantic.ref) via AXValue." : "Element value is not settable.")
+    }
+
+    /// Performs a named AX action on an element; `ok` is false when the action fails.
+    func performAction(ref: String?, index: Int?, action: String) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        guard let record = findRecord(ref: ref, index: index, in: snapshot) else {
+            throw ComputerUseError.invalidElement(ref ?? index.map(String.init) ?? "")
+        }
+        let ok = accessibility.performAction(record: record, action: action)
+        return ActionMetadata(ok: ok, path: .accessibility, strictMode: snapshot.strictMode, backgroundSafe: true, fallbackUsed: false, message: ok ? "Performed \(action) on \(record.semantic.ref)." : "AX action \(action) failed.")
+    }
+
+    /// Sleeps for `milliseconds`, clamped to 0...10_000. Cancellation is ignored.
+    func wait(milliseconds: Int) async -> ActionMetadata {
+        let clamped = max(0, min(milliseconds, 10_000))
+        try? await Task.sleep(nanoseconds: UInt64(clamped) * 1_000_000)
+        return ActionMetadata(ok: true, path: .none, strictMode: strictMode, backgroundSafe: true, fallbackUsed: false, message: "Waited \(clamped)ms.")
+    }
+
+    /// Synthesizes a click at a screen point: `postToPid` in strict mode, HID events otherwise.
+    private func clickPoint(_ point: CGPoint, clickCount: Int, strict: Bool, fallbackUsed: Bool) async throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        if strict {
+            guard let windowNumber = snapshot.windowNumber else {
+                throw ComputerUseError.strictModeViolation("background click requires a CG window number")
+            }
+            try await BackgroundInputDispatcher.click(pid: snapshot.pid, windowNumber: windowNumber, point: point, doubleClick: clickCount >= 2)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: fallbackUsed, message: "Clicked with postToPid at \(Int(point.x)),\(Int(point.y)).")
+        }
+
+        try await foregroundInput.click(point: point, doubleClick: clickCount >= 2)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Clicked with foreground HID fallback at \(Int(point.x)),\(Int(point.y)).")
+    }
+
+    /// Stops the current activation session (if any) and forgets which app pair and window it
+    /// was tracking.
+    private func resetBackgroundActivation() {
+        activationSession?.stop()
+        activationSession = nil
+        activationKey = nil
+        activatedWindowKey = nil
+    }
+
+    /// Ensures a running activation session for the (frontmost app, target app) pair and that the
+    /// target window has been activated within it.
+    ///
+    /// - Returns: `true` once the target is background-activated.
+    /// - Throws: `ComputerUseError.noFrontmostApplication`, or errors from starting or activating
+    ///   the session.
+    private func ensureBackgroundActivation(target: WindowTarget) async throws -> Bool {
+        guard let previousPID = NSWorkspace.shared.frontmostApplication?.processIdentifier else {
+            throw ComputerUseError.noFrontmostApplication
+        }
+        let nextActivationKey = "\(previousPID):\(target.pid)"
+        if activationKey != nextActivationKey {
+            // Clear all session state before starting the replacement: if `start()` throws, a
+            // stopped session must not be left behind looking like a live, activated one.
+            resetBackgroundActivation()
+            let next = BackgroundActivationSession(previousPID: previousPID, targetPID: target.pid)
+            try next.start()
+            activationSession = next
+            activationKey = nextActivationKey
+        }
+
+        let nextWindowKey = "\(target.pid):\(target.windowNumber ?? -1)"
+        if activatedWindowKey != nextWindowKey {
+            guard let activationSession else {
+                throw ComputerUseError.strictModeViolation("background activation session was not created")
+            }
+            try await activationSession.activate(target: target)
+            activatedWindowKey = nextWindowKey
+        }
+        return true
+    }
+
+    private func requireSnapshot() throws -> AppSnapshot {
+        guard let lastSnapshot else { throw ComputerUseError.noSnapshot }
+        return lastSnapshot
+    }
+
+    /// Looks an element up by normalized `ref`; otherwise by `index`, matched first against the
+    /// semantic id and then as a zero-based position in `snapshot.records`.
+    private func findRecord(ref: String?, index: Int?, in snapshot: AppSnapshot) -> AXElementRecord? {
+        if let ref {
+            let normalized = normalizeRef(ref)
+            return snapshot.records.first { $0.semantic.ref == normalized }
+        }
+        if let index {
+            if let byID = snapshot.records.first(where: { $0.semantic.id == index }) {
+                return byID
+            }
+            if index >= 0 && index < snapshot.records.count {
+                return snapshot.records[index]
+            }
+        }
+        return nil
+    }
+
+    /// Normalizes "{e1}", "e1" and "1" to the "{e1}" form.
+    private func normalizeRef(_ raw: String) -> String {
+        let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
+        if trimmed.hasPrefix("{e"), trimmed.hasSuffix("}") { return trimmed }
+        if trimmed.hasPrefix("e") { return "{\(trimmed)}" }
+        return "{e\(trimmed)}"
+    }
+
+    /// Maps a direction name to wheel deltas; unknown or missing directions scroll down.
+    private func scrollDeltas(direction: String?, amount: Int32) -> (x: Int32, y: Int32) {
+        switch direction?.lowercased() ?? "down" {
+        case "up": return (0, amount)
+        case "left": return (amount, 0)
+        case "right": return (-amount, 0)
+        default: return (0, -amount)
+        }
+    }
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/InputService.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/InputService.swift
new file mode 100644
index 0000000000..fcea3c86f6
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/InputService.swift
@@ -0,0 +1,102 @@
+import Foundation
+import CoreGraphics
+
+final class InputService: @unchecked Sendable {
+ /// Moves the real cursor to `point` by posting a mouse-moved event at the HID event tap.
+ ///
+ /// - Throws: `ComputerUseError.eventSourceFailed` / `.eventCreationFailed` when CoreGraphics
+ ///   cannot create the source or the event.
+ func moveMouse(point: CGPoint) throws {
+     guard let hidSource = CGEventSource(stateID: .combinedSessionState) else {
+         throw ComputerUseError.eventSourceFailed
+     }
+     let moveEvent = CGEvent(mouseEventSource: hidSource, mouseType: .mouseMoved, mouseCursorPosition: point, mouseButton: .left)
+     guard let moveEvent else {
+         throw ComputerUseError.eventCreationFailed
+     }
+     moveEvent.post(tap: .cghidEventTap)
+ }
+
+ /// Clicks (or double-clicks) at `point` with foreground HID events.
+ ///
+ /// The cursor is first moved to `point` (best effort), followed by a 50 ms pause; each click is
+ /// then posted as a down/up pair carrying its click-state number.
+ ///
+ /// - Throws: `ComputerUseError.eventSourceFailed` / `.eventCreationFailed` when CoreGraphics
+ ///   cannot create the source or click events; rethrows cancellation from `Task.sleep`.
+ func click(point: CGPoint, doubleClick: Bool = false) async throws {
+     guard let hidSource = CGEventSource(stateID: .combinedSessionState) else {
+         throw ComputerUseError.eventSourceFailed
+     }
+
+     // A failed move event is ignored; the click itself still carries the position.
+     CGEvent(mouseEventSource: hidSource, mouseType: .mouseMoved, mouseCursorPosition: point, mouseButton: .left)?
+         .post(tap: .cghidEventTap)
+     try await Task.sleep(nanoseconds: 50_000_000)
+
+     for clickState in 1...(doubleClick ? 2 : 1) {
+         let press = CGEvent(mouseEventSource: hidSource, mouseType: .leftMouseDown, mouseCursorPosition: point, mouseButton: .left)
+         let release = CGEvent(mouseEventSource: hidSource, mouseType: .leftMouseUp, mouseCursorPosition: point, mouseButton: .left)
+         guard let press, let release else {
+             throw ComputerUseError.eventCreationFailed
+         }
+         let pair = [press, release]
+         for event in pair {
+             event.setIntegerValueField(.mouseEventClickState, value: Int64(clickState))
+         }
+         for event in pair {
+             event.post(tap: .cghidEventTap)
+         }
+     }
+ }
+
+ func typeText(_ text: String) throws {
+ guard let source = CGEventSource(stateID: .combinedSessionState) else {
+ throw ComputerUseError.eventSourceFailed
+ }
+ let units = Array(text.utf16)
+ for start in stride(from: 0, to: units.count, by: 20) {
+ let end = min(start + 20, units.count)
+ let chunk = Array(units[start..= 2 else { return }
+ guard let source = CGEventSource(stateID: .combinedSessionState) else {
+ throw ComputerUseError.eventSourceFailed
+ }
+ guard let down = CGEvent(mouseEventSource: source, mouseType: .leftMouseDown, mouseCursorPosition: path[0], mouseButton: .left) else {
+ throw ComputerUseError.eventCreationFailed
+ }
+ down.post(tap: .cghidEventTap)
+ for point in path.dropFirst().dropLast() {
+ if let drag = CGEvent(mouseEventSource: source, mouseType: .leftMouseDragged, mouseCursorPosition: point, mouseButton: .left) {
+ drag.post(tap: .cghidEventTap)
+ }
+ try await Task.sleep(nanoseconds: 12_000_000)
+ }
+ guard let last = path.last,
+ let up = CGEvent(mouseEventSource: source, mouseType: .leftMouseUp, mouseCursorPosition: last, mouseButton: .left) else {
+ throw ComputerUseError.eventCreationFailed
+ }
+ up.post(tap: .cghidEventTap)
+ }
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/MCPServer.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/MCPServer.swift
new file mode 100644
index 0000000000..6a3ce78ddd
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/MCPServer.swift
@@ -0,0 +1,533 @@
+import AppKit
+import ApplicationServices
+import Foundation
+
+actor MCPServer {
+ private let runtime = ComputerUseRuntime()
+ private let input = InputService()
+
+ /// Runs the stdio JSON-RPC loop: reads one JSON message per line from stdin until EOF and
+ /// dispatches it by `method`.
+ ///
+ /// Handles `initialize`, `notifications/initialized`, `ping`, `tools/list` and `tools/call`.
+ /// Unknown requests (messages with an `id`) get error -32601; unknown notifications are
+ /// ignored. Lines that are not valid JSON get -32700 and JSON that is not an object gets
+ /// -32600, both with a null id.
+ func run() async {
+     log("HandsFree computer-use server starting")
+     while let line = readLine(strippingNewline: true) {
+         guard !line.isEmpty else { continue }
+         guard let data = line.data(using: .utf8),
+               let parsed = try? JSONSerialization.jsonObject(with: data) else {
+             log("Invalid JSON-RPC line")
+             // Tell the client instead of silently dropping the message.
+             respondError(id: nil, code: -32700, message: "Parse error")
+             continue
+         }
+         guard let json = parsed as? [String: Any] else {
+             log("Invalid JSON-RPC line")
+             respondError(id: nil, code: -32600, message: "Invalid Request")
+             continue
+         }
+
+         let id = json["id"]
+         let method = json["method"] as? String ?? ""
+         let params = json["params"] as? [String: Any] ?? [:]
+
+         switch method {
+         case "initialize":
+             respond(id: id, result: [
+                 "protocolVersion": "2025-03-26",
+                 "capabilities": ["tools": [:]],
+                 "serverInfo": ["name": "openwork-handsfree-computer-use", "version": "0.1.0"],
+             ])
+         case "notifications/initialized":
+             break
+         case "ping":
+             // MCP liveness check: the receiver must answer promptly with an empty result.
+             if id != nil {
+                 respond(id: id, result: [String: Any]())
+             }
+         case "tools/list":
+             respond(id: id, result: ["tools": toolSchemas()])
+         case "tools/call":
+             let name = params["name"] as? String ?? ""
+             let args = params["arguments"] as? [String: Any] ?? [:]
+             let content = await executeTool(name: name, args: args)
+             respond(id: id, result: ["content": content])
+         default:
+             // Only requests carry an id; notifications must not be answered.
+             if id != nil {
+                 respondError(id: id, code: -32601, message: "Method not found: \(method)")
+             }
+         }
+     }
+ }
+
+ /// Returns the tool descriptors sent in response to `tools/list`.
+ ///
+ /// Each entry is built by `toolSchema(name:description:properties:)`. No property is marked
+ /// as required in the schemas; defaults are described in the property descriptions.
+ private func toolSchemas() -> [[String: Any]] {
+ [
+ // Semantic, snapshot-based tools.
+ toolSchema(
+ name: "snapshot",
+ description: "Return target-window screenshot plus compact semantic AX state. Uses strict background activation by default.",
+ properties: [
+ "app": ["type": "string", "description": "Optional running app name. Omit for frontmost app."],
+ "strict": ["type": "boolean", "description": "Keep actions on background-safe AX/postToPid paths. Default true."],
+ ]
+ ),
+ toolSchema(
+ name: "click",
+ description: "Click a semantic ref like {e1}, an index, or screenshot x/y. AX is tried first; strict mode only falls back to background postToPid.",
+ properties: [
+ "ref": ["type": "string", "description": "Semantic ref from snapshot, e.g. {e1}."],
+ "index": ["type": "number", "description": "Element id or zero-based compatibility index."],
+ "x": ["type": "number", "description": "Screenshot x coordinate."],
+ "y": ["type": "number", "description": "Screenshot y coordinate."],
+ "click_count": ["type": "number", "description": "1 or 2. Default 1."],
+ "strict": ["type": "boolean", "description": "Override strict mode for this action."],
+ ]
+ ),
+ toolSchema(
+ name: "type_text",
+ description: "Type text into the target process. In strict mode this uses CGEvent.postToPid and does not move the real cursor.",
+ properties: [
+ "text": ["type": "string", "description": "Text to type."],
+ "strict": ["type": "boolean", "description": "Override strict mode for this action."],
+ ]
+ ),
+ toolSchema(
+ name: "press_key",
+ description: "Press a key combo such as command+k, return, tab, or escape.",
+ properties: [
+ "combo": ["type": "string", "description": "Key combo."],
+ "strict": ["type": "boolean", "description": "Override strict mode for this action."],
+ ]
+ ),
+ toolSchema(
+ name: "scroll",
+ description: "Scroll the target window without foregrounding it in strict mode.",
+ properties: [
+ "direction": ["type": "string", "description": "up, down, left, or right."],
+ "pages": ["type": "number", "description": "Approximate page count. Default 1."],
+ "x": ["type": "number", "description": "Optional screenshot x coordinate."],
+ "y": ["type": "number", "description": "Optional screenshot y coordinate."],
+ "strict": ["type": "boolean", "description": "Override strict mode for this action."],
+ ]
+ ),
+ toolSchema(
+ name: "set_value",
+ description: "Set a semantic AX element value directly. This stays background-safe.",
+ properties: [
+ "ref": ["type": "string", "description": "Semantic ref from snapshot."],
+ "index": ["type": "number", "description": "Element id or zero-based compatibility index."],
+ "value": ["type": "string", "description": "Value to set."],
+ ]
+ ),
+ toolSchema(
+ name: "perform_action",
+ description: "Perform a named AX action such as AXPress, AXShowMenu, AXIncrement, or AXDecrement.",
+ properties: [
+ "ref": ["type": "string", "description": "Semantic ref from snapshot."],
+ "index": ["type": "number", "description": "Element id or zero-based compatibility index."],
+ "action": ["type": "string", "description": "AX action name. Default AXPress."],
+ ]
+ ),
+ toolSchema(
+ name: "wait",
+ description: "Wait for UI to settle.",
+ properties: ["milliseconds": ["type": "number", "description": "Wait time. Default 1000."]]
+ ),
+ toolSchema(
+ name: "set_strict_mode",
+ description: "Enable or disable strict background mode. Strict mode rejects foreground fallbacks.",
+ properties: ["enabled": ["type": "boolean", "description": "Whether strict background mode is enabled."]]
+ ),
+ // System utilities and the compatibility ("cua_*") tools.
+ toolSchema(name: "check_permissions", description: "Check Accessibility and Screen Recording permission status.", properties: [:]),
+ toolSchema(name: "get_app_state", description: "Compatibility alias for snapshot.", properties: ["app": ["type": "string"], "strict": ["type": "boolean"]]),
+ toolSchema(name: "launch_app", description: "Launch a macOS app by name.", properties: ["name": ["type": "string"]]),
+ toolSchema(name: "activate_app", description: "Bring a running macOS app to the foreground.", properties: ["name": ["type": "string"]]),
+ toolSchema(name: "list_apps", description: "List running regular macOS apps.", properties: [:]),
+ toolSchema(name: "open_url", description: "Open a URL in the default browser or a specific browser app.", properties: ["url": ["type": "string"], "app": ["type": "string"]]),
+ toolSchema(name: "clipboard_read", description: "Read text from the macOS clipboard.", properties: [:]),
+ toolSchema(name: "clipboard_write", description: "Write text to the macOS clipboard.", properties: ["text": ["type": "string"]]),
+ toolSchema(name: "display_info", description: "Return main display logical dimensions and scale factor.", properties: [:]),
+ toolSchema(name: "cua_screenshot", description: "Compatibility full-screen screenshot for CUA loops. Returns logical-size PNG.", properties: [:]),
+ toolSchema(name: "cua_click", description: "Compatibility click at absolute screen coordinates.", properties: ["x": ["type": "number"], "y": ["type": "number"]]),
+ toolSchema(name: "cua_double_click", description: "Compatibility double-click at absolute screen coordinates.", properties: ["x": ["type": "number"], "y": ["type": "number"]]),
+ toolSchema(name: "cua_move", description: "Compatibility mouse move to absolute screen coordinates.", properties: ["x": ["type": "number"], "y": ["type": "number"]]),
+ toolSchema(name: "cua_type", description: "Compatibility text typing into focused input.", properties: ["text": ["type": "string"]]),
+ toolSchema(name: "cua_keypress", description: "Compatibility keypress using CUA key names.", properties: ["keys": ["type": "array", "items": ["type": "string"]]]),
+ toolSchema(name: "cua_scroll", description: "Compatibility scroll at absolute screen coordinates.", properties: ["x": ["type": "number"], "y": ["type": "number"], "scroll_x": ["type": "number"], "scroll_y": ["type": "number"]]),
+ toolSchema(name: "cua_drag", description: "Compatibility drag over an array of [x,y] points.", properties: ["path": ["type": "array"]]),
+ toolSchema(name: "cua_wait", description: "Compatibility wait for UI to settle.", properties: [:]),
+ ]
+ }
+
+ /// Builds one tool descriptor: `name`, `description` and an object-typed `inputSchema`
+ /// whose `properties` are the given dictionary.
+ private func toolSchema(name: String, description: String, properties: [String: Any]) -> [String: Any] {
+     let inputSchema: [String: Any] = [
+         "type": "object",
+         "properties": properties,
+     ]
+     var schema: [String: Any] = [:]
+     schema["name"] = name
+     schema["description"] = description
+     schema["inputSchema"] = inputSchema
+     return schema
+ }
+
+ /// Runs the tool identified by `name` with the decoded argument map and
+ /// returns the content items to send back to the client.
+ ///
+ /// Every error thrown by a handler is converted into an `ok: false` payload via
+ /// `errorPayload`, so this method itself never throws. Unknown tool names
+ /// produce an `ok: false` payload with an "Unknown tool" message.
+ ///
+ /// - Parameters:
+ ///   - name: Tool name to dispatch on.
+ ///   - args: Argument dictionary; values are read leniently through
+ ///     `intArg` / `doubleArg` / `boolArg`.
+ /// - Returns: Content items (text, plus an image for snapshot/screenshot tools).
+ private func executeTool(name: String, args: [String: Any]) async -> [[String: Any]] {
+     do {
+         switch name {
+         case "snapshot", "get_app_state":
+             return try await snapshotResult(args: args)
+         case "click":
+             let metadata = try await runtime.click(
+                 ref: args["ref"] as? String,
+                 index: intArg(args, "index"),
+                 imageX: doubleArg(args, "x"),
+                 imageY: doubleArg(args, "y"),
+                 clickCount: intArg(args, "click_count") ?? 1,
+                 strict: boolArg(args, "strict")
+             )
+             return jsonResult(metadata.dictionary)
+         case "type_text":
+             let metadata = try await runtime.typeText(args["text"] as? String ?? "", strict: boolArg(args, "strict"))
+             return jsonResult(metadata.dictionary)
+         case "press_key":
+             let metadata = try await runtime.pressKey(args["combo"] as? String ?? "", strict: boolArg(args, "strict"))
+             return jsonResult(metadata.dictionary)
+         case "scroll":
+             let metadata = try await runtime.scroll(
+                 direction: args["direction"] as? String,
+                 pages: doubleArg(args, "pages") ?? 1,
+                 imageX: doubleArg(args, "x"),
+                 imageY: doubleArg(args, "y"),
+                 strict: boolArg(args, "strict")
+             )
+             return jsonResult(metadata.dictionary)
+         case "set_value":
+             let metadata = try await runtime.setValue(ref: args["ref"] as? String, index: intArg(args, "index"), value: args["value"] as? String ?? "")
+             return jsonResult(metadata.dictionary)
+         case "perform_action":
+             let metadata = try await runtime.performAction(ref: args["ref"] as? String, index: intArg(args, "index"), action: args["action"] as? String ?? kAXPressAction)
+             return jsonResult(metadata.dictionary)
+         case "wait":
+             let metadata = await runtime.wait(milliseconds: intArg(args, "milliseconds") ?? 1000)
+             return jsonResult(metadata.dictionary)
+         case "set_strict_mode":
+             let metadata = await runtime.setStrictMode(boolArg(args, "enabled") ?? true)
+             return jsonResult(metadata.dictionary)
+         case "check_permissions":
+             return jsonResult(checkPermissions())
+         case "launch_app":
+             return try await jsonResult(handleLaunchApp(args: args))
+         case "activate_app":
+             return jsonResult(handleActivateApp(args: args))
+         case "list_apps":
+             return jsonResult(["ok": true, "apps": runningApps().compactMap(\.localizedName).sorted()])
+         case "open_url":
+             return try await jsonResult(handleOpenURL(args: args))
+         case "clipboard_read":
+             return jsonResult(["ok": true, "text": NSPasteboard.general.string(forType: .string) ?? ""])
+         case "clipboard_write":
+             let pasteboard = NSPasteboard.general
+             pasteboard.clearContents()
+             pasteboard.setString(args["text"] as? String ?? "", forType: .string)
+             return jsonResult(["ok": true])
+         case "display_info":
+             return jsonResult(displayInfo())
+         case "cua_screenshot":
+             return try cuaScreenshotResult()
+         case "cua_click":
+             try await input.click(point: CGPoint(x: intArg(args, "x") ?? 0, y: intArg(args, "y") ?? 0))
+             return jsonResult(["ok": true])
+         case "cua_double_click":
+             try await input.click(point: CGPoint(x: intArg(args, "x") ?? 0, y: intArg(args, "y") ?? 0), doubleClick: true)
+             return jsonResult(["ok": true])
+         case "cua_move":
+             try input.moveMouse(point: CGPoint(x: intArg(args, "x") ?? 0, y: intArg(args, "y") ?? 0))
+             return jsonResult(["ok": true])
+         case "cua_type":
+             try input.typeText(args["text"] as? String ?? "")
+             return jsonResult(["ok": true])
+         case "cua_keypress":
+             try input.pressKey(cuaKeysToCombo(args["keys"] as? [String] ?? []))
+             return jsonResult(["ok": true])
+         case "cua_scroll":
+             // Clamp before narrowing: Int32(_:) traps on out-of-range input, and
+             // the symmetric bound keeps the sign flip below from overflowing.
+             let limit = Int(Int32.max)
+             let scrollX = min(max(intArg(args, "scroll_x") ?? 0, -limit), limit)
+             let scrollY = min(max(intArg(args, "scroll_y") ?? 0, -limit), limit)
+             try input.scroll(
+                 point: CGPoint(x: intArg(args, "x") ?? 0, y: intArg(args, "y") ?? 0),
+                 deltaX: Int32(scrollX),
+                 // The vertical delta is passed with its sign inverted.
+                 deltaY: Int32(-scrollY)
+             )
+             return jsonResult(["ok": true])
+         case "cua_drag":
+             try await input.drag(path: parsePointPath(args["path"]))
+             return jsonResult(["ok": true])
+         case "cua_wait":
+             // Fixed one-second pause; this tool takes no duration argument.
+             try await Task.sleep(nanoseconds: 1_000_000_000)
+             return jsonResult(["ok": true])
+         default:
+             return jsonResult(["ok": false, "error": "Unknown tool: \(name)"])
+         }
+     } catch {
+         return jsonResult(errorPayload(error))
+     }
+ }
+
+ /// Takes a snapshot through the runtime and packages it as an image item
+ /// followed by a JSON text item.
+ ///
+ /// - Parameter args: Reads the optional `app` name and the optional `strict` flag.
+ /// - Returns: `[image, text]` content items, or a single text item explaining
+ ///   that the payload could not be serialized.
+ /// - Throws: Whatever `runtime.snapshot` throws.
+ private func snapshotResult(args: [String: Any]) async throws -> [[String: Any]] {
+     let appName = args["app"] as? String
+     let snapshot = try await runtime.snapshot(appName: appName, strict: boolArg(args, "strict"))
+     if let text = jsonString(snapshotPayload(snapshot)) {
+         let imageItem: [String: Any] = [
+             "type": "image",
+             "data": snapshot.screenshotData.base64EncodedString(),
+             "mimeType": snapshot.screenshotMimeType,
+         ]
+         let textItem: [String: Any] = ["type": "text", "text": text]
+         return [imageItem, textItem]
+     }
+     return textResult("Failed to serialize semantic AX snapshot.")
+ }
+
+ /// Converts a snapshot into the JSON-compatible dictionary returned to clients.
+ ///
+ /// Each element dictionary gains a `center` entry holding the element's centre
+ /// in screen coordinates and in screenshot-image coordinates. `windowNumber`
+ /// is included only when the snapshot has one.
+ ///
+ /// - Parameter snapshot: The snapshot to describe.
+ /// - Returns: A dictionary of plain JSON-serializable values.
+ private func snapshotPayload(_ snapshot: AppSnapshot) -> [String: Any] {
+     var elements: [[String: Any]] = []
+     for element in snapshot.elements {
+         let screenCenter = element.frame.center
+         let imageCenter = snapshot.screenshotMeta.toImage(point: screenCenter)
+         var entry = element.dictionary
+         entry["center"] = [
+             "screenX": Int(screenCenter.x),
+             "screenY": Int(screenCenter.y),
+             "imageX": Int(imageCenter.x),
+             "imageY": Int(imageCenter.y),
+         ]
+         elements.append(entry)
+     }
+
+     let defaultPath = snapshot.strictMode ? "accessibility_then_background_cgevent" : "accessibility_then_foreground_fallback"
+     let execution: [String: Any] = [
+         "strictMode": snapshot.strictMode,
+         "backgroundActivated": snapshot.backgroundActivated,
+         "defaultPath": defaultPath,
+     ]
+
+     var payload: [String: Any] = [
+         "ok": true,
+         "semanticAXVersion": 1,
+         "app": snapshot.appName,
+         "pid": Int(snapshot.pid),
+         "windowTitle": snapshot.windowTitle ?? "",
+         "screenshot": snapshot.screenshotMeta.dictionary,
+         "execution": execution,
+         "elements": elements,
+         "hint": "Use refs like {e1}. Prefer AX-capable refs; strict mode rejects foreground fallback and reports path metadata after every action.",
+     ]
+     if let number = snapshot.windowNumber {
+         payload["windowNumber"] = number
+     }
+     return payload
+ }
+
+ /// Reports whether the process currently holds the two macOS permissions the
+ /// tools rely on.
+ ///
+ /// Screen recording is queried with `CGPreflightScreenCaptureAccess()`, which
+ /// reports the grant directly. A probe capture is not a dependable signal,
+ /// because a capture call can still return a non-nil image when access has
+ /// not been granted.
+ ///
+ /// - Returns: `["ok": true, "accessibility": Bool, "screenRecording": Bool]`.
+ private func checkPermissions() -> [String: Any] {
+     let screenRecording = CGPreflightScreenCaptureAccess()
+     return ["ok": true, "accessibility": AXIsProcessTrusted(), "screenRecording": screenRecording]
+ }
+
+ /// Activates a running app looked up by the `name` argument.
+ ///
+ /// - Parameter args: Reads `name`; a missing or non-string value is treated as "".
+ /// - Returns: `ok: true` with the app's localized name (or the requested name),
+ ///   or `ok: false` with an error when no running app matches.
+ private func handleActivateApp(args: [String: Any]) -> [String: Any] {
+     let name = args["name"] as? String ?? ""
+     if let app = runningApp(named: name) {
+         app.activate()
+         return ["ok": true, "app": app.localizedName ?? name]
+     }
+     return ["ok": false, "error": "App '\(name)' is not running."]
+ }
+
+ /// Launches an app by name, or activates it when it is already running.
+ ///
+ /// - Parameter args: Reads `name`; surrounding whitespace is trimmed.
+ /// - Returns: `ok: true` with the app name (plus `alreadyRunning: true` when
+ ///   the app was only activated), or `ok: false` with an error when the name
+ ///   is empty or no application could be located.
+ /// - Throws: Errors from `NSWorkspace.openApplication(at:configuration:)`.
+ private func handleLaunchApp(args: [String: Any]) async throws -> [String: Any] {
+     let requested = (args["name"] as? String ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
+     if requested.isEmpty {
+         return ["ok": false, "error": "App name is required."]
+     }
+
+     if let running = runningApp(named: requested) {
+         running.activate()
+         return ["ok": true, "app": running.localizedName ?? requested, "alreadyRunning": true]
+     }
+
+     guard let location = applicationURL(named: requested) else {
+         return ["ok": false, "error": "App '\(requested)' was not found."]
+     }
+
+     let configuration = NSWorkspace.OpenConfiguration()
+     configuration.activates = true
+     let launched = try await NSWorkspace.shared.openApplication(at: location, configuration: configuration)
+     return ["ok": true, "app": launched.localizedName ?? requested]
+ }
+
+ /// Opens a URL, either with a named application or with the default handler.
+ ///
+ /// - Parameter args: Reads `url` (required string) and `app` (optional app
+ ///   name; ignored when blank after trimming).
+ /// - Returns: `ok: true` on success. Every failure carries an `error` message,
+ ///   including the case where the default handler declines the URL.
+ /// - Throws: Errors from `NSWorkspace.open(_:withApplicationAt:configuration:)`.
+ private func handleOpenURL(args: [String: Any]) async throws -> [String: Any] {
+     guard let rawURL = args["url"] as? String, let url = URL(string: rawURL) else {
+         return ["ok": false, "error": "Invalid URL."]
+     }
+     if let appName = args["app"] as? String, !appName.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+         guard let appURL = applicationURL(named: appName) else {
+             return ["ok": false, "error": "App '\(appName)' was not found."]
+         }
+         _ = try await NSWorkspace.shared.open([url], withApplicationAt: appURL, configuration: NSWorkspace.OpenConfiguration())
+         return ["ok": true]
+     }
+     // `open(_:)` reports failure only through its Bool result, so attach a
+     // message to keep this failure shaped like the others.
+     guard NSWorkspace.shared.open(url) else {
+         return ["ok": false, "error": "Failed to open URL."]
+     }
+     return ["ok": true]
+ }
+
+ /// Describes `NSScreen.main`.
+ ///
+ /// - Returns: `ok: true` with the integer `width` and `height` of the screen
+ ///   frame and its `scale_factor`, or `ok: false` with an error when there is
+ ///   no main screen.
+ private func displayInfo() -> [String: Any] {
+     if let screen = NSScreen.main {
+         let frame = screen.frame
+         return [
+             "ok": true,
+             "width": Int(frame.width),
+             "height": Int(frame.height),
+             "scale_factor": screen.backingScaleFactor,
+         ]
+     }
+     return ["ok": false, "error": "No main screen."]
+ }
+
+ /// Captures the primary display and returns it as a PNG scaled to the
+ /// display's logical (point) size.
+ ///
+ /// Only the primary display is captured. Capturing every display and then
+ /// squeezing the result into one screen's dimensions would distort coordinates
+ /// on multi-monitor setups, so image pixels would no longer line up with points.
+ /// NOTE(review): assumes the `cua_*` input tools take global display
+ /// coordinates with the primary display at the origin — confirm against the
+ /// input layer.
+ ///
+ /// - Returns: A text item with `ok`, `width`, `height`, followed by the PNG image item.
+ /// - Throws: `ComputerUseError.screenshotFailed` if the display has no area or
+ ///   any capture/encoding step fails.
+ private func cuaScreenshotResult() throws -> [[String: Any]] {
+     // Bounds of the primary display in the global display coordinate space.
+     let displayBounds = CGDisplayBounds(CGMainDisplayID())
+     let logicalWidth = Int(displayBounds.width)
+     let logicalHeight = Int(displayBounds.height)
+     guard logicalWidth > 0, logicalHeight > 0 else { throw ComputerUseError.screenshotFailed }
+     guard let cgImage = CGWindowListCreateImage(displayBounds, .optionOnScreenOnly, kCGNullWindowID, [.bestResolution]) else {
+         throw ComputerUseError.screenshotFailed
+     }
+     // Redraw into a bitmap of logical size so one image pixel equals one point.
+     guard let rep = NSBitmapImageRep(
+         bitmapDataPlanes: nil,
+         pixelsWide: logicalWidth,
+         pixelsHigh: logicalHeight,
+         bitsPerSample: 8,
+         samplesPerPixel: 4,
+         hasAlpha: true,
+         isPlanar: false,
+         colorSpaceName: .deviceRGB,
+         bytesPerRow: 0,
+         bitsPerPixel: 0
+     ) else {
+         throw ComputerUseError.screenshotFailed
+     }
+     rep.size = NSSize(width: logicalWidth, height: logicalHeight)
+     guard let context = NSGraphicsContext(bitmapImageRep: rep) else {
+         throw ComputerUseError.screenshotFailed
+     }
+     NSGraphicsContext.saveGraphicsState()
+     NSGraphicsContext.current = context
+     NSImage(cgImage: cgImage, size: NSSize(width: cgImage.width, height: cgImage.height))
+         .draw(in: NSRect(x: 0, y: 0, width: logicalWidth, height: logicalHeight))
+     NSGraphicsContext.restoreGraphicsState()
+     guard let png = rep.representation(using: .png, properties: [:]) else {
+         throw ComputerUseError.screenshotFailed
+     }
+     return [
+         ["type": "text", "text": jsonString(["ok": true, "width": logicalWidth, "height": logicalHeight]) ?? "{\"ok\":true}"],
+         ["type": "image", "data": png.base64EncodedString(), "mimeType": "image/png"],
+     ]
+ }
+
+ /// Lists running applications whose activation policy is `.regular`.
+ private func runningApps() -> [NSRunningApplication] {
+     let everyApp = NSWorkspace.shared.runningApplications
+     return everyApp.filter { app in app.activationPolicy == .regular }
+ }
+
+ /// Finds a running app by name, ignoring case and surrounding whitespace.
+ ///
+ /// An exact match on the localized name wins; otherwise the first app whose
+ /// localized name contains the query is returned.
+ ///
+ /// - Parameter name: App name to look for.
+ /// - Returns: The matching app, or `nil` when nothing matches or the query is
+ ///   blank.
+ private func runningApp(named name: String) -> NSRunningApplication? {
+     let needle = name.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
+     // A blank query must not fall through to the substring match, where an
+     // empty needle could select an arbitrary app.
+     guard !needle.isEmpty else { return nil }
+     // Read the app list once so both passes see the same set of apps.
+     let candidates = runningApps()
+     if let exact = candidates.first(where: { $0.localizedName?.lowercased() == needle }) {
+         return exact
+     }
+     return candidates.first { $0.localizedName?.lowercased().contains(needle) == true }
+ }
+
+ /// Resolves an application name to the URL of its bundle.
+ ///
+ /// Lookup order: the bundle identifier from `bundleId(for:)`, then
+ /// `NSWorkspace.fullPath(forApplication:)`, then a fixed list of application
+ /// directories checked for `<name>.app`.
+ ///
+ /// - Parameter name: App name; surrounding whitespace is trimmed.
+ /// - Returns: The first URL found, or `nil`.
+ private func applicationURL(named name: String) -> URL? {
+     let appName = name.trimmingCharacters(in: .whitespacesAndNewlines)
+     let workspace = NSWorkspace.shared
+
+     if let byBundleId = workspace.urlForApplication(withBundleIdentifier: bundleId(for: appName)) {
+         return byBundleId
+     }
+     if let resolvedPath = workspace.fullPath(forApplication: appName) {
+         return URL(fileURLWithPath: resolvedPath)
+     }
+
+     let searchPaths = [
+         "/Applications/\(appName).app",
+         "/System/Applications/\(appName).app",
+         "/Applications/Utilities/\(appName).app",
+         NSString(string: "~/Applications/\(appName).app").expandingTildeInPath,
+     ]
+     for path in searchPaths {
+         let candidate = URL(fileURLWithPath: path)
+         if FileManager.default.fileExists(atPath: candidate.path) {
+             return candidate
+         }
+     }
+     return nil
+ }
+
+ /// Maps a handful of well-known app names (case-insensitive) to bundle
+ /// identifiers.
+ ///
+ /// - Parameter appName: App name to look up.
+ /// - Returns: The bundle identifier, or "" when the name is not in the table.
+ private func bundleId(for appName: String) -> String {
+     let knownBundleIds: [String: String] = [
+         "safari": "com.apple.Safari",
+         "google chrome": "com.google.Chrome",
+         "chrome": "com.google.Chrome",
+         "arc": "company.thebrowser.Browser",
+         "microsoft edge": "com.microsoft.edgemac",
+         "edge": "com.microsoft.edgemac",
+         "brave": "com.brave.Browser",
+         "brave browser": "com.brave.Browser",
+         "slack": "com.tinyspeck.slackmacgap",
+     ]
+     return knownBundleIds[appName.lowercased()] ?? ""
+ }
+
+ /// Converts a list of key names into a single `+`-joined combo string.
+ ///
+ /// Keys are lowercased and a fixed alias table is applied. Note that `ctrl`
+ /// and `control` map to `command`, as do `meta`, `super`, `win`, and `cmd`.
+ /// Keys without an alias pass through lowercased.
+ ///
+ /// - Parameter keys: Key names in press order.
+ /// - Returns: The combo string; "" for an empty list.
+ private func cuaKeysToCombo(_ keys: [String]) -> String {
+     let aliases: [String: String] = [
+         "ctrl": "command",
+         "control": "command",
+         "meta": "command",
+         "super": "command",
+         "win": "command",
+         "cmd": "command",
+         "alt": "option",
+         "arrowup": "up",
+         "arrowdown": "down",
+         "arrowleft": "left",
+         "arrowright": "right",
+         "backspace": "delete",
+         " ": "space",
+     ]
+     var parts: [String] = []
+     for key in keys {
+         let lowered = key.lowercased()
+         parts.append(aliases[lowered] ?? lowered)
+     }
+     return parts.joined(separator: "+")
+ }
+
+ /// Decodes a drag path from loosely typed JSON.
+ ///
+ /// Each entry may be an `[x, y]` array (extra components are ignored) or an
+ /// object with `x` and `y` members. The object form matters because the JS
+ /// runner forwards the model's drag path unchanged. Entries that match
+ /// neither shape, or whose coordinates are not numeric, are skipped.
+ ///
+ /// - Parameter raw: The decoded `path` argument, if any.
+ /// - Returns: The decoded points in order; empty when `raw` is not an array.
+ private func parsePointPath(_ raw: Any?) -> [CGPoint] {
+     guard let entries = raw as? [Any] else { return [] }
+     return entries.compactMap { entry -> CGPoint? in
+         if let pair = entry as? [Any] {
+             guard pair.count >= 2,
+                   let x = valueAsDouble(pair[0]),
+                   let y = valueAsDouble(pair[1]) else { return nil }
+             return CGPoint(x: x, y: y)
+         }
+         if let object = entry as? [String: Any],
+            let rawX = object["x"], let rawY = object["y"],
+            let x = valueAsDouble(rawX), let y = valueAsDouble(rawY) {
+             return CGPoint(x: x, y: y)
+         }
+         return nil
+     }
+ }
+
+ /// Reads a loosely typed value as a `Double`.
+ ///
+ /// - Parameter value: A `Double`, an `Int`, or a numeric `String`.
+ /// - Returns: The numeric value, or `nil` for any other type or an
+ ///   unparseable string.
+ private func valueAsDouble(_ value: Any) -> Double? {
+     switch value {
+     case let number as Double:
+         return number
+     case let number as Int:
+         return Double(number)
+     case let text as String:
+         return Double(text)
+     default:
+         return nil
+     }
+ }
+
+ /// Writes a JSON-RPC 2.0 success response.
+ ///
+ /// - Parameters:
+ ///   - id: Request id to echo back; the `id` member is omitted when `nil`.
+ ///   - result: Value stored under `result`.
+ private func respond(id: Any?, result: Any) {
+     var envelope: [String: Any] = ["jsonrpc": "2.0"]
+     envelope["result"] = result
+     if let requestId = id {
+         envelope["id"] = requestId
+     }
+     writeLine(envelope)
+ }
+
+ /// Writes a JSON-RPC 2.0 error response.
+ ///
+ /// The `id` member is always present: the request id when known, otherwise
+ /// JSON `null`, which is what JSON-RPC 2.0 requires when the id could not be
+ /// determined.
+ ///
+ /// - Parameters:
+ ///   - id: Request id to echo back, or `nil` when unknown.
+ ///   - code: Numeric error code.
+ ///   - message: Short error description.
+ private func respondError(id: Any?, code: Int, message: String) {
+     let response: [String: Any] = [
+         "jsonrpc": "2.0",
+         "error": ["code": code, "message": message],
+         "id": id ?? NSNull(),
+     ]
+     writeLine(response)
+ }
+
+ /// Serializes `object` to JSON and prints it as one line on standard output.
+ /// Nothing is written when serialization fails.
+ private func writeLine(_ object: [String: Any]) {
+     if let line = jsonString(object) {
+         print(line)
+     }
+ }
+
+ /// Wraps a string in a single-element content list of type `text`.
+ private func textResult(_ text: String) -> [[String: Any]] {
+     let item: [String: Any] = ["type": "text", "text": text]
+     return [item]
+ }
+
+ /// Serializes `payload` to JSON and wraps it as a text content item.
+ /// A fixed `ok: false` JSON string is used when serialization fails.
+ private func jsonResult(_ payload: [String: Any]) -> [[String: Any]] {
+     let fallback = "{\"ok\":false,\"error\":\"Failed to serialize result.\"}"
+     let body = jsonString(payload) ?? fallback
+     return textResult(body)
+ }
+
+ /// Encodes a JSON-compatible value as a UTF-8 string.
+ ///
+ /// - Parameter value: A value accepted by `JSONSerialization`.
+ /// - Returns: The JSON text, or `nil` when the value is not a valid JSON
+ ///   object or encoding fails.
+ private func jsonString(_ value: Any) -> String? {
+     if !JSONSerialization.isValidJSONObject(value) {
+         return nil
+     }
+     guard let encoded = try? JSONSerialization.data(withJSONObject: value) else {
+         return nil
+     }
+     return String(data: encoded, encoding: .utf8)
+ }
+
+ /// Turns an error into an `ok: false` payload.
+ ///
+ /// `permissionNeeded` is added when the localized description mentions
+ /// accessibility, screenshots, or screen recording. The screen-recording
+ /// check runs last and therefore wins when both match.
+ ///
+ /// - Parameter error: The error to describe.
+ /// - Returns: A dictionary with `ok`, `error`, and optionally `permissionNeeded`.
+ private func errorPayload(_ error: Error) -> [String: Any] {
+     let description = error.localizedDescription
+     func mentions(_ term: String) -> Bool {
+         description.localizedCaseInsensitiveContains(term)
+     }
+
+     var result: [String: Any] = ["ok": false, "error": description]
+     if mentions("accessibility") {
+         result["permissionNeeded"] = "accessibility"
+     }
+     if mentions("screenshot") || mentions("screen recording") {
+         result["permissionNeeded"] = "screen-recording"
+     }
+     return result
+ }
+
+ /// Writes a prefixed diagnostic line to standard error.
+ private func log(_ message: String) {
+     let line = "[HandsFreeComputerUse] \(message)\n"
+     fputs(line, stderr)
+ }
+
+ /// Reads an integer argument leniently.
+ ///
+ /// Accepts an `Int`, a `Double` (truncated toward zero), or a decimal
+ /// `String`.
+ ///
+ /// - Parameters:
+ ///   - args: Argument dictionary.
+ ///   - key: Key to read.
+ /// - Returns: The integer, or `nil` when the key is missing, has another type,
+ ///   or holds a number that cannot be represented as `Int` (NaN, infinity, or
+ ///   out of range).
+ private func intArg(_ args: [String: Any], _ key: String) -> Int? {
+     guard let raw = args[key] else { return nil }
+     if let value = raw as? Int { return value }
+     if let value = raw as? Double {
+         // `Int(_:)` traps on NaN, infinities, and magnitudes beyond Int's range
+         // (e.g. a JSON 1e300). `Int(exactly:)` on the truncated value keeps the
+         // same rounding and returns nil for those inputs.
+         return Int(exactly: value.rounded(.towardZero))
+     }
+     if let value = raw as? String { return Int(value) }
+     return nil
+ }
+
+ /// Reads a floating-point argument leniently.
+ ///
+ /// - Parameters:
+ ///   - args: Argument dictionary.
+ ///   - key: Key to read.
+ /// - Returns: The value as `Double` when it is a `Double`, an `Int`, or a
+ ///   numeric `String`; otherwise `nil`.
+ private func doubleArg(_ args: [String: Any], _ key: String) -> Double? {
+     guard let raw = args[key] else { return nil }
+     switch raw {
+     case let number as Double:
+         return number
+     case let number as Int:
+         return Double(number)
+     case let text as String:
+         return Double(text)
+     default:
+         return nil
+     }
+ }
+
+ /// Reads a boolean argument leniently.
+ ///
+ /// - Parameters:
+ ///   - args: Argument dictionary.
+ ///   - key: Key to read.
+ /// - Returns: The value when it is a `Bool` or exactly the string "true" or
+ ///   "false"; otherwise `nil`.
+ private func boolArg(_ args: [String: Any], _ key: String) -> Bool? {
+     guard let raw = args[key] else { return nil }
+     if let flag = raw as? Bool {
+         return flag
+     }
+     guard let text = raw as? String else { return nil }
+     switch text {
+     case "true":
+         return true
+     case "false":
+         return false
+     default:
+         return nil
+     }
+ }
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/Types.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/Types.swift
new file mode 100644
index 0000000000..a1498a411e
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/Types.swift
@@ -0,0 +1,220 @@
+import AppKit
+import ApplicationServices
+
+ /// Integer rectangle describing where an element sits.
+ struct ElementFrame: Sendable {
+     let x: Int
+     let y: Int
+     let width: Int
+     let height: Int
+
+     /// Centre of the rectangle, computed with integer division.
+     var center: CGPoint {
+         let midX = x + width / 2
+         let midY = y + height / 2
+         return CGPoint(x: midX, y: midY)
+     }
+
+     /// JSON-compatible form with `x`, `y`, `width`, and `height`.
+     var dictionary: [String: Any] {
+         var result: [String: Any] = [:]
+         result["x"] = x
+         result["y"] = y
+         result["width"] = width
+         result["height"] = height
+         return result
+     }
+ }
+
+ /// Optional boolean state flags for an element; `nil` means the flag is not
+ /// recorded.
+ struct AXElementState: Sendable {
+     let enabled: Bool?
+     let focused: Bool?
+     let selected: Bool?
+     let expanded: Bool?
+     let checked: Bool?
+
+     /// JSON-compatible form containing only the flags that are non-nil.
+     var dictionary: [String: Any] {
+         let flags: [(String, Bool?)] = [
+             ("enabled", enabled),
+             ("focused", focused),
+             ("selected", selected),
+             ("expanded", expanded),
+             ("checked", checked),
+         ]
+         var result: [String: Any] = [:]
+         for case let (key, flag?) in flags {
+             result[key] = flag
+         }
+         return result
+     }
+ }
+
+ /// What can be done to an element, plus the list of action names it reports.
+ struct AXElementCapabilities: Sendable {
+     let canPress: Bool
+     let canFocus: Bool
+     let canScroll: Bool
+     let canAdjust: Bool
+     let canSetValue: Bool
+     let actions: [String]
+
+     /// JSON-compatible form keyed by `press`, `focus`, `scroll`, `adjust`,
+     /// `setValue`, and `actions`.
+     var dictionary: [String: Any] {
+         var result: [String: Any] = [:]
+         result["press"] = canPress
+         result["focus"] = canFocus
+         result["scroll"] = canScroll
+         result["adjust"] = canAdjust
+         result["setValue"] = canSetValue
+         result["actions"] = actions
+         return result
+     }
+ }
+
+ /// Serializable description of one accessibility element in a snapshot.
+ struct SemanticAXElement: Identifiable, Sendable {
+     let id: Int
+     let ref: String
+     let role: String
+     let label: String
+     let value: String?
+     let frame: ElementFrame
+     let state: AXElementState
+     let capabilities: AXElementCapabilities
+
+     /// JSON-compatible form; `value` is included only when it is non-nil.
+     var dictionary: [String: Any] {
+         var result: [String: Any] = [:]
+         result["id"] = id
+         result["ref"] = ref
+         result["role"] = role
+         result["label"] = label
+         result["frame"] = frame.dictionary
+         result["state"] = state.dictionary
+         result["capabilities"] = capabilities.dictionary
+         if let currentValue = value {
+             result["value"] = currentValue
+         }
+         return result
+     }
+ }
+
+ /// Pairs an accessibility element handle with its serializable description.
+ ///
+ /// NOTE(review): declared `@unchecked Sendable`, presumably because the
+ /// `AXUIElement` handle does not conform to `Sendable`; the compiler does not
+ /// verify that sharing it across concurrency domains is safe — confirm.
+ struct AXElementRecord: @unchecked Sendable {
+ let element: AXUIElement
+ let semantic: SemanticAXElement
+ }
+
+ /// Relates a screenshot's pixel grid to the screen rectangle it was captured
+ /// from.
+ struct ScreenshotMetadata: Sendable {
+     let imageWidth: Int
+     let imageHeight: Int
+     let capturedBounds: CGRect
+
+     /// Screen units per image pixel horizontally (1 for degenerate input).
+     var scaleX: CGFloat { Self.scale(span: capturedBounds.width, pixels: imageWidth) }
+     /// Screen units per image pixel vertically (1 for degenerate input).
+     var scaleY: CGFloat { Self.scale(span: capturedBounds.height, pixels: imageHeight) }
+
+     /// Divides a captured span by its pixel count.
+     ///
+     /// Falls back to 1 when either side is zero, negative, or non-finite. A
+     /// zero or infinite scale would make `toImage` produce NaN or infinity,
+     /// and callers that convert the result with `Int(...)` would then trap.
+     private static func scale(span: CGFloat, pixels: Int) -> CGFloat {
+         guard pixels > 0, span > 0, span.isFinite else { return 1 }
+         return span / CGFloat(pixels)
+     }
+
+     /// Maps an image-pixel coordinate to a screen coordinate.
+     func toScreen(imageX: Double, imageY: Double) -> CGPoint {
+         CGPoint(
+             x: capturedBounds.origin.x + imageX * scaleX,
+             y: capturedBounds.origin.y + imageY * scaleY
+         )
+     }
+
+     /// Maps a screen coordinate to an image-pixel coordinate.
+     func toImage(point: CGPoint) -> CGPoint {
+         CGPoint(
+             x: (point.x - capturedBounds.origin.x) / scaleX,
+             y: (point.y - capturedBounds.origin.y) / scaleY
+         )
+     }
+
+     /// JSON-compatible form with the image size and integer captured bounds.
+     var dictionary: [String: Any] {
+         [
+             "imageWidth": imageWidth,
+             "imageHeight": imageHeight,
+             "capturedBounds": [
+                 "x": Int(capturedBounds.origin.x),
+                 "y": Int(capturedBounds.origin.y),
+                 "width": Int(capturedBounds.width),
+                 "height": Int(capturedBounds.height),
+             ],
+         ]
+     }
+ }
+
+ /// Identifies the window an action is aimed at.
+ struct WindowTarget: @unchecked Sendable {
+     let appName: String
+     let pid: pid_t
+     let windowNumber: Int?
+     let windowTitle: String?
+     let bounds: CGRect
+     let isFrontmost: Bool
+     let axWindow: AXUIElement?
+
+     /// Midpoint of `bounds`.
+     var center: CGPoint {
+         let rect = bounds
+         return CGPoint(x: rect.midX, y: rect.midY)
+     }
+ }
+
+ /// One captured view of an app window: screenshot bytes, the metadata that
+ /// relates the image to the screen, and the element records found in it.
+ struct AppSnapshot: @unchecked Sendable {
+     let appName: String
+     let pid: pid_t
+     let windowNumber: Int?
+     let windowTitle: String?
+     let screenshotData: Data
+     let screenshotMimeType: String
+     let screenshotMeta: ScreenshotMetadata
+     let records: [AXElementRecord]
+     let strictMode: Bool
+     let backgroundActivated: Bool
+
+     /// The serializable half of every record, in record order.
+     var elements: [SemanticAXElement] {
+         records.map { record in record.semantic }
+     }
+ }
+
+ /// Names the mechanism an action was carried out with; the raw values are the
+ /// strings reported in action metadata.
+ enum ExecutionPath: String {
+     // Raw value defaults to the case name: "accessibility".
+     case accessibility
+     case backgroundCGEvent = "background_cgevent"
+     case foregroundCGEvent = "foreground_cgevent"
+     // Raw value defaults to the case name: "none".
+     case none
+ }
+
+ /// Outcome report attached to every action.
+ struct ActionMetadata: Sendable {
+     let ok: Bool
+     let path: ExecutionPath
+     let strictMode: Bool
+     let backgroundSafe: Bool
+     let fallbackUsed: Bool
+     let message: String
+
+     /// JSON-compatible form; `path` is reported as its raw string value.
+     var dictionary: [String: Any] {
+         var result: [String: Any] = [:]
+         result["ok"] = ok
+         result["path"] = path.rawValue
+         result["strictMode"] = strictMode
+         result["backgroundSafe"] = backgroundSafe
+         result["fallbackUsed"] = fallbackUsed
+         result["message"] = message
+         return result
+     }
+ }
+
+ /// Errors raised by the computer-use runtime, each with a user-facing
+ /// description.
+ enum ComputerUseError: LocalizedError {
+     case accessibilityDenied
+     case screenshotFailed
+     case appNotFound(String)
+     case noFrontmostApplication
+     case noWindow(String)
+     case noSnapshot
+     case invalidElement(String)
+     case strictModeViolation(String)
+     case eventSourceFailed
+     case eventCreationFailed
+     case unknownKey(String)
+
+     /// Human-readable message for the error case.
+     var errorDescription: String? {
+         let text: String
+         switch self {
+         case .accessibilityDenied:
+             text = "Accessibility permission is not granted."
+         case .screenshotFailed:
+             text = "Screenshot capture failed."
+         case let .appNotFound(name):
+             text = "App '\(name)' is not running."
+         case .noFrontmostApplication:
+             text = "No frontmost application found."
+         case let .noWindow(app):
+             text = "No usable window found for \(app)."
+         case .noSnapshot:
+             text = "No snapshot is available. Call snapshot first."
+         case let .invalidElement(ref):
+             text = "Element \(ref) was not found in the last semantic snapshot."
+         case let .strictModeViolation(reason):
+             text = "Strict background mode rejected the action: \(reason)"
+         case .eventSourceFailed:
+             text = "Failed to create CGEventSource."
+         case .eventCreationFailed:
+             text = "Failed to create CGEvent."
+         case let .unknownKey(key):
+             text = "Unknown key: \(key)"
+         }
+         return text
+     }
+ }
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/main.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/main.swift
new file mode 100644
index 0000000000..b2b66077ae
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/main.swift
@@ -0,0 +1,17 @@
+/// HandsFreeComputerUse: semantic AX and background-safe macOS computer use.
+///
+/// The runtime is MCP-independent. This binary exposes it over a small stdio
+/// adapter because existing agent clients already speak MCP.
+
+import Foundation
+
+ // Disable stdout buffering so each response line is written immediately.
+ setbuf(stdout, nil)
+
+ // The only supported invocation is `HandsFreeComputerUse mcp`; anything else
+ // prints usage to stderr and exits with status 1.
+ let args = CommandLine.arguments
+ if let mode = args.dropFirst().first, mode == "mcp" {
+     let server = MCPServer()
+     await server.run()
+ } else {
+     fputs("Usage: HandsFreeComputerUse mcp\n", stderr)
+     exit(1)
+ }
diff --git a/packages/handsfree/package.json b/packages/handsfree/package.json
new file mode 100644
index 0000000000..6c6c281778
--- /dev/null
+++ b/packages/handsfree/package.json
@@ -0,0 +1,23 @@
+{
+ "name": "@openwork/handsfree",
+ "private": true,
+ "version": "0.1.0",
+ "description": "macOS semantic AX and background computer-use runtime for OpenWork",
+ "license": "MIT",
+ "type": "module",
+ "bin": {
+ "openwork-handsfree-computer-use": "bin/openwork-handsfree-computer-use.mjs"
+ },
+ "scripts": {
+ "check": "pnpm run check:js && pnpm run check:native",
+ "check:js": "node --check bin/openwork-handsfree-computer-use.mjs && node --check src/cua-runner.mjs && node --check src/realtime-tools.mjs",
+ "build:native": "swift build --package-path native/HandsFree -c release --product HandsFreeComputerUse",
+ "check:native": "swift build --package-path native/HandsFree --product HandsFreeComputerUse"
+ },
+ "files": [
+ "bin",
+ "src",
+ "native",
+ "README.md"
+ ]
+}
diff --git a/packages/handsfree/src/cua-runner.mjs b/packages/handsfree/src/cua-runner.mjs
new file mode 100644
index 0000000000..7975a12c78
--- /dev/null
+++ b/packages/handsfree/src/cua-runner.mjs
@@ -0,0 +1,129 @@
+ /** Model name sent with each request when the caller does not pass `model`. */
+ export const CUA_DEFAULT_MODEL = "gpt-5.5";
+ /** Maximum number of loop turns when the caller does not pass `maxTurns`. */
+ export const CUA_MAX_TURNS = 30;
+
+export async function runCuaLoop({
+ task,
+ apiKey,
+ callTool,
+ onProgress,
+ signal,
+ model = CUA_DEFAULT_MODEL,
+ maxTurns = CUA_MAX_TURNS,
+}) {
+ if (!apiKey?.trim()) throw new Error("OpenAI API key required for computer use.");
+ if (typeof callTool !== "function") throw new Error("callTool is required.");
+
+ const display = await callTool("display_info", {});
+ const displayInfo = parseToolText(display) ?? { width: 1440, height: 900 };
+ onProgress?.({ kind: "start", width: displayInfo.width, height: displayInfo.height });
+
+ const items = [{ role: "user", content: String(task ?? "") }];
+ const messages = [];
+
+ for (let turn = 0; turn < maxTurns; turn += 1) {
+ if (signal?.aborted) return { ok: true, messages, turns: turn, aborted: true };
+ onProgress?.({ kind: "turn", turn: turn + 1 });
+
+ const response = await fetch("https://api.openai.com/v1/responses", {
+ method: "POST",
+ headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json" },
+ body: JSON.stringify({ model, input: items, tools: [{ type: "computer" }] }),
+ signal,
+ });
+
+ if (!response.ok) {
+ const errorText = await response.text().catch(() => "");
+ throw new Error(`CUA API error ${response.status}: ${errorText.slice(0, 300)}`);
+ }
+
+ const result = await response.json();
+ const output = result.output || [];
+ if (!output.length) throw new Error("No output from CUA model.");
+ items.push(...output);
+
+ let computerCall = null;
+ for (const item of output) {
+ if (item.type === "message") {
+ const text = item.content?.map((part) => part.text || "").join("") || "";
+ if (text) {
+ messages.push(text);
+ onProgress?.({ kind: "message", text });
+ }
+ }
+ if (item.type === "computer_call") computerCall = item;
+ }
+
+ if (!computerCall) return { ok: true, messages, turns: turn + 1 };
+
+ for (const action of computerCall.actions || (computerCall.action ? [computerCall.action] : [])) {
+ if (signal?.aborted) return { ok: true, messages, turns: turn + 1, aborted: true };
+ if (action.type === "screenshot") continue;
+ onProgress?.({ kind: "action", ...summarizeAction(action) });
+ await executeCuaAction(callTool, action);
+ await delay(150);
+ }
+
+ const screenshot = await callTool("cua_screenshot", {});
+ const image = extractImage(screenshot);
+ if (!image) throw new Error("Could not capture screenshot after action.");
+
+ items.push({
+ type: "computer_call_output",
+ call_id: computerCall.call_id,
+ acknowledged_safety_checks: computerCall.pending_safety_checks || [],
+ output: { type: "input_image", image_url: `data:image/png;base64,${image}` },
+ });
+ }
+
+ return { ok: true, messages, turns: maxTurns, truncated: true };
+}
+
+export async function executeCuaAction(callTool, action) {
+ switch (action.type) {
+ case "click":
+ return callTool("cua_click", { x: action.x, y: action.y, button: action.button || "left", ...(action.keys?.length ? { keys: action.keys } : {}) });
+ case "double_click":
+ return callTool("cua_double_click", { x: action.x, y: action.y });
+ case "scroll":
+ return callTool("cua_scroll", { x: action.x, y: action.y, scroll_x: action.scroll_x || 0, scroll_y: action.scroll_y || 0 });
+ case "type":
+ return callTool("cua_type", { text: action.text });
+ case "keypress":
+ return callTool("cua_keypress", { keys: action.keys || [] });
+ case "drag":
+ return callTool("cua_drag", { path: action.path || [] });
+ case "move":
+ return callTool("cua_move", { x: action.x, y: action.y });
+ case "wait":
+ return callTool("cua_wait", {});
+ default:
+ return null;
+ }
+}
+
+function parseToolText(response) {
+ const text = response?.result?.content?.find?.((item) => item.type === "text")?.text
+ ?? response?.content?.find?.((item) => item.type === "text")?.text;
+ if (!text) return null;
+ try { return JSON.parse(text); } catch { return null; }
+}
+
+function extractImage(response) {
+ return response?.result?.content?.find?.((item) => item.type === "image" && item.data)?.data
+ ?? response?.content?.find?.((item) => item.type === "image" && item.data)?.data
+ ?? null;
+}
+
+function summarizeAction(action) {
+ return {
+ type: action.type,
+ x: action.x,
+ y: action.y,
+ text: action.text?.slice?.(0, 60),
+ desc: `${action.type}${action.x != null ? ` (${action.x},${action.y})` : ""}${action.text ? ` "${action.text.slice(0, 30)}"` : ""}`,
+ };
+}
+
+function delay(ms) {
+ return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/packages/handsfree/src/realtime-tools.mjs b/packages/handsfree/src/realtime-tools.mjs
new file mode 100644
index 0000000000..317e733469
--- /dev/null
+++ b/packages/handsfree/src/realtime-tools.mjs
@@ -0,0 +1,85 @@
+ /** Default model name for HandsFree realtime sessions. */
+ export const HANDSFREE_DEFAULT_MODEL = "gpt-realtime-2";
+ /** Default reasoning effort; one of `HANDSFREE_REASONING_EFFORTS`. */
+ export const HANDSFREE_DEFAULT_REASONING_EFFORT = "low";
+ /** Reasoning effort values listed for HandsFree, from lowest to highest. */
+ export const HANDSFREE_REASONING_EFFORTS = ["minimal", "low", "medium", "high", "xhigh"];
+
+ /**
+  * Instruction text for the HandsFree voice assistant. It covers role, tone,
+  * tool selection, and safety rules, and names tools defined in
+  * `openAIRealtimeTools()` such as `use_computer` and `stop_computer`.
+  */
+ export const HANDSFREE_REALTIME_INSTRUCTIONS = `# Role and Objective
+
+ You are HandsFree, a macOS computer-control voice assistant. You control the user's Mac through tools. You respond with voice. You cannot see the screen yourself.
+
+ # Personality and Tone
+
+ Be concise, calm, and direct. Do not over-explain. Act, then report the result.
+
+ # Tool Selection
+
+ - Direct typing, keypresses, app launch, clipboard, URLs, and grid clicks are instant tools.
+ - Visual or multi-step UI work must use use_computer.
+ - Stop/cancel requests must call stop_computer immediately.
+ - For MCP servers, list tools before calling unfamiliar tool names.
+
+ # Safety
+
+ - Type exactly what the user asks; do not paraphrase typed text.
+ - Do not use destructive shortcuts unless explicitly requested.
+ - For actions that send messages or modify data, confirm content briefly before executing.`;
+
+ /**
+  * Returns the function-tool definitions offered to the realtime model.
+  *
+  * Every entry is built with `functionTool`, so each has `type: "function"`
+  * and an object-typed `parameters` schema with `additionalProperties: false`.
+  * Tools declared without properties take no arguments.
+  *
+  * @returns {Array<object>} A freshly built array on every call.
+  */
+ export function openAIRealtimeTools() {
+ return [
+ functionTool("use_computer", "Control the Mac to complete a visual or UI task using screenshots and native input.", {
+ task: { type: "string", description: "Plain-language task to complete on the computer." },
+ }, ["task"]),
+ functionTool("type_text", "Type exact text into the focused input field.", {
+ text: { type: "string", description: "Exact text to type." },
+ }, ["text"]),
+ functionTool("press_key", "Press a key combo such as return, tab, escape, command+k, or command+shift+a.", {
+ combo: { type: "string", description: "Key combo string." },
+ }, ["combo"]),
+ functionTool("launch_app", "Launch a macOS app by name.", {
+ name: { type: "string", description: "App name." },
+ }, ["name"]),
+ functionTool("activate_app", "Bring a running macOS app to the foreground.", {
+ name: { type: "string", description: "App name." },
+ }, ["name"]),
+ functionTool("list_apps", "List running macOS applications."),
+ functionTool("clipboard_read", "Read the macOS clipboard as text."),
+ functionTool("clipboard_write", "Write text to the macOS clipboard.", {
+ text: { type: "string", description: "Text to copy." },
+ }, ["text"]),
+ functionTool("open_url", "Open a URL in a browser.", {
+ url: { type: "string", description: "URL to open." },
+ app: { type: "string", description: "Optional browser app name." },
+ }, ["url"]),
+ functionTool("mcp_list_servers", "List connected MCP servers."),
+ functionTool("mcp_list_tools", "List tools on a connected MCP server before calling unfamiliar tools.", {
+ serverName: { type: "string", description: "MCP server name." },
+ }, ["serverName"]),
+ // `args` is the one nested schema that allows arbitrary extra properties.
+ functionTool("mcp_call_tool", "Call a tool on a connected MCP server.", {
+ serverName: { type: "string", description: "MCP server name." },
+ toolName: { type: "string", description: "Tool name." },
+ args: { type: "object", description: "Tool arguments.", additionalProperties: true },
+ }, ["serverName", "toolName"]),
+ functionTool("show_grid", "Show a subtle A1-F4 screen grid overlay."),
+ functionTool("hide_grid", "Hide the screen grid overlay."),
+ functionTool("click_grid", "Click the center of a grid zone such as C2.", {
+ zone: { type: "string", description: "Grid zone label." },
+ }, ["zone"]),
+ functionTool("stop_computer", "Stop the current computer-use task."),
+ functionTool("request_permission", "Open System Settings for a macOS permission pane.", {
+ pane: { type: "string", description: "accessibility, screen-recording, or microphone." },
+ }, ["pane"]),
+ ];
+ }
+
+function functionTool(name, description, properties = {}, required = []) {
+ return {
+ type: "function",
+ name,
+ description,
+ parameters: {
+ type: "object",
+ properties,
+ required,
+ additionalProperties: false,
+ },
+ };
+}