diff --git a/apps/app/src/app/constants.ts b/apps/app/src/app/constants.ts index fa52e09c0b..fc64403092 100644 --- a/apps/app/src/app/constants.ts +++ b/apps/app/src/app/constants.ts @@ -156,6 +156,18 @@ export const MCP_QUICK_CONNECT: McpDirectoryInfo[] = [ composerPrompt: "Use the Chrome extension to ", defaultEnabled: true, }, + { + id: "handsfree-computer-use", + name: "HandsFree Computer Use", + serverName: "handsfree-computer-use", + description: "Control macOS apps through semantic accessibility refs, background-safe clicks, screenshots, keyboard input, and strict mode.", + type: "local", + command: ["npx", "-y", "@openwork/handsfree", "mcp"], + oauth: false, + kind: "extension", + iconSrc: "/openwork-mark.svg", + composerPrompt: "Use HandsFree Computer Use to ", + }, { id: "openai-image-gen", name: "OpenAI Image Gen", diff --git a/apps/app/src/app/lib/desktop.ts b/apps/app/src/app/lib/desktop.ts index 91ab00a714..10eeac56e3 100644 --- a/apps/app/src/app/lib/desktop.ts +++ b/apps/app/src/app/lib/desktop.ts @@ -157,6 +157,7 @@ declare global { initialDeepLinks?: string[]; platform?: "darwin" | "linux" | "windows"; version?: string; + browserCdpPort?: string; }; }; } diff --git a/apps/app/src/react-app/design-system/extension-card.tsx b/apps/app/src/react-app/design-system/extension-card.tsx index 92e1b642ba..d9486e58d6 100644 --- a/apps/app/src/react-app/design-system/extension-card.tsx +++ b/apps/app/src/react-app/design-system/extension-card.tsx @@ -108,15 +108,11 @@ export function ExtensionCard(props: ExtensionCardProps) {

{name}

- {connected ? ( - - Connected - - ) : ( + {!connected ? ( {kindLabel[kind]} - )} + ) : null}

{description}

{!connected && !connecting && actionLabel ? ( diff --git a/apps/app/src/react-app/design-system/extension-detail-modal.tsx b/apps/app/src/react-app/design-system/extension-detail-modal.tsx index f6823dbf7f..d894a26da9 100644 --- a/apps/app/src/react-app/design-system/extension-detail-modal.tsx +++ b/apps/app/src/react-app/design-system/extension-detail-modal.tsx @@ -255,10 +255,10 @@ export function ExtensionDetailModal(props: ExtensionDetailModalProps) {
) : null} - {kind === "ui-control" ? ( + {launchCommand ? (
Launch - {(launchCommand ?? fallbackUiControlCommand).join(" ")} + {launchCommand.join(" ")}
) : null} diff --git a/apps/app/src/react-app/domains/connections/store.ts b/apps/app/src/react-app/domains/connections/store.ts index f2899bd139..ba4b3f71a7 100644 --- a/apps/app/src/react-app/domains/connections/store.ts +++ b/apps/app/src/react-app/domains/connections/store.ts @@ -273,6 +273,16 @@ export function createConnectionsStore(options: { }; const resolveLocalMcpCommand = async (entry: McpDirectoryInfo) => { + if (entry.serverName === "handsfree-computer-use") { + try { + const command = await (window as any).__OPENWORK_ELECTRON__?.invokeDesktop?.("getHandsFreeMcpCommand"); + if (Array.isArray(command) && command.every((part) => typeof part === "string") && command.length > 0) { + return command; + } + } catch { + // Fall through to the published package command. + } + } if (entry.serverName !== "openwork-ui") { return entry.command; } diff --git a/apps/app/src/react-app/domains/session/surface/composer/composer.tsx b/apps/app/src/react-app/domains/session/surface/composer/composer.tsx index 51aa3ac046..6d790bb958 100644 --- a/apps/app/src/react-app/domains/session/surface/composer/composer.tsx +++ b/apps/app/src/react-app/domains/session/surface/composer/composer.tsx @@ -678,7 +678,16 @@ export function ReactSessionComposer(props: ComposerProps) { }; const applyExtensionSelection = (entry: McpDirectoryInfo) => { - props.onDraftChange(entry.composerPrompt ?? `Use ${entry.name} to `); + if (entry.id === "openwork-browser") { + const port = window.__OPENWORK_ELECTRON__?.meta?.browserCdpPort?.trim(); + props.onDraftChange( + port + ? `Use the OpenWork Browser extension with browser_url "http://127.0.0.1:${port}". Do not use any other browser_url. ` + : entry.composerPrompt ?? `Use ${entry.name} to `, + ); + } else { + props.onDraftChange(entry.composerPrompt ?? 
`Use ${entry.name} to `); + } setToolMenuOpen(false); }; diff --git a/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx b/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx index f4c5e59a83..3e479e2572 100644 --- a/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx +++ b/apps/app/src/react-app/domains/settings/pages/mcp-view.tsx @@ -196,6 +196,7 @@ export function McpView(props: McpViewProps) { const [detailSkillContent, setDetailSkillContent] = useState(null); const [openworkUiMcpCommand, setOpenworkUiMcpCommand] = useState(null); const [openworkUiMcpEnvironment, setOpenworkUiMcpEnvironment] = useState | null>(null); + const [handsFreeMcpCommand, setHandsFreeMcpCommand] = useState(null); const [search, setSearch] = useState(""); const [filter, setFilter] = useState("all"); const [, setExtensionStateVersion] = useState(0); @@ -266,9 +267,14 @@ export function McpView(props: McpViewProps) { ), )); } + const handsFreeCommand = await (window as any).__OPENWORK_ELECTRON__?.invokeDesktop?.("getHandsFreeMcpCommand"); + if (Array.isArray(handsFreeCommand) && handsFreeCommand.every((part) => typeof part === "string")) { + setHandsFreeMcpCommand(handsFreeCommand); + } } catch { setOpenworkUiMcpCommand(null); setOpenworkUiMcpEnvironment(null); + setHandsFreeMcpCommand(null); } })(); }, []); @@ -341,6 +347,15 @@ export function McpView(props: McpViewProps) { const isQuickConnectConfigured = (entry: McpDirectoryInfo) => props.mcpServers.some((server) => server.name === getMcpIdentityKey(entry)); + const isMcpBackedExtension = (entry: McpDirectoryInfo) => + entry.kind === "extension" && Boolean(entry.type || entry.command?.length || entry.url); + + const launchCommandForEntry = (entry: McpDirectoryInfo) => { + if (entry.serverName === "openwork-ui") return openworkUiMcpCommand ?? undefined; + if (entry.serverName === "handsfree-computer-use") return handsFreeMcpCommand ?? 
entry.command; + return entry.command; + }; + const supportsOauth = (entry: McpServerEntry) => entry.config.type === "remote" && entry.config.oauth !== false; @@ -468,7 +483,7 @@ export function McpView(props: McpViewProps) { busy={props.busy} connectingName={props.mcpConnectingName} isConfigured={(entry) => - entry.kind === "extension" + entry.kind === "extension" && !isMcpBackedExtension(entry) ? (entry.defaultEnabled ? isOpenWorkExtensionEnabled(entry) : props.isExtensionConnected?.(entry) ?? false) : isQuickConnectConfigured(entry) } @@ -575,7 +590,7 @@ export function McpView(props: McpViewProps) { {detailEntry ? (() => { const extensionConfigSlot = props.configSlotForEntry?.(detailEntry) ?? null; const hasConfigSlot = extensionConfigSlot !== null; - const isConnected = detailEntry.kind === "extension" + const isConnected = detailEntry.kind === "extension" && !isMcpBackedExtension(detailEntry) ? (detailEntry.defaultEnabled ? isOpenWorkExtensionEnabled(detailEntry) : props.isExtensionConnected?.(detailEntry) ?? false) : isQuickConnectConfigured(detailEntry); return ( @@ -590,19 +605,19 @@ export function McpView(props: McpViewProps) { kind={detailEntry.kind ?? "mcp"} connected={isConnected} connecting={props.mcpConnectingName === detailEntry.name} - launchCommand={detailEntry.serverName === "openwork-ui" ? openworkUiMcpCommand ?? undefined : undefined} + launchCommand={launchCommandForEntry(detailEntry)} environment={detailEntry.serverName === "openwork-ui" ? openworkUiMcpEnvironment ?? undefined : undefined} url={typeof detailEntry.url === "string" ? detailEntry.url : undefined} oauth={detailEntry.oauth} configSlot={extensionConfigSlot} - onConnect={detailEntry.defaultEnabled ? () => { + onConnect={detailEntry.defaultEnabled && !isMcpBackedExtension(detailEntry) ? () => { setOpenWorkExtensionEnabled(detailEntry, true); setDetailEntry(null); } : hasConfigSlot ? 
undefined : () => { props.connectMcp(detailEntry); setDetailEntry(null); }} - onUninstall={detailEntry.defaultEnabled && isConnected ? () => { + onUninstall={detailEntry.defaultEnabled && !isMcpBackedExtension(detailEntry) && isConnected ? () => { setOpenWorkExtensionEnabled(detailEntry, false); } : isQuickConnectConfigured(detailEntry) ? () => { const slug = getMcpIdentityKey(detailEntry); diff --git a/apps/app/src/react-app/domains/settings/shell/settings-page.tsx b/apps/app/src/react-app/domains/settings/shell/settings-page.tsx index 05246742ec..4780b4aa66 100644 --- a/apps/app/src/react-app/domains/settings/shell/settings-page.tsx +++ b/apps/app/src/react-app/domains/settings/shell/settings-page.tsx @@ -3,7 +3,6 @@ import type * as React from "react"; import { ArrowLeft, Bug, - ChevronDown, CloudCog, Cog, Container, @@ -33,12 +32,6 @@ import { SidebarMenuButton, SidebarMenuItem, } from "@/components/ui/sidebar"; -import { - DropdownMenu, - DropdownMenuContent, - DropdownMenuItem, - DropdownMenuTrigger, -} from "@/components/ui/dropdown-menu"; import { t } from "../../../../i18n"; import type { SettingsTab } from "../../../../app/types"; import { @@ -53,7 +46,6 @@ import { SettingsPanelToolbarMessage, SettingsPanelToolbarStatus, } from "./panel"; -import { WorkspaceIcon } from "../../../design-system/workspace-icon"; export function getSettingsTabIcon(tab: SettingsTab) { switch (tab) { @@ -234,31 +226,6 @@ export function SettingsSidebar(props: SettingsSidebarProps) { {t("dashboard.back_to_app")} - - - - - {props.selectedWorkspaceName} - - - } - /> - - {props.workspaces.map((workspace) => ( - props.onSelectWorkspace(workspace.id)} - disabled={workspace.id === props.selectedWorkspaceId} - > - - {workspace.name} - - ))} - - - diff --git a/apps/desktop/electron/main.mjs b/apps/desktop/electron/main.mjs index 5288287cd2..7fba502019 100644 --- a/apps/desktop/electron/main.mjs +++ b/apps/desktop/electron/main.mjs @@ -226,15 +226,15 @@ if (process.platform === "darwin" 
&& APP_ICON_IMAGE && !APP_ICON_IMAGE.isEmpty() // Expose Chrome DevTools Protocol so the opencode-chrome-devtools plugin can // drive the built-in browser panel. Use OPENWORK_ELECTRON_REMOTE_DEBUG_PORT to -// pin a specific port; otherwise pick a default (9223) that stays out of the -// way of common dev-tools ports (9222 = Chrome, 9229 = Node inspector). +// pin a specific port. Prod defaults to 9223; dev defaults to 9823 so both +// apps can run side by side without the dev browser tools attaching to prod. const explicitCdpPort = Number.parseInt( process.env.OPENWORK_ELECTRON_REMOTE_DEBUG_PORT?.trim() ?? "", 10, ); const remoteDebugPort = Number.isFinite(explicitCdpPort) && explicitCdpPort > 0 ? explicitCdpPort - : 9223; + : isDevMode ? 9823 : 9223; app.commandLine.appendSwitch("remote-debugging-port", String(remoteDebugPort)); app.commandLine.appendSwitch("remote-debugging-address", "127.0.0.1"); // Make the port available to the embedded server so it can pass it to OpenCode. @@ -2362,6 +2362,12 @@ async function handleDesktopInvoke(event, command, ...args) { } return ["npx", "-y", "openwork-ui-mcp"]; } + case "getHandsFreeMcpCommand": { + if (process.env.OPENWORK_DEV_MODE === "1") { + return ["node", path.resolve(__dirname, "../../..", "packages/handsfree/bin/openwork-handsfree-computer-use.mjs"), "mcp"]; + } + return ["npx", "-y", "@openwork/handsfree", "mcp"]; + } case "getOpenworkUiMcpEnvironment": { return { OPENWORK_UI_CONTROL_DISCOVERY: path.join(app.getPath("userData"), "openwork-ui-control.json"), diff --git a/apps/desktop/electron/preload.mjs b/apps/desktop/electron/preload.mjs index c8a50f7c4e..9c1c31beb3 100644 --- a/apps/desktop/electron/preload.mjs +++ b/apps/desktop/electron/preload.mjs @@ -133,6 +133,7 @@ contextBridge.exposeInMainWorld("__OPENWORK_ELECTRON__", { initialDeepLinks: [], platform: normalizePlatform(process.platform), version: process.versions.electron, + browserCdpPort: process.env.OPENWORK_ELECTRON_REMOTE_DEBUG_PORT || undefined, 
}, }); diff --git a/apps/server/src/workspace-init.ts b/apps/server/src/workspace-init.ts index 7e5963adf8..b37c8b8a55 100644 --- a/apps/server/src/workspace-init.ts +++ b/apps/server/src/workspace-init.ts @@ -47,6 +47,7 @@ Browser tools (\`browser_navigate\`, \`browser_snapshot\`, \`browser_click\`, \` - \`browser_url\`: always use \`"http://127.0.0.1:{{BROWSER_CDP_PORT}}"\`. - Use for general browsing tasks. The user sees what you do in real time. - Always call \`browser_list\` first to discover available targets, then use the appropriate \`target_id\`. +- Do not scan common CDP ports or fall back to another port. If this endpoint is unavailable, report that the built-in browser is unavailable. **Chrome (external browser)**: - Use when the user needs their real cookies, sign-ins, or extensions. diff --git a/packages/handsfree/README.md b/packages/handsfree/README.md new file mode 100644 index 0000000000..aaf7ef90b6 --- /dev/null +++ b/packages/handsfree/README.md @@ -0,0 +1,26 @@ +# OpenWork HandsFree Computer Use + +Native macOS computer-use runtime imported from the HandsFree prototype. + +This package focuses on the reusable control layer: + +- Semantic AX snapshots with compact refs like `{e1}`. +- Strict background mode that avoids foreground cursor/HID fallbacks. +- Target-window screenshots via `CGWindowListCreateImage(.optionIncludingWindow)`. +- Background input through `CGEvent.postToPid` with window-addressing fields. +- Background activation using per-process event taps plus AppKit and center-click primers. +- Non-UI orchestration modules from the original Electron prototype: realtime tool schemas/instructions and the GPT computer-use loop. + +Build the native stdio server: + +```bash +pnpm --filter @openwork/handsfree check:native +``` + +Run it as an MCP-compatible adapter: + +```bash +pnpm --filter @openwork/handsfree exec openwork-handsfree-computer-use mcp +``` + +The core runtime is intentionally MCP-independent. 
`ComputerUseRuntime` exposes a small direct surface (`snapshot`, `click`, `typeText`, `pressKey`, `scroll`, `wait`, `setValue`, `performAction`); `MCPServer` is only a thin stdio wrapper. diff --git a/packages/handsfree/bin/openwork-handsfree-computer-use.mjs b/packages/handsfree/bin/openwork-handsfree-computer-use.mjs new file mode 100755 index 0000000000..c99fc9a340 --- /dev/null +++ b/packages/handsfree/bin/openwork-handsfree-computer-use.mjs @@ -0,0 +1,41 @@ +#!/usr/bin/env node + +import { spawn } from "node:child_process"; +import { existsSync } from "node:fs"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const packageRoot = path.resolve(__dirname, ".."); +const swiftPackagePath = path.join(packageRoot, "native", "HandsFree"); + +const explicitBinary = process.env.HANDSFREE_COMPUTER_USE_BINARY?.trim(); +const candidates = [ + explicitBinary, + path.join(swiftPackagePath, ".build", "release", "HandsFreeComputerUse"), + path.join(swiftPackagePath, ".build", "arm64-apple-macosx", "release", "HandsFreeComputerUse"), + path.join(swiftPackagePath, ".build", "debug", "HandsFreeComputerUse"), + path.join(swiftPackagePath, ".build", "arm64-apple-macosx", "debug", "HandsFreeComputerUse"), +].filter(Boolean); + +const args = process.argv.slice(2); +const binary = candidates.find((candidate) => existsSync(candidate)); +const command = binary ?? "swift"; +const commandArgs = binary + ? args + : ["run", "--package-path", swiftPackagePath, "HandsFreeComputerUse", ...args]; + +const child = spawn(command, commandArgs, { + stdio: "inherit", + env: process.env, +}); + +child.on("exit", (code, signal) => { + if (signal) process.kill(process.pid, signal); + process.exit(code ?? 
0); +}); + +child.on("error", (error) => { + console.error(`Failed to start HandsFreeComputerUse: ${error.message}`); + process.exit(1); +}); diff --git a/packages/handsfree/native/HandsFree/.gitignore b/packages/handsfree/native/HandsFree/.gitignore new file mode 100644 index 0000000000..2d9f16e2d2 --- /dev/null +++ b/packages/handsfree/native/HandsFree/.gitignore @@ -0,0 +1,2 @@ +.build/ +.swiftpm/ diff --git a/packages/handsfree/native/HandsFree/Package.swift b/packages/handsfree/native/HandsFree/Package.swift new file mode 100644 index 0000000000..0d507c4598 --- /dev/null +++ b/packages/handsfree/native/HandsFree/Package.swift @@ -0,0 +1,13 @@ +// swift-tools-version: 5.9 +import PackageDescription + +let package = Package( + name: "HandsFree", + platforms: [.macOS(.v14)], + targets: [ + .executableTarget( + name: "HandsFreeComputerUse", + path: "Sources/ComputerUse" + ), + ] +) diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/AccessibilityService.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/AccessibilityService.swift new file mode 100644 index 0000000000..b3d3848fb9 --- /dev/null +++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/AccessibilityService.swift @@ -0,0 +1,373 @@ +import AppKit +import ApplicationServices + +final class AccessibilityService: @unchecked Sendable { + private let screenshotImageWidth: CGFloat = 768 + private let maxElements = 250 + private let maxDepth = 22 + + private let importantRoles: Set = [ + "AXButton", "AXCheckBox", "AXRadioButton", "AXPopUpButton", "AXMenuButton", + "AXComboBox", "AXTextField", "AXTextArea", "AXSearchField", "AXLink", + "AXSlider", "AXIncrementor", "AXScrollArea", "AXScrollBar", "AXTabGroup", + "AXTab", "AXMenuItem", "AXCell", "AXRow", "AXStaticText", "AXImage", + "AXOutline", "AXTable", "AXList", "AXGroup", + ] + + func resolveTarget(appName: String?) 
throws -> WindowTarget { + guard AXIsProcessTrusted() else { throw ComputerUseError.accessibilityDenied } + + let app = try resolveApp(appName: appName) + let pid = app.processIdentifier + let axApp = AXUIElementCreateApplication(pid) + let axWindow = firstAXWindow(axApp: axApp) + let title = axWindow.flatMap { axString($0, kAXTitleAttribute) } + let info = firstCGWindowInfo(pid: pid, title: title) + let bounds = axWindow.flatMap(axFrame) ?? info?.bounds + + guard let bounds, bounds.width > 20, bounds.height > 20 else { + throw ComputerUseError.noWindow(app.localizedName ?? appName ?? "frontmost app") + } + + return WindowTarget( + appName: app.localizedName ?? "Unknown", + pid: pid, + windowNumber: info?.number, + windowTitle: title ?? info?.title, + bounds: bounds, + isFrontmost: NSWorkspace.shared.frontmostApplication?.processIdentifier == pid, + axWindow: axWindow + ) + } + + func snapshot(target: WindowTarget, strictMode: Bool, backgroundActivated: Bool) async throws -> AppSnapshot { + let records = target.axWindow.map(semanticRecords(window:)) ?? 
[] + let (data, meta) = try captureScreenshot(target: target) + + return AppSnapshot( + appName: target.appName, + pid: target.pid, + windowNumber: target.windowNumber, + windowTitle: target.windowTitle, + screenshotData: data, + screenshotMimeType: "image/jpeg", + screenshotMeta: meta, + records: records, + strictMode: strictMode, + backgroundActivated: backgroundActivated + ) + } + + func press(record: AXElementRecord) -> Bool { + AXUIElementPerformAction(record.element, kAXPressAction as CFString) == .success + } + + func focus(record: AXElementRecord) -> Bool { + AXUIElementSetAttributeValue(record.element, kAXFocusedAttribute as CFString, true as CFBoolean) == .success + } + + func setValue(record: AXElementRecord, value: String) -> Bool { + var settable = DarwinBoolean(false) + guard AXUIElementIsAttributeSettable(record.element, kAXValueAttribute as CFString, &settable) == .success, + settable.boolValue else { + return false + } + return AXUIElementSetAttributeValue(record.element, kAXValueAttribute as CFString, value as CFString) == .success + } + + func performAction(record: AXElementRecord, action: String) -> Bool { + AXUIElementPerformAction(record.element, action as CFString) == .success + } + + private func resolveApp(appName: String?) 
throws -> NSRunningApplication { + guard let rawName = appName?.trimmingCharacters(in: .whitespacesAndNewlines), !rawName.isEmpty else { + guard let frontmost = NSWorkspace.shared.frontmostApplication else { + throw ComputerUseError.noFrontmostApplication + } + return frontmost + } + + let needle = rawName.lowercased() + let regularApps = NSWorkspace.shared.runningApplications.filter { $0.activationPolicy == .regular } + if let exact = regularApps.first(where: { $0.localizedName?.lowercased() == needle }) { + return exact + } + if let contains = regularApps.first(where: { $0.localizedName?.lowercased().contains(needle) == true }) { + return contains + } + throw ComputerUseError.appNotFound(rawName) + } + + private func firstAXWindow(axApp: AXUIElement) -> AXUIElement? { + var windowValue: CFTypeRef? + guard AXUIElementCopyAttributeValue(axApp, kAXWindowsAttribute as CFString, &windowValue) == .success, + let windows = windowValue as? [AXUIElement] else { + return nil + } + return windows.first { window in + guard let frame = axFrame(window) else { return false } + return frame.width > 20 && frame.height > 20 + } + } + + private func semanticRecords(window: AXUIElement) -> [AXElementRecord] { + var records: [AXElementRecord] = [] + collect(element: window, depth: 0, records: &records) + return records + } + + private func collect(element: AXUIElement, depth: Int, records: inout [AXElementRecord]) { + guard depth <= maxDepth, records.count < maxElements else { return } + + let rawRole = axString(element, kAXRoleAttribute) ?? 
"AXUnknown" + let role = normalizedRole(rawRole) + let value = axString(element, kAXValueAttribute).map { String($0.prefix(120)) } + let label = semanticLabel(element: element, role: role, value: value) + let actions = axActions(element) + let frame = axFrame(element) + let capabilities = capabilitiesFor(element: element, rawRole: rawRole, actions: actions) + let shouldSurface = shouldSurfaceElement(rawRole: rawRole, label: label, value: value, frame: frame, capabilities: capabilities) + + if shouldSurface, let frame { + let id = records.count + 1 + let semantic = SemanticAXElement( + id: id, + ref: "{e\(id)}", + role: role, + label: label, + value: value, + frame: ElementFrame( + x: Int(frame.origin.x), + y: Int(frame.origin.y), + width: Int(frame.width), + height: Int(frame.height) + ), + state: stateFor(element: element, rawRole: rawRole), + capabilities: capabilities + ) + records.append(AXElementRecord(element: element, semantic: semantic)) + } + + for child in axChildren(element) { + collect(element: child, depth: depth + 1, records: &records) + if records.count >= maxElements { break } + } + } + + private func shouldSurfaceElement(rawRole: String, label: String, value: String?, frame: CGRect?, capabilities: AXElementCapabilities) -> Bool { + guard let frame, frame.width > 1, frame.height > 1 else { return false } + let hasSemanticText = !label.isEmpty || value?.isEmpty == false + let interactive = capabilities.canPress || capabilities.canFocus || capabilities.canScroll || capabilities.canAdjust || capabilities.canSetValue + if interactive { return true } + if !importantRoles.contains(rawRole) { return false } + if rawRole == "AXGroup" { return hasSemanticText && frame.width < 900 && frame.height < 700 } + return hasSemanticText + } + + private func semanticLabel(element: AXUIElement, role: String, value: String?) 
-> String { + let candidates = [ + axString(element, kAXTitleAttribute), + axString(element, kAXDescriptionAttribute), + axString(element, kAXHelpAttribute), + axString(element, kAXIdentifierAttribute), + value, + ] + for candidate in candidates { + if let trimmed = candidate?.trimmingCharacters(in: .whitespacesAndNewlines), !trimmed.isEmpty { + return String(trimmed.prefix(120)) + } + } + return role + } + + private func capabilitiesFor(element: AXUIElement, rawRole: String, actions: [String]) -> AXElementCapabilities { + var valueSettable = DarwinBoolean(false) + let canSetValue = AXUIElementIsAttributeSettable(element, kAXValueAttribute as CFString, &valueSettable) == .success && valueSettable.boolValue + + var focusSettable = DarwinBoolean(false) + let canFocus = AXUIElementIsAttributeSettable(element, kAXFocusedAttribute as CFString, &focusSettable) == .success && focusSettable.boolValue + + let canAdjust = actions.contains(kAXIncrementAction) || actions.contains(kAXDecrementAction) || rawRole == "AXSlider" || rawRole == "AXIncrementor" + let canScroll = actions.contains("AXScrollToVisible") || rawRole == "AXScrollArea" || rawRole == "AXScrollBar" + let canPress = actions.contains(kAXPressAction) || ["AXButton", "AXCheckBox", "AXRadioButton", "AXLink", "AXMenuItem", "AXPopUpButton", "AXMenuButton", "AXCell"].contains(rawRole) + + return AXElementCapabilities( + canPress: canPress, + canFocus: canFocus, + canScroll: canScroll, + canAdjust: canAdjust, + canSetValue: canSetValue, + actions: actions + ) + } + + private func stateFor(element: AXUIElement, rawRole: String) -> AXElementState { + AXElementState( + enabled: axBool(element, kAXEnabledAttribute), + focused: axBool(element, kAXFocusedAttribute), + selected: axBool(element, kAXSelectedAttribute), + expanded: axBool(element, kAXExpandedAttribute), + checked: rawRole == "AXCheckBox" || rawRole == "AXRadioButton" ? 
axBool(element, kAXValueAttribute) : nil + ) + } + + private func axChildren(_ element: AXUIElement) -> [AXUIElement] { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success, + let children = value as? [AXUIElement] else { + return [] + } + return children + } + + private func axActions(_ element: AXUIElement) -> [String] { + var actionNames: CFArray? + guard AXUIElementCopyActionNames(element, &actionNames) == .success, + let names = actionNames as? [String] else { + return [] + } + return names + } + + private func axString(_ element: AXUIElement, _ attribute: String) -> String? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let value else { + return nil + } + if let string = value as? String, !string.isEmpty { return string } + if let attributed = value as? NSAttributedString, !attributed.string.isEmpty { return attributed.string } + if let number = value as? NSNumber { return number.stringValue } + return nil + } + + private func axBool(_ element: AXUIElement, _ attribute: String) -> Bool? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let value else { + return nil + } + if let bool = value as? Bool { return bool } + if let number = value as? NSNumber { return number.boolValue } + return nil + } + + private func axFrame(_ element: AXUIElement) -> CGRect? { + guard let position = axPoint(element, kAXPositionAttribute), let size = axSize(element, kAXSizeAttribute) else { + return nil + } + return CGRect(origin: position, size: size) + } + + private func axPoint(_ element: AXUIElement, _ attribute: String) -> CGPoint? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let value, + CFGetTypeID(value) == AXValueGetTypeID() else { + return nil + } + let axValue = value as! 
AXValue + guard AXValueGetType(axValue) == .cgPoint else { return nil } + var point = CGPoint.zero + guard AXValueGetValue(axValue, .cgPoint, &point) else { return nil } + return point + } + + private func axSize(_ element: AXUIElement, _ attribute: String) -> CGSize? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let value, + CFGetTypeID(value) == AXValueGetTypeID() else { + return nil + } + let axValue = value as! AXValue + guard AXValueGetType(axValue) == .cgSize else { return nil } + var size = CGSize.zero + guard AXValueGetValue(axValue, .cgSize, &size) else { return nil } + return size + } + + private func normalizedRole(_ rawRole: String) -> String { + rawRole.hasPrefix("AX") ? String(rawRole.dropFirst(2)) : rawRole + } + + private func firstCGWindowInfo(pid: pid_t, title: String?) -> (number: Int, title: String?, bounds: CGRect)? { + guard let list = CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID) as? [[String: Any]] else { + return nil + } + + let candidates = list.compactMap { info -> (number: Int, title: String?, bounds: CGRect)? in + guard let ownerPID = info[kCGWindowOwnerPID as String] as? Int32, + ownerPID == pid, + let layer = info[kCGWindowLayer as String] as? Int, + layer == 0, + let number = info[kCGWindowNumber as String] as? Int, + let boundsDict = info[kCGWindowBounds as String] as? [String: Any] else { + return nil + } + let bounds = CGRect( + x: cgFloat(boundsDict["X"]), + y: cgFloat(boundsDict["Y"]), + width: cgFloat(boundsDict["Width"]), + height: cgFloat(boundsDict["Height"]) + ) + guard bounds.width > 20, bounds.height > 20 else { return nil } + return (number, info[kCGWindowName as String] as? 
String, bounds) + } + + if let title, let exact = candidates.first(where: { $0.title == title }) { + return exact + } + return candidates.first + } + + private func captureScreenshot(target: WindowTarget) throws -> (Data, ScreenshotMetadata) { + let cgImage: CGImage? + if let windowNumber = target.windowNumber { + cgImage = CGWindowListCreateImage( + CGRect.null, + .optionIncludingWindow, + CGWindowID(windowNumber), + [.bestResolution, .boundsIgnoreFraming] + ) + } else { + cgImage = CGWindowListCreateImage(target.bounds, .optionOnScreenOnly, kCGNullWindowID, [.bestResolution]) + } + + guard let cgImage else { throw ComputerUseError.screenshotFailed } + + let rawWidth = CGFloat(cgImage.width) + let rawHeight = CGFloat(cgImage.height) + let targetWidth = min(screenshotImageWidth, rawWidth) + let targetHeight = rawHeight * (targetWidth / rawWidth) + + let source = NSImage(cgImage: cgImage, size: NSSize(width: rawWidth, height: rawHeight)) + let resized = NSImage(size: NSSize(width: targetWidth, height: targetHeight)) + resized.lockFocus() + source.draw(in: NSRect(x: 0, y: 0, width: targetWidth, height: targetHeight)) + resized.unlockFocus() + + guard let tiff = resized.tiffRepresentation, + let rep = NSBitmapImageRep(data: tiff), + let jpeg = rep.representation(using: .jpeg, properties: [.compressionFactor: 0.45]) else { + throw ComputerUseError.screenshotFailed + } + + return ( + jpeg, + ScreenshotMetadata( + imageWidth: Int(targetWidth), + imageHeight: Int(targetHeight), + capturedBounds: target.bounds + ) + ) + } + + private func cgFloat(_ value: Any?) -> CGFloat { + if let value = value as? CGFloat { return value } + if let value = value as? Double { return CGFloat(value) } + if let value = value as? Int { return CGFloat(value) } + if let value = value as? 
NSNumber { return CGFloat(truncating: value) } + return 0 + } +} diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundActivationSession.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundActivationSession.swift new file mode 100644 index 0000000000..76c02f7d5b --- /dev/null +++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundActivationSession.swift @@ -0,0 +1,164 @@ +import AppKit +import CoreGraphics + +final class BackgroundActivationSession: @unchecked Sendable { + private final class TapContext { + let suppressFocusMessages: Bool + + init(suppressFocusMessages: Bool) { + self.suppressFocusMessages = suppressFocusMessages + } + } + + private struct TapRef { + let tap: CFMachPort + let source: CFRunLoopSource + let context: TapContext + } + + private let previousPID: pid_t + private let targetPID: pid_t + private var taps: [TapRef] = [] + private var runLoop: CFRunLoop? + private var thread: Thread? + private var startupError: Error? + private var started = false + private var lastTarget: WindowTarget? 
+
+    /// Creates a session for activating `targetPID` in the background.
+    ///
+    /// - Parameters:
+    ///   - previousPID: Process whose focus-related events are suppressed while
+    ///     the session runs (the runtime passes the frontmost app's pid).
+    ///   - targetPID: Process that receives the synthetic activation events.
+    init(previousPID: pid_t, targetPID: pid_t) {
+        self.previousPID = previousPID
+        self.targetPID = targetPID
+    }
+
+    deinit {
+        // Safe after an explicit `stop()`: `stop()` is idempotent.
+        stop()
+    }
+
+    /// Spawns the dedicated run-loop thread and installs the event taps on it.
+    ///
+    /// Blocks the caller until the taps are installed or installation failed.
+    /// Calling `start()` on an already started session is a no-op.
+    ///
+    /// - Throws: The error raised while installing the taps. Everything that was
+    ///   partially installed is torn down before the error is rethrown.
+    func start() throws {
+        guard !started else { return }
+        // A failure recorded by an earlier attempt must not leak into this one:
+        // the thread below only enters its run loop when `startupError` is nil.
+        startupError = nil
+
+        let ready = DispatchSemaphore(value: 0)
+        let thread = Thread { [weak self] in
+            guard let self else {
+                ready.signal()
+                return
+            }
+            self.runLoop = CFRunLoopGetCurrent()
+            do {
+                try self.installTapsOnCurrentRunLoop()
+            } catch {
+                self.startupError = error
+            }
+            ready.signal()
+            if self.startupError == nil {
+                CFRunLoopRun()
+            }
+        }
+        thread.name = "OpenWorkBackgroundActivationSession"
+        self.thread = thread
+        thread.start()
+        ready.wait()
+
+        if let startupError {
+            // The first tap may have been installed before the second one
+            // failed; invalidate it now instead of waiting for `deinit`.
+            stop()
+            throw startupError
+        }
+        started = true
+    }
+
+    /// Activates `target` without bringing it to the foreground: posts an
+    /// AppKit-defined event with subtype 1, then a background click at the
+    /// window's center, with short settle delays in between.
+    ///
+    /// - Throws: `ComputerUseError.strictModeViolation` when the target has no
+    ///   CG window number, or any error from the click / the sleeps.
+    func activate(target: WindowTarget) async throws {
+        guard let windowNumber = target.windowNumber else {
+            throw ComputerUseError.strictModeViolation("target window has no CG window number")
+        }
+
+        // NOTE(review): subtype 1 matches AppKit's "application activated"
+        // subtype and 2 (used in `stop()`) "application deactivated" — confirm.
+        postAppKitDefined(subtype: 1, target: target, windowNumber: windowNumber)
+        try await Task.sleep(nanoseconds: 25_000_000)
+        try await BackgroundInputDispatcher.click(pid: target.pid, windowNumber: windowNumber, point: target.center)
+        try await Task.sleep(nanoseconds: 80_000_000)
+        lastTarget = target
+    }
+
+    /// Ends the session: posts the subtype-2 event to the last activated
+    /// target, invalidates all taps and stops the run loop.
+    ///
+    /// Idempotent — a second call (for example from `deinit` after an explicit
+    /// `stop()`) posts nothing and touches nothing.
+    func stop() {
+        if let target = lastTarget, let windowNumber = target.windowNumber {
+            postAppKitDefined(subtype: 2, target: target, windowNumber: windowNumber)
+        }
+        // Forget the target so the subtype-2 event is posted once per activation.
+        lastTarget = nil
+        for tapRef in taps {
+            CFMachPortInvalidate(tapRef.tap)
+        }
+        taps.removeAll()
+        if let runLoop {
+            CFRunLoopStop(runLoop)
+        }
+        runLoop = nil
+        thread = nil
+        started = false
+    }
+
+    /// Installs one tap per involved process on the calling thread's run loop.
+    /// Focus messages are suppressed only for `previousPID`, and only when it
+    /// differs from `targetPID`.
+    private func installTapsOnCurrentRunLoop() throws {
+        if previousPID != targetPID {
+            taps.append(try installTap(pid: previousPID, suppressFocusMessages: true))
+        }
+        taps.append(try installTap(pid: targetPID, suppressFocusMessages: false))
+    }
+
+    /// Creates and enables a per-process event tap for `pid` that listens to all
+    /// event types and adds its source to the current run loop.
+    ///
+    /// - Throws: `ComputerUseError.strictModeViolation` when the tap or its run
+    ///   loop source cannot be created.
+    private func installTap(pid: pid_t, suppressFocusMessages: Bool) throws -> TapRef {
+        let context = TapContext(suppressFocusMessages: suppressFocusMessages)
+        // Passed unretained: the returned TapRef carries `context`, which
+        // presumably keeps it alive for as long as the tap is registered.
+        let userInfo = Unmanaged.passUnretained(context).toOpaque()
+        guard let tap = CGEvent.tapCreateForPid(
+            pid: pid,
+            place: .headInsertEventTap,
+            options: .defaultTap,
+            eventsOfInterest: CGEventMask.max,
+            callback: BackgroundActivationSession.eventTapCallback,
+            userInfo: userInfo
+        ) else {
+            throw ComputerUseError.strictModeViolation("could not install per-process event tap for pid \(pid)")
+        }
+
+        guard let source = CFMachPortCreateRunLoopSource(kCFAllocatorDefault, tap, 0) else {
+            CFMachPortInvalidate(tap)
+            throw ComputerUseError.strictModeViolation("could not create run loop source for pid \(pid) event tap")
+        }
+
+        CFRunLoopAddSource(CFRunLoopGetCurrent(), source, .commonModes)
+        CGEvent.tapEnable(tap: tap, enable: true)
+        return TapRef(tap: tap, source: source, context: context)
+    }
+
+    /// Builds an `.appKitDefined` NSEvent with the given subtype, addresses it
+    /// to the target window and posts it to the target process. Silently does
+    /// nothing when the event (or its CGEvent form) cannot be created.
+    private func postAppKitDefined(subtype: Int16, target: WindowTarget, windowNumber: Int) {
+        guard let event = NSEvent.otherEvent(
+            with: .appKitDefined,
+            location: target.center,
+            modifierFlags: [],
+            timestamp: ProcessInfo.processInfo.systemUptime,
+            windowNumber: windowNumber,
+            context: nil,
+            subtype: subtype,
+            data1: 0,
+            data2: 0
+        )?.cgEvent else {
+            return
+        }
+
+        BackgroundInputDispatcher.address(event, pid: target.pid, windowNumber: windowNumber)
+        event.postToPid(target.pid)
+    }
+
+    /// Tap callback: drops focus messages for taps whose context asks for it and
+    /// passes every other event through unchanged.
+    private static let eventTapCallback: CGEventTapCallBack = { _, type, event, userInfo in
+        // NOTE(review): a tap disabled by timeout/user input is passed through
+        // here but never re-enabled — confirm that is intended.
+        if type == .tapDisabledByTimeout || type == .tapDisabledByUserInput {
+            return Unmanaged.passUnretained(event)
+        }
+        guard let userInfo else {
+            return Unmanaged.passUnretained(event)
+        }
+        // `userInfo` is the TapContext handed over in `installTap`; the generic
+        // argument is required because nothing else fixes the instance type.
+        let context = Unmanaged<TapContext>.fromOpaque(userInfo).takeUnretainedValue()
+        if context.suppressFocusMessages && BackgroundActivationSession.isFocusMessage(type) {
+            return nil
+        }
+        return Unmanaged.passUnretained(event)
+    }
+
+    /// Returns true for the raw event types treated as focus messages.
+    /// 13 is the raw value of the AppKit-defined event type; 19 and 20 are
+    /// undocumented here — TODO confirm what they represent.
+    private static func isFocusMessage(_ type: CGEventType) -> Bool {
+        let raw = Int(type.rawValue)
+        return raw == 13 || raw == 19 || raw == 20
+    }
+}
diff --git 
a/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundInputDispatcher.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundInputDispatcher.swift
new file mode 100644
index 0000000000..1e9f8829b6
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/BackgroundInputDispatcher.swift
@@ -0,0 +1,140 @@
+import Foundation
+import CoreGraphics
+
+enum BackgroundInputDispatcher {
+    private static let privateWindowField = CGEventField(rawValue: 51)
+    private static let privateRouteField = CGEventField(rawValue: 58)
+
+    /// Posts a left click (or double click) straight to `pid` via `postToPid`.
+    ///
+    /// Each click is a mouse-down, a 30 ms pause and a mouse-up; both events are
+    /// addressed to `windowNumber` and carry the running click state (1, then 2
+    /// for the second click). Consecutive clicks are separated by 50 ms.
+    ///
+    /// - Throws: `ComputerUseError.eventSourceFailed` when no event source can
+    ///   be created, `ComputerUseError.eventCreationFailed` when a mouse event
+    ///   cannot be built, or the error of an interrupted sleep.
+    static func click(pid: pid_t, windowNumber: Int, point: CGPoint, doubleClick: Bool = false) async throws {
+        guard let eventSource = CGEventSource(stateID: .combinedSessionState) else {
+            throw ComputerUseError.eventSourceFailed
+        }
+
+        let totalClicks = doubleClick ? 2 : 1
+        for ordinal in 1...totalClicks {
+            guard let press = CGEvent(mouseEventSource: eventSource, mouseType: .leftMouseDown, mouseCursorPosition: point, mouseButton: .left),
+                  let release = CGEvent(mouseEventSource: eventSource, mouseType: .leftMouseUp, mouseCursorPosition: point, mouseButton: .left) else {
+                throw ComputerUseError.eventCreationFailed
+            }
+            let state = Int64(ordinal)
+
+            // Button down.
+            address(press, pid: pid, windowNumber: windowNumber)
+            press.setIntegerValueField(.mouseEventClickState, value: state)
+            press.setDoubleValueField(.mouseEventPressure, value: 1)
+            press.postToPid(pid)
+
+            try await Task.sleep(nanoseconds: 30_000_000)
+
+            // Button up.
+            address(release, pid: pid, windowNumber: windowNumber)
+            release.setIntegerValueField(.mouseEventClickState, value: state)
+            release.setDoubleValueField(.mouseEventPressure, value: 0)
+            release.postToPid(pid)
+
+            // Pause only between the clicks of a double click, not after the last.
+            let isLastClick = ordinal >= totalClicks
+            if !isLastClick {
+                try await Task.sleep(nanoseconds: 50_000_000)
+            }
+        }
+    }
+
+    static func scroll(pid: pid_t, windowNumber: Int, point: CGPoint, deltaX: Int32, deltaY: Int32) throws {
+        guard let source = CGEventSource(stateID: .combinedSessionState) else {
+            throw ComputerUseError.eventSourceFailed
+        }
+        guard let event = 
CGEvent(scrollWheelEvent2Source: source, units: .line, wheelCount: 2, wheel1: deltaY, wheel2: deltaX, wheel3: 0) else { + throw ComputerUseError.eventCreationFailed + } + event.location = point + address(event, pid: pid, windowNumber: windowNumber) + event.postToPid(pid) + } + + static func typeText(pid: pid_t, text: String) throws { + guard let source = CGEventSource(stateID: .combinedSessionState) else { + throw ComputerUseError.eventSourceFailed + } + + let units = Array(text.utf16) + let chunkSize = 20 + for start in stride(from: 0, to: units.count, by: chunkSize) { + let end = min(start + chunkSize, units.count) + let chunk = Array(units[start.. (flags: CGEventFlags, keyCode: CGKeyCode) { + let parts = combo.lowercased().split(separator: "+").map(String.init) + var flags: CGEventFlags = [] + var keyName = "" + + for part in parts { + switch part { + case "command", "cmd", "meta": flags.insert(.maskCommand) + case "shift": flags.insert(.maskShift) + case "control", "ctrl": flags.insert(.maskControl) + case "option", "alt": flags.insert(.maskAlternate) + default: keyName = part + } + } + + guard let keyCode = keyCodes[keyName] else { + throw ComputerUseError.unknownKey(keyName) + } + return (flags, keyCode) + } + + private static let keyCodes: [String: CGKeyCode] = [ + "return": 0x24, "enter": 0x24, "tab": 0x30, "space": 0x31, + "delete": 0x33, "backspace": 0x33, "escape": 0x35, "esc": 0x35, + "up": 0x7E, "down": 0x7D, "left": 0x7B, "right": 0x7C, + "home": 0x73, "end": 0x77, "pageup": 0x74, "pagedown": 0x79, + "a": 0x00, "b": 0x0B, "c": 0x08, "d": 0x02, "e": 0x0E, + "f": 0x03, "g": 0x05, "h": 0x04, "i": 0x22, "j": 0x26, + "k": 0x28, "l": 0x25, "m": 0x2E, "n": 0x2D, "o": 0x1F, + "p": 0x23, "q": 0x0C, "r": 0x0F, "s": 0x01, "t": 0x11, + "u": 0x20, "v": 0x09, "w": 0x0D, "x": 0x07, "y": 0x10, "z": 0x06, + "0": 0x1D, "1": 0x12, "2": 0x13, "3": 0x14, "4": 0x15, + "5": 0x17, "6": 0x16, "7": 0x1A, "8": 0x1C, "9": 0x19, + "f1": 0x7A, "f2": 0x78, "f3": 0x63, "f4": 0x76, + 
"f5": 0x60, "f6": 0x61, "f7": 0x62, "f8": 0x64,
+        "f9": 0x65, "f10": 0x6D, "f11": 0x67, "f12": 0x6F,
+        "-": 0x1B, "=": 0x18, "[": 0x21, "]": 0x1E,
+        "\\": 0x2A, ";": 0x29, "'": 0x27, ",": 0x2B,
+        ".": 0x2F, "/": 0x2C, "`": 0x32,
+    ]
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/ComputerUseRuntime.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/ComputerUseRuntime.swift
new file mode 100644
index 0000000000..c8e867a422
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/ComputerUseRuntime.swift
@@ -0,0 +1,227 @@
+import AppKit
+import Foundation
+
+actor ComputerUseRuntime {
+    private let accessibility = AccessibilityService()
+    private let foregroundInput = InputService()
+    private var lastSnapshot: AppSnapshot?
+    private var strictMode = true
+    private var activationSession: BackgroundActivationSession?
+    private var activationKey: String?
+    private var activatedWindowKey: String?
+
+    /// Switches the runtime-wide strict (background-only) mode on or off.
+    ///
+    /// Turning strict mode off also stops and forgets the current background
+    /// activation session. The returned metadata echoes the new mode in both
+    /// `strictMode` and `backgroundSafe`.
+    func setStrictMode(_ enabled: Bool) -> ActionMetadata {
+        strictMode = enabled
+
+        let statusMessage: String
+        if enabled {
+            statusMessage = "Strict background mode enabled."
+        } else {
+            // Foreground fallbacks are allowed again, so the background
+            // activation bookkeeping is no longer needed.
+            activationSession?.stop()
+            activationSession = nil
+            activationKey = nil
+            activatedWindowKey = nil
+            statusMessage = "Strict background mode disabled. Foreground fallback is allowed."
+        }
+
+        return ActionMetadata(
+            ok: true,
+            path: .none,
+            strictMode: enabled,
+            backgroundSafe: enabled,
+            fallbackUsed: false,
+            message: statusMessage
+        )
+    }
+
+    func snapshot(appName: String?, strict requestedStrict: Bool?) async throws -> AppSnapshot {
+        let effectiveStrict = requestedStrict ?? 
strictMode
+        if !effectiveStrict {
+            tearDownActivationSession()
+        }
+
+        var target = try accessibility.resolveTarget(appName: appName)
+        let backgroundActivated: Bool
+        if effectiveStrict, !target.isFrontmost {
+            backgroundActivated = try await ensureBackgroundActivation(target: target)
+            // Resolve again: the target is looked up afresh after activation.
+            target = try accessibility.resolveTarget(appName: appName)
+        } else {
+            backgroundActivated = false
+        }
+
+        let snapshot = try await accessibility.snapshot(
+            target: target,
+            strictMode: effectiveStrict,
+            backgroundActivated: backgroundActivated
+        )
+        lastSnapshot = snapshot
+        return snapshot
+    }
+
+    /// Clicks an element (by `ref` or `index`) or a screenshot coordinate of the
+    /// last snapshot.
+    ///
+    /// For an element, AX press is tried first, then AX focus, and only then a
+    /// synthetic click at the element's center (reported with
+    /// `fallbackUsed: true`). Without a matching element, `imageX`/`imageY` are
+    /// mapped to screen coordinates and clicked.
+    ///
+    /// - Throws: `ComputerUseError.noSnapshot` without a prior snapshot,
+    ///   `ComputerUseError.invalidElement` when neither an element nor a
+    ///   coordinate pair is usable, or any error from the click itself.
+    func click(ref: String?, index: Int?, imageX: Double?, imageY: Double?, clickCount: Int, strict requestedStrict: Bool?) async throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+
+        if let record = findRecord(ref: ref, index: index, in: snapshot) {
+            if record.semantic.capabilities.canPress, accessibility.press(record: record) {
+                return ActionMetadata(ok: true, path: .accessibility, strictMode: effectiveStrict, backgroundSafe: true, fallbackUsed: false, message: "Pressed \(record.semantic.ref) via AXPress.")
+            }
+            if record.semantic.capabilities.canFocus, accessibility.focus(record: record) {
+                return ActionMetadata(ok: true, path: .accessibility, strictMode: effectiveStrict, backgroundSafe: true, fallbackUsed: false, message: "Focused \(record.semantic.ref) via AX.")
+            }
+            return try await clickPoint(record.semantic.frame.center, clickCount: clickCount, strict: effectiveStrict, fallbackUsed: true)
+        }
+
+        if let imageX, let imageY {
+            let point = snapshot.screenshotMeta.toScreen(imageX: imageX, imageY: imageY)
+            return try await clickPoint(point, clickCount: clickCount, strict: effectiveStrict, fallbackUsed: false)
+        }
+
+        throw ComputerUseError.invalidElement(ref ?? index.map(String.init) ?? "")
+    }
+
+    /// Types `text` into the process of the last snapshot: via `postToPid` in
+    /// strict mode, via the foreground input service otherwise.
+    func typeText(_ text: String, strict requestedStrict: Bool?) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+        if effectiveStrict {
+            try BackgroundInputDispatcher.typeText(pid: snapshot.pid, text: text)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: false, message: "Typed text with postToPid.")
+        }
+
+        try foregroundInput.typeText(text)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Typed text with foreground HID fallback.")
+    }
+
+    /// Presses a key combination in the process of the last snapshot: via
+    /// `postToPid` in strict mode, via the foreground input service otherwise.
+    func pressKey(_ combo: String, strict requestedStrict: Bool?) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+        if effectiveStrict {
+            try BackgroundInputDispatcher.pressKey(pid: snapshot.pid, combo: combo)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: false, message: "Pressed key with postToPid.")
+        }
+
+        try foregroundInput.pressKey(combo)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Pressed key with foreground HID fallback.")
+    }
+
+    /// Scrolls the window of the last snapshot by roughly `pages` pages (five
+    /// line units per page, at least one) in `direction`.
+    ///
+    /// The scroll happens at `imageX`/`imageY` when both are given, otherwise at
+    /// the center of the captured bounds.
+    ///
+    /// - Throws: `ComputerUseError.noSnapshot` without a prior snapshot,
+    ///   `ComputerUseError.strictModeViolation` when strict mode is requested
+    ///   but the snapshot has no CG window number, or any dispatch error.
+    func scroll(direction: String?, pages: Double, imageX: Double?, imageY: Double?, strict requestedStrict: Bool?) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        let effectiveStrict = requestedStrict ?? snapshot.strictMode
+        let amount = scrollLineCount(pages: pages)
+        let deltas = scrollDeltas(direction: direction, amount: amount)
+        let point: CGPoint = {
+            if let imageX, let imageY {
+                return snapshot.screenshotMeta.toScreen(imageX: imageX, imageY: imageY)
+            }
+            return CGPoint(x: snapshot.screenshotMeta.capturedBounds.midX, y: snapshot.screenshotMeta.capturedBounds.midY)
+        }()
+
+        if effectiveStrict {
+            guard let windowNumber = snapshot.windowNumber else {
+                throw ComputerUseError.strictModeViolation("background scroll requires a CG window number")
+            }
+            try BackgroundInputDispatcher.scroll(pid: snapshot.pid, windowNumber: windowNumber, point: point, deltaX: deltas.x, deltaY: deltas.y)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: false, message: "Scrolled with postToPid.")
+        }
+
+        try foregroundInput.scroll(point: point, deltaX: deltas.x, deltaY: deltas.y)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Scrolled with foreground HID fallback.")
+    }
+
+    /// Sets the AX value of the element identified by `ref` or `index`.
+    /// `ok` in the result reflects whether the accessibility call succeeded.
+    func setValue(ref: String?, index: Int?, value: String) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        guard let record = findRecord(ref: ref, index: index, in: snapshot) else {
+            throw ComputerUseError.invalidElement(ref ?? index.map(String.init) ?? "")
+        }
+        let ok = accessibility.setValue(record: record, value: value)
+        return ActionMetadata(ok: ok, path: .accessibility, strictMode: snapshot.strictMode, backgroundSafe: true, fallbackUsed: false, message: ok ? "Set \(record.semantic.ref) via AXValue." : "Element value is not settable.")
+    }
+
+    /// Performs the named AX action on the element identified by `ref` or
+    /// `index`. `ok` in the result reflects whether the action succeeded.
+    func performAction(ref: String?, index: Int?, action: String) throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        guard let record = findRecord(ref: ref, index: index, in: snapshot) else {
+            throw ComputerUseError.invalidElement(ref ?? index.map(String.init) ?? "")
+        }
+        let ok = accessibility.performAction(record: record, action: action)
+        return ActionMetadata(ok: ok, path: .accessibility, strictMode: snapshot.strictMode, backgroundSafe: true, fallbackUsed: false, message: ok ? "Performed \(action) on \(record.semantic.ref)." : "AX action \(action) failed.")
+    }
+
+    /// Sleeps for `milliseconds`, clamped to 0...10 000. An interrupted sleep is
+    /// deliberately ignored; the result always reports the clamped duration.
+    func wait(milliseconds: Int) async -> ActionMetadata {
+        let clamped = max(0, min(milliseconds, 10_000))
+        try? await Task.sleep(nanoseconds: UInt64(clamped) * 1_000_000)
+        return ActionMetadata(ok: true, path: .none, strictMode: strictMode, backgroundSafe: true, fallbackUsed: false, message: "Waited \(clamped)ms.")
+    }
+
+    /// Clicks `point` (screen coordinates): via `postToPid` against the
+    /// snapshot's window in strict mode, via the foreground input service
+    /// otherwise. Any `clickCount >= 2` is sent as a double click.
+    private func clickPoint(_ point: CGPoint, clickCount: Int, strict: Bool, fallbackUsed: Bool) async throws -> ActionMetadata {
+        let snapshot = try requireSnapshot()
+        if strict {
+            guard let windowNumber = snapshot.windowNumber else {
+                throw ComputerUseError.strictModeViolation("background click requires a CG window number")
+            }
+            try await BackgroundInputDispatcher.click(pid: snapshot.pid, windowNumber: windowNumber, point: point, doubleClick: clickCount >= 2)
+            return ActionMetadata(ok: true, path: .backgroundCGEvent, strictMode: true, backgroundSafe: true, fallbackUsed: fallbackUsed, message: "Clicked with postToPid at \(Int(point.x)),\(Int(point.y)).")
+        }
+
+        try await foregroundInput.click(point: point, doubleClick: clickCount >= 2)
+        return ActionMetadata(ok: true, path: .foregroundCGEvent, strictMode: false, backgroundSafe: false, fallbackUsed: true, message: "Clicked with foreground HID fallback at \(Int(point.x)),\(Int(point.y)).")
+    }
+
+    /// Makes sure a started activation session exists for the pair
+    /// (frontmost pid, target pid) and that the target window was activated
+    /// through it. Returns `true` once both hold.
+    ///
+    /// - Throws: `ComputerUseError.noFrontmostApplication` when no frontmost
+    ///   app can be determined, or any error from starting / activating the
+    ///   session.
+    private func ensureBackgroundActivation(target: WindowTarget) async throws -> Bool {
+        guard let previousPID = NSWorkspace.shared.frontmostApplication?.processIdentifier else {
+            throw ComputerUseError.noFrontmostApplication
+        }
+        let nextActivationKey = "\(previousPID):\(target.pid)"
+        if activationKey != nextActivationKey {
+            // Forget the old session before starting the new one. If `start()`
+            // throws, no stopped session may stay registered under the old key,
+            // otherwise a later call with that key would skip activation.
+            tearDownActivationSession()
+            let next = BackgroundActivationSession(previousPID: previousPID, targetPID: target.pid)
+            try next.start()
+            activationSession = next
+            activationKey = nextActivationKey
+            activatedWindowKey = nil
+        }
+
+        let nextWindowKey = "\(target.pid):\(target.windowNumber ?? -1)"
+        if activatedWindowKey != nextWindowKey {
+            guard let activationSession else {
+                throw ComputerUseError.strictModeViolation("background activation session was not created")
+            }
+            try await activationSession.activate(target: target)
+            activatedWindowKey = nextWindowKey
+        }
+        return true
+    }
+
+    /// Stops the current background activation session, if any, and clears all
+    /// bookkeeping that refers to it.
+    private func tearDownActivationSession() {
+        activationSession?.stop()
+        activationSession = nil
+        activationKey = nil
+        activatedWindowKey = nil
+    }
+
+    /// Converts a page count into scroll line units: five per page, truncated
+    /// toward zero, at least 1. NaN yields 1 and values beyond `Int32.max` are
+    /// clamped, because `Int32(_:)` traps on NaN, infinite and out-of-range
+    /// doubles and `pages` comes straight from the client.
+    private func scrollLineCount(pages: Double) -> Int32 {
+        let lines = (pages * 5).rounded(.towardZero)
+        guard !lines.isNaN else { return 1 }
+        return Int32(min(max(lines, 1), Double(Int32.max)))
+    }
+
+    /// Returns the last snapshot or throws `ComputerUseError.noSnapshot`.
+    private func requireSnapshot() throws -> AppSnapshot {
+        guard let lastSnapshot else { throw ComputerUseError.noSnapshot }
+        return lastSnapshot
+    }
+
+    /// Looks up an element record. A `ref` wins over `index`; an `index` is
+    /// matched against element ids first and then used as a zero-based position.
+    private func findRecord(ref: String?, index: Int?, in snapshot: AppSnapshot) -> AXElementRecord? {
+        if let ref {
+            let normalized = normalizeRef(ref)
+            return snapshot.records.first { $0.semantic.ref == normalized }
+        }
+        if let index {
+            if let byID = snapshot.records.first(where: { $0.semantic.id == index }) {
+                return byID
+            }
+            if index >= 0 && index < snapshot.records.count {
+                return snapshot.records[index]
+            }
+        }
+        return nil
+    }
+
+    /// Brings `raw` into the `{eN}` form: `{e1}` stays, `e1` and `1` both
+    /// become `{e1}`. Surrounding whitespace is ignored.
+    private func normalizeRef(_ raw: String) -> String {
+        let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
+        if trimmed.hasPrefix("{e"), trimmed.hasSuffix("}") { return trimmed }
+        if trimmed.hasPrefix("e") { return "{\(trimmed)}" }
+        return "{e\(trimmed)}"
+    }
+
+    /// Maps a direction name (case-insensitive, default "down") to wheel deltas.
+    /// Unknown names scroll down.
+    private func scrollDeltas(direction: String?, amount: Int32) -> (x: Int32, y: Int32) {
+        switch direction?.lowercased() ?? "down" {
+        case "up": return (0, amount)
+        case "left": return (amount, 0)
+        case "right": return (-amount, 0)
+        default: return (0, -amount)
+        }
+    }
+}
diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/InputService.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/InputService.swift
new file mode 100644
index 0000000000..fcea3c86f6
--- /dev/null
+++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/InputService.swift
@@ -0,0 +1,102 @@
+import Foundation
+import CoreGraphics
+
+final class InputService: @unchecked Sendable {
+    func moveMouse(point: CGPoint) throws {
+        guard let source = CGEventSource(stateID: .combinedSessionState) else {
+            throw ComputerUseError.eventSourceFailed
+        }
+        guard let event = CGEvent(mouseEventSource: source, mouseType: .mouseMoved, mouseCursorPosition: point, mouseButton: .left) else {
+            throw ComputerUseError.eventCreationFailed
+        }
+        event.post(tap: .cghidEventTap)
+    }
+
+    func click(point: CGPoint, doubleClick: Bool = false) async throws {
+        guard let source = CGEventSource(stateID: .combinedSessionState) else {
+            throw ComputerUseError.eventSourceFailed
+        }
+
+        if let move = CGEvent(mouseEventSource: source, mouseType: .mouseMoved, mouseCursorPosition: point, mouseButton: .left) {
+            move.post(tap: .cghidEventTap)
+        }
+        try await Task.sleep(nanoseconds: 50_000_000)
+
+        let count = doubleClick ? 
2 : 1 + for clickState in 1...count { + guard let down = CGEvent(mouseEventSource: source, mouseType: .leftMouseDown, mouseCursorPosition: point, mouseButton: .left), + let up = CGEvent(mouseEventSource: source, mouseType: .leftMouseUp, mouseCursorPosition: point, mouseButton: .left) else { + throw ComputerUseError.eventCreationFailed + } + down.setIntegerValueField(.mouseEventClickState, value: Int64(clickState)) + up.setIntegerValueField(.mouseEventClickState, value: Int64(clickState)) + down.post(tap: .cghidEventTap) + up.post(tap: .cghidEventTap) + } + } + + func typeText(_ text: String) throws { + guard let source = CGEventSource(stateID: .combinedSessionState) else { + throw ComputerUseError.eventSourceFailed + } + let units = Array(text.utf16) + for start in stride(from: 0, to: units.count, by: 20) { + let end = min(start + 20, units.count) + let chunk = Array(units[start..= 2 else { return } + guard let source = CGEventSource(stateID: .combinedSessionState) else { + throw ComputerUseError.eventSourceFailed + } + guard let down = CGEvent(mouseEventSource: source, mouseType: .leftMouseDown, mouseCursorPosition: path[0], mouseButton: .left) else { + throw ComputerUseError.eventCreationFailed + } + down.post(tap: .cghidEventTap) + for point in path.dropFirst().dropLast() { + if let drag = CGEvent(mouseEventSource: source, mouseType: .leftMouseDragged, mouseCursorPosition: point, mouseButton: .left) { + drag.post(tap: .cghidEventTap) + } + try await Task.sleep(nanoseconds: 12_000_000) + } + guard let last = path.last, + let up = CGEvent(mouseEventSource: source, mouseType: .leftMouseUp, mouseCursorPosition: last, mouseButton: .left) else { + throw ComputerUseError.eventCreationFailed + } + up.post(tap: .cghidEventTap) + } +} diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/MCPServer.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/MCPServer.swift new file mode 100644 index 0000000000..6a3ce78ddd --- /dev/null +++ 
b/packages/handsfree/native/HandsFree/Sources/ComputerUse/MCPServer.swift
@@ -0,0 +1,533 @@
+import AppKit
+import ApplicationServices
+import Foundation
+
+actor MCPServer {
+    private let runtime = ComputerUseRuntime()
+    private let input = InputService()
+
+    /// Serves JSON-RPC requests read line by line from standard input until
+    /// input ends.
+    ///
+    /// Empty lines are skipped; lines that are not a JSON object are logged and
+    /// skipped. `initialize`, `tools/list` and `tools/call` are answered,
+    /// `notifications/initialized` is accepted silently, and any other method
+    /// gets a -32601 error when the message carries an id.
+    func run() async {
+        log("HandsFree computer-use server starting")
+        while let rawLine = readLine(strippingNewline: true) {
+            if rawLine.isEmpty { continue }
+            guard let payload = rawLine.data(using: .utf8),
+                  let decoded = try? JSONSerialization.jsonObject(with: payload),
+                  let message = decoded as? [String: Any] else {
+                log("Invalid JSON-RPC line")
+                continue
+            }
+
+            let requestID = message["id"]
+            let method = (message["method"] as? String) ?? ""
+            let params = (message["params"] as? [String: Any]) ?? [:]
+
+            switch method {
+            case "initialize":
+                respond(id: requestID, result: [
+                    "protocolVersion": "2025-03-26",
+                    "capabilities": ["tools": [:]],
+                    "serverInfo": ["name": "openwork-handsfree-computer-use", "version": "0.1.0"],
+                ])
+            case "notifications/initialized":
+                // Notification: nothing to answer.
+                break
+            case "tools/list":
+                respond(id: requestID, result: ["tools": toolSchemas()])
+            case "tools/call":
+                let toolName = (params["name"] as? String) ?? ""
+                let toolArgs = (params["arguments"] as? [String: Any]) ?? [:]
+                let content = await executeTool(name: toolName, args: toolArgs)
+                respond(id: requestID, result: ["content": content])
+            case _ where requestID != nil:
+                // Unknown request (has an id): report it. Unknown notifications
+                // fall through to `default` and are ignored.
+                respondError(id: requestID, code: -32601, message: "Method not found: \(method)")
+            default:
+                break
+            }
+        }
+    }
+
+    /// Returns the schema of every tool this server offers, in listing order.
+    private func toolSchemas() -> [[String: Any]] {
+        // Property fragments shared by several tools.
+        let strictOverride = ["type": "boolean", "description": "Override strict mode for this action."]
+        let elementIndex = ["type": "number", "description": "Element id or zero-based compatibility index."]
+        let snapshotRef = ["type": "string", "description": "Semantic ref from snapshot."]
+        let plainString = ["type": "string"]
+        let plainNumber = ["type": "number"]
+        let plainBoolean = ["type": "boolean"]
+        let screenPoint = ["x": plainNumber, "y": plainNumber]
+
+        return [
+            toolSchema(
+                name: "snapshot",
+                description: "Return target-window screenshot plus compact semantic AX state. Uses strict background activation by default.",
+                properties: [
+                    "app": ["type": "string", "description": "Optional running app name. Omit for frontmost app."],
+                    "strict": ["type": "boolean", "description": "Keep actions on background-safe AX/postToPid paths. Default true."],
+                ]
+            ),
+            toolSchema(
+                name: "click",
+                description: "Click a semantic ref like {e1}, an index, or screenshot x/y. AX is tried first; strict mode only falls back to background postToPid.",
+                properties: [
+                    "ref": ["type": "string", "description": "Semantic ref from snapshot, e.g. {e1}."],
+                    "index": elementIndex,
+                    "x": ["type": "number", "description": "Screenshot x coordinate."],
+                    "y": ["type": "number", "description": "Screenshot y coordinate."],
+                    "click_count": ["type": "number", "description": "1 or 2. Default 1."],
+                    "strict": strictOverride,
+                ]
+            ),
+            toolSchema(
+                name: "type_text",
+                description: "Type text into the target process. In strict mode this uses CGEvent.postToPid and does not move the real cursor.",
+                properties: [
+                    "text": ["type": "string", "description": "Text to type."],
+                    "strict": strictOverride,
+                ]
+            ),
+            toolSchema(
+                name: "press_key",
+                description: "Press a key combo such as command+k, return, tab, or escape.",
+                properties: [
+                    "combo": ["type": "string", "description": "Key combo."],
+                    "strict": strictOverride,
+                ]
+            ),
+            toolSchema(
+                name: "scroll",
+                description: "Scroll the target window without foregrounding it in strict mode.",
+                properties: [
+                    "direction": ["type": "string", "description": "up, down, left, or right."],
+                    "pages": ["type": "number", "description": "Approximate page count. Default 1."],
+                    "x": ["type": "number", "description": "Optional screenshot x coordinate."],
+                    "y": ["type": "number", "description": "Optional screenshot y coordinate."],
+                    "strict": strictOverride,
+                ]
+            ),
+            toolSchema(
+                name: "set_value",
+                description: "Set a semantic AX element value directly. This stays background-safe.",
+                properties: [
+                    "ref": snapshotRef,
+                    "index": elementIndex,
+                    "value": ["type": "string", "description": "Value to set."],
+                ]
+            ),
+            toolSchema(
+                name: "perform_action",
+                description: "Perform a named AX action such as AXPress, AXShowMenu, AXIncrement, or AXDecrement.",
+                properties: [
+                    "ref": snapshotRef,
+                    "index": elementIndex,
+                    "action": ["type": "string", "description": "AX action name. Default AXPress."],
+                ]
+            ),
+            toolSchema(
+                name: "wait",
+                description: "Wait for UI to settle.",
+                properties: ["milliseconds": ["type": "number", "description": "Wait time. Default 1000."]]
+            ),
+            toolSchema(
+                name: "set_strict_mode",
+                description: "Enable or disable strict background mode. Strict mode rejects foreground fallbacks.",
+                properties: ["enabled": ["type": "boolean", "description": "Whether strict background mode is enabled."]]
+            ),
+            toolSchema(name: "check_permissions", description: "Check Accessibility and Screen Recording permission status.", properties: [:]),
+            toolSchema(name: "get_app_state", description: "Compatibility alias for snapshot.", properties: ["app": plainString, "strict": plainBoolean]),
+            toolSchema(name: "launch_app", description: "Launch a macOS app by name.", properties: ["name": plainString]),
+            toolSchema(name: "activate_app", description: "Bring a running macOS app to the foreground.", properties: ["name": plainString]),
+            toolSchema(name: "list_apps", description: "List running regular macOS apps.", properties: [:]),
+            toolSchema(name: "open_url", description: "Open a URL in the default browser or a specific browser app.", properties: ["url": plainString, "app": plainString]),
+            toolSchema(name: "clipboard_read", description: "Read text from the macOS clipboard.", properties: [:]),
+            toolSchema(name: "clipboard_write", description: "Write text to the macOS clipboard.", properties: ["text": plainString]),
+            toolSchema(name: "display_info", description: "Return main display logical dimensions and scale factor.", properties: [:]),
+            toolSchema(name: "cua_screenshot", description: "Compatibility full-screen screenshot for CUA loops. Returns logical-size PNG.", properties: [:]),
+            toolSchema(name: "cua_click", description: "Compatibility click at absolute screen coordinates.", properties: screenPoint),
+            toolSchema(name: "cua_double_click", description: "Compatibility double-click at absolute screen coordinates.", properties: screenPoint),
+            toolSchema(name: "cua_move", description: "Compatibility mouse move to absolute screen coordinates.", properties: screenPoint),
+            toolSchema(name: "cua_type", description: "Compatibility text typing into focused input.", properties: ["text": plainString]),
+            toolSchema(name: "cua_keypress", description: "Compatibility keypress using CUA key names.", properties: ["keys": ["type": "array", "items": ["type": "string"]]]),
+            toolSchema(name: "cua_scroll", description: "Compatibility scroll at absolute screen coordinates.", properties: ["x": plainNumber, "y": plainNumber, "scroll_x": plainNumber, "scroll_y": plainNumber]),
+            toolSchema(name: "cua_drag", description: "Compatibility drag over an array of [x,y] points.", properties: ["path": ["type": "array"]]),
+            toolSchema(name: "cua_wait", description: "Compatibility wait for UI to settle.", properties: [:]),
+        ]
+    }
+
+    /// Wraps a tool's name, description and argument properties into the schema
+    /// dictionary reported by `tools/list` (an object-typed `inputSchema`).
+    private func toolSchema(name: String, description: String, properties: [String: Any]) -> [String: Any] {
+        let inputSchema: [String: Any] = ["type": "object", "properties": properties]
+        return ["name": name, "description": description, "inputSchema": inputSchema]
+    }
+
+
/// Dispatches one tool call to the semantic runtime or to the raw input layer.
///
/// - Parameters:
///   - name: Tool name; unknown names produce an `{"ok": false}` payload.
///   - args: Decoded JSON arguments of the call.
/// - Returns: MCP content items. Nothing is thrown to the caller: any error
///   raised by a handler is converted to a JSON text item via `errorPayload`.
private func executeTool(name: String, args: [String: Any]) async -> [[String: Any]] {
    do {
        switch name {
        case "snapshot", "get_app_state":
            return try await snapshotResult(args: args)
        case "click":
            let metadata = try await runtime.click(
                ref: args["ref"] as? String,
                index: intArg(args, "index"),
                imageX: doubleArg(args, "x"),
                imageY: doubleArg(args, "y"),
                clickCount: intArg(args, "click_count") ?? 1,
                strict: boolArg(args, "strict")
            )
            return jsonResult(metadata.dictionary)
        case "type_text":
            let metadata = try await runtime.typeText(args["text"] as? String ?? "", strict: boolArg(args, "strict"))
            return jsonResult(metadata.dictionary)
        case "press_key":
            let metadata = try await runtime.pressKey(args["combo"] as? String ?? "", strict: boolArg(args, "strict"))
            return jsonResult(metadata.dictionary)
        case "scroll":
            let metadata = try await runtime.scroll(
                direction: args["direction"] as? String,
                pages: doubleArg(args, "pages") ?? 1,
                imageX: doubleArg(args, "x"),
                imageY: doubleArg(args, "y"),
                strict: boolArg(args, "strict")
            )
            return jsonResult(metadata.dictionary)
        case "set_value":
            let metadata = try await runtime.setValue(
                ref: args["ref"] as? String,
                index: intArg(args, "index"),
                value: args["value"] as? String ?? ""
            )
            return jsonResult(metadata.dictionary)
        case "perform_action":
            let metadata = try await runtime.performAction(
                ref: args["ref"] as? String,
                index: intArg(args, "index"),
                action: args["action"] as? String ?? kAXPressAction
            )
            return jsonResult(metadata.dictionary)
        case "wait":
            let metadata = await runtime.wait(milliseconds: intArg(args, "milliseconds") ?? 1000)
            return jsonResult(metadata.dictionary)
        case "set_strict_mode":
            // A missing `enabled` argument turns strict mode on.
            let metadata = await runtime.setStrictMode(boolArg(args, "enabled") ?? true)
            return jsonResult(metadata.dictionary)
        case "check_permissions":
            return jsonResult(checkPermissions())
        case "launch_app":
            return try await jsonResult(handleLaunchApp(args: args))
        case "activate_app":
            return jsonResult(handleActivateApp(args: args))
        case "list_apps":
            return jsonResult(["ok": true, "apps": runningApps().compactMap(\.localizedName).sorted()])
        case "open_url":
            return try await jsonResult(handleOpenURL(args: args))
        case "clipboard_read":
            return jsonResult(["ok": true, "text": NSPasteboard.general.string(forType: .string) ?? ""])
        case "clipboard_write":
            let pasteboard = NSPasteboard.general
            pasteboard.clearContents()
            pasteboard.setString(args["text"] as? String ?? "", forType: .string)
            return jsonResult(["ok": true])
        case "display_info":
            return jsonResult(displayInfo())
        case "cua_screenshot":
            return try cuaScreenshotResult()
        case "cua_click":
            try await input.click(point: cuaPoint(args))
            return jsonResult(["ok": true])
        case "cua_double_click":
            try await input.click(point: cuaPoint(args), doubleClick: true)
            return jsonResult(["ok": true])
        case "cua_move":
            try input.moveMouse(point: cuaPoint(args))
            return jsonResult(["ok": true])
        case "cua_type":
            try input.typeText(args["text"] as? String ?? "")
            return jsonResult(["ok": true])
        case "cua_keypress":
            try input.pressKey(cuaKeysToCombo(args["keys"] as? [String] ?? []))
            return jsonResult(["ok": true])
        case "cua_scroll":
            // Deltas are caller-supplied; clamp them so an oversized value
            // cannot trap the process during the Int -> Int32 conversion.
            let deltaX = Int32(clamping: intArg(args, "scroll_x") ?? 0)
            let scrollY = Int32(clamping: intArg(args, "scroll_y") ?? 0)
            try input.scroll(
                point: cuaPoint(args),
                deltaX: deltaX,
                // The vertical delta is sign-flipped before reaching the input
                // layer. Negate in Int64 so Int32.min cannot overflow.
                deltaY: Int32(clamping: -Int64(scrollY))
            )
            return jsonResult(["ok": true])
        case "cua_drag":
            try await input.drag(path: parsePointPath(args["path"]))
            return jsonResult(["ok": true])
        case "cua_wait":
            try await Task.sleep(nanoseconds: 1_000_000_000)
            return jsonResult(["ok": true])
        default:
            return jsonResult(["ok": false, "error": "Unknown tool: \(name)"])
        }
    } catch {
        return jsonResult(errorPayload(error))
    }
}

/// Reads the absolute `x`/`y` coordinates of a `cua_*` call.
///
/// Uses `doubleArg` so fractional values and decimal strings are kept instead
/// of collapsing to 0. A missing or unreadable coordinate still defaults to 0.
private func cuaPoint(_ args: [String: Any]) -> CGPoint {
    CGPoint(x: doubleArg(args, "x") ?? 0, y: doubleArg(args, "y") ?? 0)
}
/// Reports the Accessibility and Screen Recording permission status.
///
/// - Returns: `ok` (always `true`), `accessibility` from `AXIsProcessTrusted()`,
///   and `screenRecording` from `CGPreflightScreenCaptureAccess()`.
private func checkPermissions() -> [String: Any] {
    // Ask the system directly rather than probing with a 1x1 capture: a
    // capture call can still hand back an image when access has not been
    // granted, which would make this check report `true` incorrectly.
    let screenRecording = CGPreflightScreenCaptureAccess()
    return ["ok": true, "accessibility": AXIsProcessTrusted(), "screenRecording": screenRecording]
}
/// Captures the screen returned by `NSScreen.main` and returns it as a
/// logical-size PNG.
///
/// The capture rectangle is that display's own bounds. The previous
/// `CGRect.null` rectangle asks for the box enclosing every on-screen window,
/// so with more than one display the image covered all of them and was then
/// squeezed into one screen's dimensions.
///
/// - Returns: A JSON text item with `ok`, `width`, `height`, followed by an
///   `image/png` item holding the base64 PNG.
/// - Throws: `ComputerUseError.screenshotFailed` when there is no main screen,
///   the display bounds are empty, or any capture/encoding step fails.
private func cuaScreenshotResult() throws -> [[String: Any]] {
    guard let screen = NSScreen.main else { throw ComputerUseError.screenshotFailed }

    // Map the NSScreen to its Core Graphics display so the capture rectangle
    // is in the global display coordinate space the capture call expects.
    let screenNumber = screen.deviceDescription[NSDeviceDescriptionKey("NSScreenNumber")] as? NSNumber
    let displayID = screenNumber?.uint32Value ?? CGMainDisplayID()
    let captureBounds = CGDisplayBounds(displayID)
    // NOTE(review): image pixels line up with the absolute coordinates used by
    // the cua_* tools only when this display's origin is (0, 0) — confirm for
    // setups where the focused screen is not the primary display.
    guard !captureBounds.isEmpty,
          let cgImage = CGWindowListCreateImage(captureBounds, .optionOnScreenOnly, kCGNullWindowID, [.bestResolution]) else {
        throw ComputerUseError.screenshotFailed
    }

    let logicalWidth = Int(screen.frame.width)
    let logicalHeight = Int(screen.frame.height)
    guard let rep = NSBitmapImageRep(
        bitmapDataPlanes: nil,
        pixelsWide: logicalWidth,
        pixelsHigh: logicalHeight,
        bitsPerSample: 8,
        samplesPerPixel: 4,
        hasAlpha: true,
        isPlanar: false,
        colorSpaceName: .deviceRGB,
        bytesPerRow: 0,
        bitsPerPixel: 0
    ) else {
        throw ComputerUseError.screenshotFailed
    }
    rep.size = NSSize(width: logicalWidth, height: logicalHeight)
    guard let context = NSGraphicsContext(bitmapImageRep: rep) else {
        throw ComputerUseError.screenshotFailed
    }

    // Draw the full-resolution capture into the logical-size bitmap so one
    // image pixel corresponds to one screen point.
    NSGraphicsContext.saveGraphicsState()
    NSGraphicsContext.current = context
    NSImage(cgImage: cgImage, size: NSSize(width: cgImage.width, height: cgImage.height))
        .draw(in: NSRect(x: 0, y: 0, width: logicalWidth, height: logicalHeight))
    NSGraphicsContext.restoreGraphicsState()

    guard let png = rep.representation(using: .png, properties: [:]) else {
        throw ComputerUseError.screenshotFailed
    }
    return [
        ["type": "text", "text": jsonString(["ok": true, "width": logicalWidth, "height": logicalHeight]) ?? "{\"ok\":true}"],
        ["type": "image", "data": png.base64EncodedString(), "mimeType": "image/png"],
    ]
}
/// Converts the raw `path` argument of `cua_drag` into screen points.
///
/// Each entry may be a two-element array `[x, y]` or an object with `x` and
/// `y` keys. Entries of any other shape, or whose coordinates `valueAsDouble`
/// cannot read, are skipped individually instead of discarding the whole path.
///
/// - Parameter raw: The undecoded `path` value, if any.
/// - Returns: The parsed points in input order; empty when `raw` is not an array.
private func parsePointPath(_ raw: Any?) -> [CGPoint] {
    guard let entries = raw as? [Any] else { return [] }
    return entries.compactMap { entry -> CGPoint? in
        let rawX: Any
        let rawY: Any
        if let pair = entry as? [Any], pair.count >= 2 {
            rawX = pair[0]
            rawY = pair[1]
        } else if let object = entry as? [String: Any], let objectX = object["x"], let objectY = object["y"] {
            rawX = objectX
            rawY = objectY
        } else {
            return nil
        }
        guard let x = valueAsDouble(rawX), let y = valueAsDouble(rawY) else { return nil }
        return CGPoint(x: x, y: y)
    }
}
/// Reads an integer argument, accepting integers, floating-point numbers
/// (truncated toward zero), and decimal integer strings.
///
/// - Parameters:
///   - args: Decoded tool arguments.
///   - key: Argument name to look up.
/// - Returns: The integer value, or `nil` when the key is absent, has another
///   type, is not a parseable integer string, or is a floating-point value
///   that is NaN, infinite, or outside the range of `Int`.
private func intArg(_ args: [String: Any], _ key: String) -> Int? {
    if let value = args[key] as? Int { return value }
    if let value = args[key] as? Double {
        // `Int(value)` traps on NaN, infinity, and out-of-range magnitudes;
        // `Int(exactly:)` returns nil for those instead.
        return Int(exactly: value.rounded(.towardZero))
    }
    if let value = args[key] as? String { return Int(value) }
    return nil
}
String { + if value == "true" { return true } + if value == "false" { return false } + } + return nil + } +} diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/Types.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/Types.swift new file mode 100644 index 0000000000..a1498a411e --- /dev/null +++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/Types.swift @@ -0,0 +1,220 @@ +import AppKit +import ApplicationServices + +struct ElementFrame: Sendable { + let x: Int + let y: Int + let width: Int + let height: Int + + var center: CGPoint { + CGPoint(x: x + width / 2, y: y + height / 2) + } + + var dictionary: [String: Any] { + ["x": x, "y": y, "width": width, "height": height] + } +} + +struct AXElementState: Sendable { + let enabled: Bool? + let focused: Bool? + let selected: Bool? + let expanded: Bool? + let checked: Bool? + + var dictionary: [String: Any] { + var result: [String: Any] = [:] + if let enabled { result["enabled"] = enabled } + if let focused { result["focused"] = focused } + if let selected { result["selected"] = selected } + if let expanded { result["expanded"] = expanded } + if let checked { result["checked"] = checked } + return result + } +} + +struct AXElementCapabilities: Sendable { + let canPress: Bool + let canFocus: Bool + let canScroll: Bool + let canAdjust: Bool + let canSetValue: Bool + let actions: [String] + + var dictionary: [String: Any] { + [ + "press": canPress, + "focus": canFocus, + "scroll": canScroll, + "adjust": canAdjust, + "setValue": canSetValue, + "actions": actions, + ] + } +} + +struct SemanticAXElement: Identifiable, Sendable { + let id: Int + let ref: String + let role: String + let label: String + let value: String? 
/// Describes how a captured screenshot maps onto screen coordinates.
///
/// `capturedBounds` is the screen rectangle that was captured; `imageWidth`
/// and `imageHeight` are the dimensions of the resulting image.
struct ScreenshotMetadata: Sendable {
    let imageWidth: Int
    let imageHeight: Int
    let capturedBounds: CGRect

    /// Screen units per image pixel along x.
    var scaleX: CGFloat { Self.scale(extent: capturedBounds.width, pixels: imageWidth) }
    /// Screen units per image pixel along y.
    var scaleY: CGFloat { Self.scale(extent: capturedBounds.height, pixels: imageHeight) }

    /// Returns `extent / pixels`, or 1 when either side is degenerate.
    ///
    /// A zero pixel count gives an infinite scale and a zero extent gives a
    /// zero scale. Both make `toImage`/`toScreen` return non-finite
    /// coordinates, which trap as soon as a caller converts them with `Int(...)`.
    private static func scale(extent: CGFloat, pixels: Int) -> CGFloat {
        guard pixels > 0, extent > 0, extent.isFinite else { return 1 }
        return extent / CGFloat(pixels)
    }

    /// Maps an image-space position to the screen coordinate space.
    func toScreen(imageX: Double, imageY: Double) -> CGPoint {
        CGPoint(
            x: capturedBounds.origin.x + imageX * scaleX,
            y: capturedBounds.origin.y + imageY * scaleY
        )
    }

    /// Maps a screen-space point to image coordinates (inverse of `toScreen`).
    func toImage(point: CGPoint) -> CGPoint {
        CGPoint(
            x: (point.x - capturedBounds.origin.x) / scaleX,
            y: (point.y - capturedBounds.origin.y) / scaleY
        )
    }

    /// JSON-serializable form; the bounds are truncated to integers.
    var dictionary: [String: Any] {
        [
            "imageWidth": imageWidth,
            "imageHeight": imageHeight,
            "capturedBounds": [
                "x": Int(capturedBounds.origin.x),
                "y": Int(capturedBounds.origin.y),
                "width": Int(capturedBounds.width),
                "height": Int(capturedBounds.height),
            ],
        ]
    }
}
+ let screenshotData: Data + let screenshotMimeType: String + let screenshotMeta: ScreenshotMetadata + let records: [AXElementRecord] + let strictMode: Bool + let backgroundActivated: Bool + + var elements: [SemanticAXElement] { + records.map(\.semantic) + } +} + +enum ExecutionPath: String { + case accessibility = "accessibility" + case backgroundCGEvent = "background_cgevent" + case foregroundCGEvent = "foreground_cgevent" + case none = "none" +} + +struct ActionMetadata: Sendable { + let ok: Bool + let path: ExecutionPath + let strictMode: Bool + let backgroundSafe: Bool + let fallbackUsed: Bool + let message: String + + var dictionary: [String: Any] { + [ + "ok": ok, + "path": path.rawValue, + "strictMode": strictMode, + "backgroundSafe": backgroundSafe, + "fallbackUsed": fallbackUsed, + "message": message, + ] + } +} + +enum ComputerUseError: LocalizedError { + case accessibilityDenied + case screenshotFailed + case appNotFound(String) + case noFrontmostApplication + case noWindow(String) + case noSnapshot + case invalidElement(String) + case strictModeViolation(String) + case eventSourceFailed + case eventCreationFailed + case unknownKey(String) + + var errorDescription: String? { + switch self { + case .accessibilityDenied: + return "Accessibility permission is not granted." + case .screenshotFailed: + return "Screenshot capture failed." + case .appNotFound(let name): + return "App '\(name)' is not running." + case .noFrontmostApplication: + return "No frontmost application found." + case .noWindow(let app): + return "No usable window found for \(app)." + case .noSnapshot: + return "No snapshot is available. Call snapshot first." + case .invalidElement(let ref): + return "Element \(ref) was not found in the last semantic snapshot." + case .strictModeViolation(let reason): + return "Strict background mode rejected the action: \(reason)" + case .eventSourceFailed: + return "Failed to create CGEventSource." 
+ case .eventCreationFailed: + return "Failed to create CGEvent." + case .unknownKey(let key): + return "Unknown key: \(key)" + } + } +} diff --git a/packages/handsfree/native/HandsFree/Sources/ComputerUse/main.swift b/packages/handsfree/native/HandsFree/Sources/ComputerUse/main.swift new file mode 100644 index 0000000000..b2b66077ae --- /dev/null +++ b/packages/handsfree/native/HandsFree/Sources/ComputerUse/main.swift @@ -0,0 +1,17 @@ +/// HandsFreeComputerUse: semantic AX and background-safe macOS computer use. +/// +/// The runtime is MCP-independent. This binary exposes it over a small stdio +/// adapter because existing agent clients already speak MCP. + +import Foundation + +setbuf(stdout, nil) + +let args = CommandLine.arguments +if args.count >= 2 && args[1] == "mcp" { + let server = MCPServer() + await server.run() +} else { + fputs("Usage: HandsFreeComputerUse mcp\n", stderr) + exit(1) +} diff --git a/packages/handsfree/package.json b/packages/handsfree/package.json new file mode 100644 index 0000000000..6c6c281778 --- /dev/null +++ b/packages/handsfree/package.json @@ -0,0 +1,23 @@ +{ + "name": "@openwork/handsfree", + "private": true, + "version": "0.1.0", + "description": "macOS semantic AX and background computer-use runtime for OpenWork", + "license": "MIT", + "type": "module", + "bin": { + "openwork-handsfree-computer-use": "bin/openwork-handsfree-computer-use.mjs" + }, + "scripts": { + "check": "pnpm run check:js && pnpm run check:native", + "check:js": "node --check bin/openwork-handsfree-computer-use.mjs && node --check src/cua-runner.mjs && node --check src/realtime-tools.mjs", + "build:native": "swift build --package-path native/HandsFree -c release --product HandsFreeComputerUse", + "check:native": "swift build --package-path native/HandsFree --product HandsFreeComputerUse" + }, + "files": [ + "bin", + "src", + "native", + "README.md" + ] +} diff --git a/packages/handsfree/src/cua-runner.mjs b/packages/handsfree/src/cua-runner.mjs new file 
/**
 * Runs a computer-use loop: sends the task to the Responses API, executes the
 * actions of each returned `computer_call` through `callTool`, and feeds a
 * fresh screenshot back until the model stops requesting actions.
 *
 * @param {object} options
 * @param {string} options.task Plain-language task; coerced to a string.
 * @param {string} options.apiKey API key sent as a Bearer token. Required.
 * @param {(name: string, args: object) => Promise<unknown>} options.callTool
 *   Invokes a tool by name (`display_info`, `cua_screenshot`, `cua_*`). Required.
 * @param {(event: object) => void} [options.onProgress] Receives `start`,
 *   `turn`, `message`, and `action` events.
 * @param {AbortSignal} [options.signal] Cancels the loop.
 * @param {string} [options.model] Model name; defaults to `CUA_DEFAULT_MODEL`.
 * @param {number} [options.maxTurns] Turn limit; defaults to `CUA_MAX_TURNS`.
 * @returns {Promise<{ok: boolean, messages: string[], turns: number, aborted?: boolean, truncated?: boolean}>}
 *   `aborted` is set when the signal fired, `truncated` when the turn limit
 *   was reached.
 * @throws {Error} When `apiKey` or `callTool` is missing, the API answers with
 *   a non-2xx status or an empty output, or no screenshot could be captured.
 */
export async function runCuaLoop({
  task,
  apiKey,
  callTool,
  onProgress,
  signal,
  model = CUA_DEFAULT_MODEL,
  maxTurns = CUA_MAX_TURNS,
}) {
  if (!apiKey?.trim()) throw new Error("OpenAI API key required for computer use.");
  if (typeof callTool !== "function") throw new Error("callTool is required.");

  const display = await callTool("display_info", {});
  const reported = parseToolText(display);
  // The tool may answer with a payload that has no dimensions (for example an
  // error object). Fall back to the defaults then, not only when parsing fails.
  const displayInfo = Number.isFinite(reported?.width) && Number.isFinite(reported?.height)
    ? reported
    : { width: 1440, height: 900 };
  onProgress?.({ kind: "start", width: displayInfo.width, height: displayInfo.height });

  const items = [{ role: "user", content: String(task ?? "") }];
  const messages = [];

  for (let turn = 0; turn < maxTurns; turn += 1) {
    if (signal?.aborted) return { ok: true, messages, turns: turn, aborted: true };
    onProgress?.({ kind: "turn", turn: turn + 1 });

    let response;
    let result;
    try {
      response = await fetch("https://api.openai.com/v1/responses", {
        method: "POST",
        headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json" },
        body: JSON.stringify({ model, input: items, tools: [{ type: "computer" }] }),
        signal,
      });
      if (response.ok) result = await response.json();
    } catch (error) {
      // Cancelling while the request is in flight rejects the fetch. Report it
      // the same way as the other abort checks instead of leaking the error.
      if (signal?.aborted) return { ok: true, messages, turns: turn, aborted: true };
      throw error;
    }

    if (!response.ok) {
      const errorText = await response.text().catch(() => "");
      throw new Error(`CUA API error ${response.status}: ${errorText.slice(0, 300)}`);
    }

    const output = result.output || [];
    if (!output.length) throw new Error("No output from CUA model.");
    items.push(...output);

    let computerCall = null;
    for (const item of output) {
      if (item.type === "message") {
        const text = item.content?.map((part) => part.text || "").join("") || "";
        if (text) {
          messages.push(text);
          onProgress?.({ kind: "message", text });
        }
      }
      if (item.type === "computer_call") computerCall = item;
    }

    // No action requested: the model is done.
    if (!computerCall) return { ok: true, messages, turns: turn + 1 };

    // Accept both the batched `actions` array and a single `action`.
    const actions = computerCall.actions || (computerCall.action ? [computerCall.action] : []);
    for (const action of actions) {
      if (signal?.aborted) return { ok: true, messages, turns: turn + 1, aborted: true };
      // A screenshot is always taken after the batch, so skip explicit requests.
      if (action.type === "screenshot") continue;
      onProgress?.({ kind: "action", ...summarizeAction(action) });
      await executeCuaAction(callTool, action);
      await delay(150);
    }

    const screenshot = await callTool("cua_screenshot", {});
    const image = extractImage(screenshot);
    if (!image) throw new Error("Could not capture screenshot after action.");

    items.push({
      type: "computer_call_output",
      call_id: computerCall.call_id,
      acknowledged_safety_checks: computerCall.pending_safety_checks || [],
      output: { type: "input_image", image_url: `data:image/png;base64,${image}` },
    });
  }

  return { ok: true, messages, turns: maxTurns, truncated: true };
}
/**
 * Builds the compact progress payload for one model action.
 *
 * @param {{type: string, x?: number, y?: number, text?: unknown}} action
 * @returns {{type: string, x: unknown, y: unknown, text: unknown, desc: string}}
 *   `text` is the first 60 characters of a sliceable `action.text` (otherwise
 *   `undefined`); `desc` is a one-line label with the position and a 30
 *   character text preview when present.
 */
function summarizeAction(action) {
  const { type, x, y, text } = action;
  const position = x != null ? ` (${x},${y})` : "";

  let preview = "";
  if (text) {
    // `text` is model output and may not be a string. Coerce before slicing so
    // the label never throws, matching the guarded `text` field below.
    const printable = typeof text === "string" ? text : String(text);
    preview = ` "${printable.slice(0, 30)}"`;
  }

  return {
    type,
    x,
    y,
    text: text?.slice?.(0, 60),
    desc: `${type}${position}${preview}`,
  };
}
+- For actions that send messages or modify data, confirm content briefly before executing.`; + +export function openAIRealtimeTools() { + return [ + functionTool("use_computer", "Control the Mac to complete a visual or UI task using screenshots and native input.", { + task: { type: "string", description: "Plain-language task to complete on the computer." }, + }, ["task"]), + functionTool("type_text", "Type exact text into the focused input field.", { + text: { type: "string", description: "Exact text to type." }, + }, ["text"]), + functionTool("press_key", "Press a key combo such as return, tab, escape, command+k, or command+shift+a.", { + combo: { type: "string", description: "Key combo string." }, + }, ["combo"]), + functionTool("launch_app", "Launch a macOS app by name.", { + name: { type: "string", description: "App name." }, + }, ["name"]), + functionTool("activate_app", "Bring a running macOS app to the foreground.", { + name: { type: "string", description: "App name." }, + }, ["name"]), + functionTool("list_apps", "List running macOS applications."), + functionTool("clipboard_read", "Read the macOS clipboard as text."), + functionTool("clipboard_write", "Write text to the macOS clipboard.", { + text: { type: "string", description: "Text to copy." }, + }, ["text"]), + functionTool("open_url", "Open a URL in a browser.", { + url: { type: "string", description: "URL to open." }, + app: { type: "string", description: "Optional browser app name." }, + }, ["url"]), + functionTool("mcp_list_servers", "List connected MCP servers."), + functionTool("mcp_list_tools", "List tools on a connected MCP server before calling unfamiliar tools.", { + serverName: { type: "string", description: "MCP server name." }, + }, ["serverName"]), + functionTool("mcp_call_tool", "Call a tool on a connected MCP server.", { + serverName: { type: "string", description: "MCP server name." }, + toolName: { type: "string", description: "Tool name." 
/**
 * Creates one function-tool definition.
 *
 * @param {string} name Tool name.
 * @param {string} description What the tool does.
 * @param {object} [properties] JSON-schema properties of the arguments.
 * @param {string[]} [required] Names of mandatory arguments.
 * @returns {object} Definition with `type: "function"` and a closed
 *   (`additionalProperties: false`) object schema under `parameters`.
 */
function functionTool(name, description, properties = {}, required = []) {
  const parameters = {
    type: "object",
    properties,
    required,
    additionalProperties: false,
  };
  const definition = { type: "function", name, description };
  definition.parameters = parameters;
  return definition;
}