Custom Computers
The Agent SDK supports defining custom computer handlers using a simple dictionary interface. This enables integration with custom automation backends, testing frameworks, or specialized computer control systems.
Example: Defining a Custom Computer Handler
import asyncio
from PIL import Image
# Define your custom computer functions
async def take_screenshot():
    """Capture the screen for the agent (placeholder implementation).

    A custom implementation may return a PIL Image, raw bytes, or a base64
    string; this stub hands back a blank white 1920x1080 RGB image.
    """
    blank_frame = Image.new('RGB', (1920, 1080), color='white')
    return blank_frame
# Create dict-based computer handler - only 'screenshot' is required
custom_computer = {
    'screenshot': take_screenshot,  # required

    # everything below is optional
    'environment': 'linux',  # linux, mac, windows, browser
    'dimensions': (1920, 1080),  # (width, height)
    # `button` defaults to "left" so the handler also works when it is called
    # with coordinates only, matching the usual click(x, y, button="left") shape.
    'click': lambda x, y, button="left": print(
        f"Clicking at ({x}, {y}) with {button} button"
    ),
}
You can then use this as a tool for your agent:
from agent import ComputerAgent
# NOTE(review): confirm this model snapshot supports computer use; newer
# Claude snapshots may be required by the provider.
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20240620",
    tools=[custom_computer],
)


async def run_dict_example():
    """Run one agent task against the dict-based computer handler."""
    # Agent will automatically convert dict to agent.computers.CustomComputerHandler
    await agent.run("Take a screenshot and click at coordinates 100, 200")


# `await` is only legal inside a coroutine, so a plain script has to hand the
# coroutine to an event loop. In a notebook or async REPL (where a loop is
# already running) use `await run_dict_example()` instead.
if __name__ == "__main__":
    asyncio.run(run_dict_example())
Class-Based Implementation
For more complex implementations, you can create a custom class by inheriting from `AsyncComputerHandler`:
from agent.computers import AsyncComputerHandler
from PIL import Image
from typing import Literal, List, Dict, Union, Optional
class MyCustomComputer(AsyncComputerHandler):
    """Skeleton computer handler; replace each stub with calls to your backend."""

    def __init__(self):
        # Set up the connection to your custom computer interface here.
        pass

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Report which kind of environment this handler controls."""
        ...

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen size as a (width, height) pair."""
        ...

    async def screenshot(self) -> str:
        """Capture the screen and return it as a base64-encoded string."""
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at (x, y) using the given mouse button ("left" by default)."""
        ...

    async def double_click(self, x: int, y: int) -> None:
        """Double-click at (x, y)."""
        ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at (x, y) by the given horizontal and vertical amounts."""
        ...

    async def type(self, text: str) -> None:
        """Type the given text."""
        ...

    async def wait(self, ms: int = 1000) -> None:
        """Pause for `ms` milliseconds (1000 by default)."""
        ...

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to (x, y)."""
        ...

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press a key combination, given as one string or a list of strings."""
        ...

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along `path`, a list of points.

        Each point is a dict of ints; presumably keyed by "x" and "y" --
        confirm against the handler protocol.
        """
        ...

    async def get_current_url(self) -> str:
        """Return the current URL (meaningful for browser environments)."""
        ...

    # ==== Anthropic Action Space ====

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Press the left mouse button down; coordinates are optional."""
        ...

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Release the left mouse button; coordinates are optional."""
        ...
# Use with agent
custom_computer = MyCustomComputer()

# NOTE(review): confirm this model snapshot supports computer use; newer
# Claude snapshots may be required by the provider.
agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20240620",
    tools=[custom_computer],
)


async def run_class_example():
    """Run one agent task against the class-based computer handler."""
    await agent.run("Take a screenshot and click at coordinates 100, 200")


# `await` is only legal inside a coroutine, so a plain script has to hand the
# coroutine to an event loop. In a notebook or async REPL (where a loop is
# already running) use `await run_class_example()` instead.
if __name__ == "__main__":
    asyncio.run(run_class_example())