2 changes: 0 additions & 2 deletions .github/workflows/tests.yml
@@ -84,7 +84,5 @@ jobs:
           enable-cache: true
       - name: Install dependencies
         run: make sync
-      - name: Install Python 3.9 dependencies
-        run: UV_PROJECT_ENVIRONMENT=.venv_39 uv sync --all-extras --all-packages --group dev
       - name: Run tests
         run: make old_version_tests

> **Member Author** (on the removed step): moved to makefile
3 changes: 2 additions & 1 deletion .gitignore
@@ -100,7 +100,8 @@ celerybeat.pid
 *.sage.py
 
 # Environments
-.env
+.python-version
+.env*
 .venv
 env/
 venv/

> **Member Author** (on the added entries): for local python 3.9 tests
3 changes: 2 additions & 1 deletion Makefile
@@ -39,7 +39,8 @@ snapshots-create:
 	uv run pytest --inline-snapshot=create
 
 .PHONY: old_version_tests
-old_version_tests:
+old_version_tests:
+	UV_PROJECT_ENVIRONMENT=.venv_39 uv sync --python 3.9 --all-extras --all-packages --group dev
 	UV_PROJECT_ENVIRONMENT=.venv_39 uv run --python 3.9 -m pytest
 
 .PHONY: build-docs
11 changes: 8 additions & 3 deletions examples/realtime/app/README.md
@@ -29,14 +29,19 @@ To use the same UI with your own agents, edit `agent.py` and ensure get_starting
 1. Click **Connect** to establish a realtime session
 2. Audio capture starts automatically - just speak naturally
 3. Click the **Mic On/Off** button to mute/unmute your microphone
-4. Watch the conversation unfold in the left pane
-5. Monitor raw events in the right pane (click to expand/collapse)
-6. Click **Disconnect** when done
+4. To send an image, enter an optional prompt and click **🖼️ Send Image** (select a file)
+5. Watch the conversation unfold in the left pane (image thumbnails are shown)
+6. Monitor raw events in the right pane (click to expand/collapse)
+7. Click **Disconnect** when done
 
 ## Architecture
 
 - **Backend**: FastAPI server with WebSocket connections for real-time communication
 - **Session Management**: Each connection gets a unique session with the OpenAI Realtime API
+- **Image Inputs**: The UI uploads images and the server forwards a
+  `conversation.item.create` event with `input_image` (plus optional `input_text`),
+  followed by `response.create` to start the model response. The messages pane
+  renders image bubbles for `input_image` content.
 - **Audio Processing**: 24kHz mono audio capture and playback
 - **Event Handling**: Full event stream processing with transcript generation
 - **Frontend**: Vanilla JavaScript with clean, responsive CSS
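
To make that flow concrete, here is a minimal sketch of the two raw client events involved, matching the **Image Inputs** bullet above (the data URL, prompt, and variable names are illustrative):

```python
# Sketch of the raw Realtime API events behind the image flow (illustrative values).
item_create = {
    "type": "conversation.item.create",
    "item": {
        "type": "message",
        "role": "user",
        "content": [
            {"type": "input_image", "image_url": "data:image/png;base64,...", "detail": "high"},
            {"type": "input_text", "text": "Please describe this image."},  # optional prompt
        ],
    },
}
response_create = {"type": "response.create"}  # asks the model to respond to the new item
```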
163 changes: 161 additions & 2 deletions examples/realtime/app/server.py
@@ -12,6 +12,8 @@
 from typing_extensions import assert_never
 
 from agents.realtime import RealtimeRunner, RealtimeSession, RealtimeSessionEvent
+from agents.realtime.config import RealtimeUserInputMessage
+from agents.realtime.model_inputs import RealtimeModelSendRawMessage
 
 # Import TwilioHandler class - handle both module and package use cases
 if TYPE_CHECKING:
@@ -64,6 +66,34 @@ async def send_audio(self, session_id: str, audio_bytes: bytes):
         if session_id in self.active_sessions:
             await self.active_sessions[session_id].send_audio(audio_bytes)
 
+    async def send_client_event(self, session_id: str, event: dict[str, Any]):
+        """Send a raw client event to the underlying realtime model."""
+        session = self.active_sessions.get(session_id)
+        if not session:
+            return
+        await session.model.send_event(
+            RealtimeModelSendRawMessage(
+                message={
+                    "type": event["type"],
+                    "other_data": {k: v for k, v in event.items() if k != "type"},
+                }
+            )
+        )
+
+    async def send_user_message(self, session_id: str, message: RealtimeUserInputMessage):
+        """Send a structured user message via the higher-level API (supports input_image)."""
+        session = self.active_sessions.get(session_id)
+        if not session:
+            return
+        await session.send_message(message)  # delegates to RealtimeModelSendUserInput path
+
+    async def interrupt(self, session_id: str) -> None:
+        """Interrupt current model playback/response for a session."""
+        session = self.active_sessions.get(session_id)
+        if not session:
+            return
+        await session.interrupt()
+
     async def _process_events(self, session_id: str):
         try:
             session = self.active_sessions[session_id]
@@ -101,7 +131,11 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
         elif event.type == "history_updated":
             base_event["history"] = [item.model_dump(mode="json") for item in event.history]
         elif event.type == "history_added":
-            pass
+            # Provide the added item so the UI can render incrementally.
+            try:
+                base_event["item"] = event.item.model_dump(mode="json")
+            except Exception:
+                base_event["item"] = None
         elif event.type == "guardrail_tripped":
             base_event["guardrail_results"] = [
                 {"name": result.guardrail.name} for result in event.guardrail_results
@@ -134,6 +168,7 @@ async def lifespan(app: FastAPI):
 @app.websocket("/ws/{session_id}")
 async def websocket_endpoint(websocket: WebSocket, session_id: str):
     await manager.connect(websocket, session_id)
+    image_buffers: dict[str, dict[str, Any]] = {}
     try:
         while True:
             data = await websocket.receive_text()
@@ -144,6 +179,124 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
                 int16_data = message["data"]
                 audio_bytes = struct.pack(f"{len(int16_data)}h", *int16_data)
                 await manager.send_audio(session_id, audio_bytes)
+            elif message["type"] == "image":
+                logger.info("Received image message from client (session %s).", session_id)
+                # Build a conversation.item.create with input_image (and optional input_text)
+                data_url = message.get("data_url")
+                prompt_text = message.get("text") or "Please describe this image."
+                if data_url:
+                    logger.info(
+                        "Forwarding image (structured message) to Realtime API (len=%d).",
+                        len(data_url),
+                    )
+                    user_msg: RealtimeUserInputMessage = {
+                        "type": "message",
+                        "role": "user",
+                        "content": (
+                            [
+                                {"type": "input_image", "image_url": data_url, "detail": "high"},
+                                {"type": "input_text", "text": prompt_text},
+                            ]
+                            if prompt_text
+                            else [
+                                {"type": "input_image", "image_url": data_url, "detail": "high"}
+                            ]
+                        ),
+                    }
+                    await manager.send_user_message(session_id, user_msg)
+                    # Acknowledge to client UI
+                    await websocket.send_text(
+                        json.dumps(
+                            {
+                                "type": "client_info",
+                                "info": "image_enqueued",
+                                "size": len(data_url),
+                            }
+                        )
+                    )
+                else:
+                    await websocket.send_text(
+                        json.dumps(
+                            {
+                                "type": "error",
+                                "error": "No data_url for image message.",
+                            }
+                        )
+                    )
+            elif message["type"] == "commit_audio":
+                # Force close the current input audio turn
+                await manager.send_client_event(session_id, {"type": "input_audio_buffer.commit"})
+            elif message["type"] == "image_start":
+                img_id = str(message.get("id"))
+                image_buffers[img_id] = {
+                    "text": message.get("text") or "Please describe this image.",
+                    "chunks": [],
+                }
+                await websocket.send_text(
+                    json.dumps({"type": "client_info", "info": "image_start_ack", "id": img_id})
+                )
+            elif message["type"] == "image_chunk":
+                img_id = str(message.get("id"))
+                chunk = message.get("chunk", "")
+                if img_id in image_buffers:
+                    image_buffers[img_id]["chunks"].append(chunk)
+                    if len(image_buffers[img_id]["chunks"]) % 10 == 0:
+                        await websocket.send_text(
+                            json.dumps(
+                                {
+                                    "type": "client_info",
+                                    "info": "image_chunk_ack",
+                                    "id": img_id,
+                                    "count": len(image_buffers[img_id]["chunks"]),
+                                }
+                            )
+                        )
+            elif message["type"] == "image_end":
+                img_id = str(message.get("id"))
+                buf = image_buffers.pop(img_id, None)
+                if buf is None:
+                    await websocket.send_text(
+                        json.dumps({"type": "error", "error": "Unknown image id for image_end."})
+                    )
+                else:
+                    data_url = "".join(buf["chunks"]) if buf["chunks"] else None
+                    prompt_text = buf["text"]
+                    if data_url:
+                        logger.info(
+                            "Forwarding chunked image (structured message) to Realtime API (len=%d).",
+                            len(data_url),
+                        )
+                        user_msg2: RealtimeUserInputMessage = {
+                            "type": "message",
+                            "role": "user",
+                            "content": (
+                                [
+                                    {"type": "input_image", "image_url": data_url, "detail": "high"},
+                                    {"type": "input_text", "text": prompt_text},
+                                ]
+                                if prompt_text
+                                else [
+                                    {"type": "input_image", "image_url": data_url, "detail": "high"}
+                                ]
+                            ),
+                        }
+                        await manager.send_user_message(session_id, user_msg2)
+                        await websocket.send_text(
+                            json.dumps(
+                                {
+                                    "type": "client_info",
+                                    "info": "image_enqueued",
+                                    "id": img_id,
+                                    "size": len(data_url),
+                                }
+                            )
+                        )
+                    else:
+                        await websocket.send_text(
+                            json.dumps({"type": "error", "error": "Empty image."})
+                        )
+            elif message["type"] == "interrupt":
+                await manager.interrupt(session_id)
 
     except WebSocketDisconnect:
         await manager.disconnect(session_id)
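
For manual testing, a minimal client sketch that drives the chunked upload path above (assumptions: the third-party `websockets` package, a server on `localhost:8000`, and an arbitrary session id, file name, and chunk size):

```python
import asyncio
import base64
import json
import uuid

import websockets  # assumed client library, not part of this app


async def send_image_chunked(path: str, prompt: str) -> None:
    # Encode the file as a data URL, the same format the browser UI produces.
    with open(path, "rb") as f:
        data_url = "data:image/png;base64," + base64.b64encode(f.read()).decode()
    img_id = str(uuid.uuid4())
    async with websockets.connect("ws://localhost:8000/ws/demo-session") as ws:
        await ws.send(json.dumps({"type": "image_start", "id": img_id, "text": prompt}))
        for i in range(0, len(data_url), 64_000):  # arbitrary chunk size
            await ws.send(
                json.dumps({"type": "image_chunk", "id": img_id, "chunk": data_url[i : i + 64_000]})
            )
        await ws.send(json.dumps({"type": "image_end", "id": img_id}))
        print(json.loads(await ws.recv()))  # first reply should be the image_start_ack


asyncio.run(send_image_chunked("photo.png", "Please describe this image."))
```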
@@ -160,4 +313,10 @@ async def read_index():
 if __name__ == "__main__":
     import uvicorn
 
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8000,
+        # Increased WebSocket frame size to comfortably handle image data URLs.
+        ws_max_size=16 * 1024 * 1024,
+    )
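
As a rough sanity check on that limit: base64 encodes 3 raw bytes as 4 characters, so a 16 MiB text frame holds roughly 12 MiB of raw image data before the data-URL prefix and JSON envelope are counted.

```python
# Rough capacity math for the 16 MiB frame limit (illustrative only).
frame_limit = 16 * 1024 * 1024        # ws_max_size above
max_raw_image = frame_limit * 3 // 4  # base64 inflates payloads by ~4/3
print(f"~{max_raw_image / 2**20:.0f} MiB of raw image bytes per frame")  # ~12 MiB
```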