Use this when your agent streams response chunks back to a caller in real time while keeping the span open for the full duration of the stream. Pass { streaming: true } (TypeScript) or streaming=True (Python) to tell the wrapper not to auto-close the span when the function returns. Then call ctx.complete(output) once the full output is assembled.
The cleanest pattern with the Vercel AI SDK is to call ctx.complete() inside onFinish and return the data stream response:
import { registerOTel, agent } from "@uselemma/tracing";
import { streamText } from "ai";
import { openai } from "@ai-sdk/openai";

registerOTel();

const wrapped = agent("my-agent", async (input: string, ctx) => {
  const result = await streamText({
    model: openai("gpt-4o"),
    prompt: input,
    experimental_telemetry: { isEnabled: true },
    onFinish({ text }) {
      ctx.complete(text); // closes the span with the full assembled output
    },
  });
  return result.toDataStreamResponse();
}, { streaming: true });

export async function handleRequest(userMessage: string) {
  const { result } = await wrapped(userMessage);
  return result; // a Response that streams chunks to the client
}
Key points:
  • Pass { streaming: true } / streaming=True so the wrapper knows not to auto-close when the function returns.
  • Call ctx.complete(output) with the assembled output once the stream ends.
  • Wait for the wrapped invocation to finish before relying on runId or assuming the span is closed.
  • If you stream manually (for example via a queue or a ReadableStream) instead of using the AI SDK, apply the same pattern with whatever your framework uses to write SSE events.
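For the manual-streaming case, the essential move is the same: accumulate chunks as you forward them and call `ctx.complete()` only once the stream is exhausted. Below is a minimal sketch using a plain ReadableStream; `streamWithCompletion` is a hypothetical helper for illustration, and the `Ctx` type stands in for the real `ctx` object the `agent()` wrapper provides.

```typescript
// Minimal sketch: stream chunks to a caller while assembling the full
// output, then invoke complete() once the stream ends. The Ctx type
// below is an assumption standing in for the wrapper-provided ctx.
type Ctx = { complete: (output: string) => void };

function streamWithCompletion(
  chunks: string[],
  ctx: Ctx
): ReadableStream<string> {
  let assembled = "";
  let i = 0;
  return new ReadableStream<string>({
    pull(controller) {
      if (i < chunks.length) {
        const chunk = chunks[i++];
        assembled += chunk;        // accumulate the full output
        controller.enqueue(chunk); // forward this chunk to the caller
      } else {
        ctx.complete(assembled);   // close the span with the assembled output
        controller.close();        // then end the stream
      }
    },
  });
}
```

Note that `complete()` fires before the stream signals `done`, so a consumer that awaits the final read can rely on the span having been closed.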