Streaming
Consume the SSE stream with the OpenAI SDKs in Python and JavaScript.
For the wire-format details, see API → Streaming.
Python
from openai import OpenAI
client = OpenAI(
    base_url="https://litellm.tensorloop.tech/v1",
    api_key="tl_...",
)

# With stream=True the call returns an iterable of chunks rather than a
# single completed response.
stream = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Stream a haiku about caching."}],
    stream=True,
)

text = ""
for chunk in stream:
    # A chunk can arrive with an empty choices list (e.g. the usage-only
    # chunk sent when usage reporting is requested); skip it instead of
    # raising IndexError on choices[0].
    if not chunk.choices:
        continue
    # delta.content is None on chunks that carry no text, hence the fallback.
    delta = chunk.choices[0].delta.content or ""
    text += delta
    print(delta, end="", flush=True)
print()
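print(f"\n--- final ({len(text)} chars) ---")
end="" stops print from adding a newline after every token, and flush=True writes each token out immediately instead of leaving it in the output buffer, so each token appears as it arrives.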
JavaScript / TypeScript
import OpenAI from "openai";
// Configure the SDK with the custom base URL; the key is read from the environment.
const client = new OpenAI({
  baseURL: "https://litellm.tensorloop.tech/v1",
  apiKey: process.env.TENSORLOOP_KEY!,
});

// With stream: true the awaited result is consumed as an async iterable of chunks.
const stream = await client.chat.completions.create({
  model: "gpt-4o-mini",
  messages: [{ role: "user", content: "Stream a haiku about caching." }],
  stream: true,
});

// Echo every fragment as soon as it arrives and accumulate the full text.
let text = "";
for await (const { choices } of stream) {
  const piece = choices[0]?.delta?.content ?? "";
  process.stdout.write(piece);
  text += piece;
}
process.stdout.write(`\n--- final (${text.length} chars) ---\n`);
Raw fetch (no SDK)
If you're in an environment without the OpenAI SDK (Cloudflare Workers without bundling, Deno, browsers proxying through a backend):
const res = await fetch("https://litellm.tensorloop.tech/v1/chat/completions", {
  method: "POST",
  headers: {
    Authorization: `Bearer ${process.env.TENSORLOOP_KEY}`,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    model: "gpt-4o-mini",
    stream: true,
    messages: [{ role: "user", content: "Stream a haiku about caching." }],
  }),
});

// Fail loudly on a non-2xx or body-less response: the loop below skips every
// line that does not start with "data: ", so an error body would otherwise
// be swallowed without any output.
if (!res.ok || !res.body) {
  throw new Error(`Request failed: ${res.status} ${await res.text()}`);
}

const reader = res.body.getReader();
const decoder = new TextDecoder();
let buffer = "";

// The buffer trick handles the case where a data: line is split across two
// fetch reads — which happens often. Only complete lines are parsed; the
// trailing partial line is carried over into the next read.
read: while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  // stream: true keeps a multi-byte character that straddles two reads intact.
  buffer += decoder.decode(value, { stream: true });
  const lines = buffer.split("\n");
  buffer = lines.pop() ?? "";
  for (const line of lines) {
    if (!line.startsWith("data: ")) continue;
    const payload = line.slice(6).trim();
    // `return` is not allowed at module top level (where top-level await
    // lives), so leave both loops with a labeled break instead.
    if (payload === "[DONE]") break read;
    const chunk = JSON.parse(payload);
    process.stdout.write(chunk.choices?.[0]?.delta?.content ?? "");
  }
}
Forwarding a stream from a Next.js route
When proxying a stream from your backend to a browser client:
/**
 * Forwards a chat-completions request to the upstream API and streams the
 * response back to the caller.
 *
 * Passing upstream.body straight through preserves backpressure. The browser
 * then consumes it with fetch().body.getReader(); EventSource only issues GET
 * requests, so it cannot call this POST handler.
 *
 * @param req - Incoming request; its JSON body is forwarded upstream with
 *   `stream: true` forced on.
 * @returns The upstream stream with SSE headers on success, or the upstream
 *   error body with its original status code on failure.
 */
export async function POST(req: Request): Promise<Response> {
  const body = await req.json();
  const upstream = await fetch(
    "https://litellm.tensorloop.tech/v1/chat/completions",
    {
      method: "POST",
      headers: {
        Authorization: `Bearer ${process.env.TENSORLOOP_KEY}`,
        "Content-Type": "application/json",
      },
      // Spread first so a caller-supplied `stream: false` cannot disable streaming.
      body: JSON.stringify({ ...body, stream: true }),
    },
  );

  // Relay upstream failures with their real status; wrapping them in a
  // 200 text/event-stream response would hide the error from the client.
  if (!upstream.ok) {
    return new Response(upstream.body, {
      status: upstream.status,
      headers: {
        "Content-Type":
          upstream.headers.get("Content-Type") ?? "application/json",
      },
    });
  }

  return new Response(upstream.body, {
    headers: {
      "Content-Type": "text/event-stream",
      "Cache-Control": "no-cache, no-transform",
      Connection: "keep-alive",
    },
  });
}
Disable buffering
Add X-Accel-Buffering: no to the response headers if your CDN or proxy is
buffering the stream and delivering it all at once instead of token by token. Cloudflare honors it for Workers responses.