How to Stream AI Responses in Real-Time
Implement streaming for Claude and GPT responses to improve user experience.
Jay Banlasan
The AI Systems Guy
Streaming AI responses means the user sees words appearing as they are generated, instead of waiting for the entire response to complete before anything shows up. Streaming AI API responses in real time is the difference between an interface that feels instant and one that feels slow. For responses that take 5 to 15 seconds to complete, streaming cuts the perceived wait time dramatically. Every customer-facing AI feature I build uses streaming.
The implementation is nearly identical across OpenAI, Anthropic, and most other providers. You pass stream=True, then iterate over the response chunks instead of waiting for the full response.
What You Need Before Starting
- API key for OpenAI or Anthropic
- Python 3.9+ with the `openai` or `anthropic` SDK
- A web framework (Flask or FastAPI) if you need browser streaming
- Basic understanding of generators and async/await
Step 1: Stream a Response in the Terminal (OpenAI)
import os
from openai import OpenAI
from dotenv import load_dotenv
# Load OPENAI_API_KEY (and other secrets) from a local .env file.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def stream_response(prompt: str, system_prompt: str | None = None) -> str:
    """Stream a GPT response, printing tokens to stdout as they arrive.

    Args:
        prompt: The user message to send.
        system_prompt: Optional system instruction prepended to the
            conversation. (Annotated ``str | None``; the original implicit
            ``str = None`` is invalid per PEP 484.)

    Returns:
        The full response text accumulated from the streamed tokens.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    print("Response: ", end="", flush=True)
    # Collect tokens in a list and join once at the end; repeated string
    # concatenation in a loop is quadratic in the worst case.
    tokens: list[str] = []
    # stream=True makes the SDK yield incremental chunks; the context
    # manager closes the underlying HTTP connection when iteration ends.
    with client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=True,
    ) as stream:
        for chunk in stream:
            token = chunk.choices[0].delta.content
            if token is not None:
                print(token, end="", flush=True)  # Print without newline
                tokens.append(token)
    print()  # Add newline at end
    return "".join(tokens)
# Smoke test: stream a short answer and report its length afterwards.
demo_text = stream_response(
    "Explain the business case for AI automation in 3 paragraphs.",
    system_prompt="Be concise and practical.",
)
print(f"\nTotal length: {len(demo_text)} characters")
Step 2: Stream a Claude Response
import anthropic
# Anthropic client; reads ANTHROPIC_API_KEY loaded earlier via load_dotenv().
claude = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
def stream_claude(prompt: str, system_prompt: str | None = None) -> str:
    """Stream a Claude response with token-by-token output.

    Args:
        prompt: The user message to send.
        system_prompt: Optional system instruction passed via the separate
            top-level ``system`` parameter (Anthropic's API does not take a
            system role inside ``messages``).

    Returns:
        The full response text accumulated from the streamed chunks.
    """
    kwargs = {
        "model": "claude-opus-4-5",
        "max_tokens": 1000,
        "messages": [{"role": "user", "content": prompt}],
    }
    if system_prompt:
        kwargs["system"] = system_prompt

    print("Claude: ", end="", flush=True)
    # Accumulate in a list and join once; avoids quadratic string `+=`.
    pieces: list[str] = []
    # messages.stream() handles the SSE wire protocol; text_stream yields
    # only the text deltas.
    with claude.messages.stream(**kwargs) as stream:
        for text in stream.text_stream:
            print(text, end="", flush=True)
            pieces.append(text)
    print()
    return "".join(pieces)
# Demo: stream a short, list-formatted Claude answer to the terminal.
result = stream_claude(
    "List 5 specific ways a marketing agency could use AI to save 10 hours per week.",
    system_prompt="Be specific. One line per item. No preamble."
)
Step 3: Stream to a Web Browser with Flask Server-Sent Events
This is the pattern I use for web-based AI interfaces. SSE (Server-Sent Events) lets the server push chunks to the browser as they arrive:
pip install flask
from flask import Flask, request, Response, stream_with_context
import json
app = Flask(__name__)
# Dedicated client instance used by the web examples below.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@app.route("/stream", methods=["POST"])
def stream_endpoint():
    """SSE endpoint: forward OpenAI tokens to the browser as they arrive."""
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed JSON body; fall back to an empty dict so .get() stays safe.
    data = request.get_json(silent=True) or {}
    user_message = data.get("message", "")

    def generate():
        """Yield SSE frames (``data: <json>\\n\\n``) per streamed token."""
        with openai_client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": user_message}],
            stream=True
        ) as stream:
            for chunk in stream:
                token = chunk.choices[0].delta.content
                if token is not None:
                    # SSE format: data: <payload>\n\n
                    yield f"data: {json.dumps({'token': token})}\n\n"
        # Signal completion
        yield f"data: {json.dumps({'done': True})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
# Single-page demo UI. The JS buffers partial SSE frames: a network read can
# end mid-frame (or mid multi-byte character), so we decode with
# {stream: true}, split on the "\n\n" frame delimiter, and carry the trailing
# partial frame over to the next read. The payload is extracted with
# slice(6) rather than replace('data: ', ''), which would also mangle any
# token that happened to contain the substring "data: ".
HTML = """
<!DOCTYPE html>
<html>
<head><title>Streaming AI</title></head>
<body>
<textarea id="input" rows="4" cols="60" placeholder="Ask anything..."></textarea><br>
<button onclick="sendMessage()">Send</button>
<div id="output" style="margin-top:20px; white-space:pre-wrap; font-family:monospace;"></div>
<script>
function sendMessage() {
    const message = document.getElementById('input').value;
    const output = document.getElementById('output');
    output.textContent = '';
    fetch('/stream', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({message})
    }).then(response => {
        const reader = response.body.getReader();
        const decoder = new TextDecoder();
        let buffer = '';
        function read() {
            reader.read().then(({done, value}) => {
                if (done) return;
                buffer += decoder.decode(value, {stream: true});
                const frames = buffer.split('\\n\\n');
                buffer = frames.pop();  // keep incomplete trailing frame
                frames.filter(f => f.startsWith('data: ')).forEach(frame => {
                    const data = JSON.parse(frame.slice(6));
                    if (data.token) output.textContent += data.token;
                });
                read();
            });
        }
        read();
    });
}
</script>
</body>
</html>
"""
@app.route("/")
def index():
    """Serve the streaming demo page (the HTML string defined above)."""
    return HTML
if __name__ == "__main__":
    # debug=True enables the auto-reloader; port 5001 avoids clashing with
    # anything already bound to Flask's default port 5000.
    app.run(debug=True, port=5001)
Step 4: Stream with FastAPI for Async Performance
FastAPI handles streaming more cleanly at higher concurrency:
pip install fastapi uvicorn
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import asyncio
# Separate app object so the Flask and FastAPI examples can coexist.
fastapi_app = FastAPI()
class ChatRequest(BaseModel):
    """Request body for the FastAPI /stream endpoint."""

    message: str
    # Explicit Optional: the original implicit-Optional (``str = None``) is
    # invalid per PEP 484, and Pydantic v2 rejects a null value for a plain
    # ``str`` field when a client actually sends one.
    system_prompt: str | None = None
async def openai_stream_generator(message: str, system_prompt: str | None = None):
    """Async generator yielding SSE frames from an OpenAI streamed completion.

    Uses ``AsyncOpenAI`` so each chunk is awaited and control returns to the
    event loop between chunks. The original iterated the *sync* client inside
    an ``async def``, which blocks the event loop for the entire completion
    and defeats FastAPI's concurrency.

    Yields:
        ``data: {"token": ...}\\n\\n`` frames per chunk, then a final
        ``data: {"done": true}\\n\\n`` frame.
    """
    # Local import keeps the snippet self-contained; the openai package is
    # already a dependency of this file.
    from openai import AsyncOpenAI

    async_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": message})

    stream = await async_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=True,
    )
    # ``async for`` awaits each chunk, so no explicit asyncio.sleep(0) is
    # needed to yield control to the event loop.
    async for chunk in stream:
        token = chunk.choices[0].delta.content
        if token is not None:
            yield f"data: {json.dumps({'token': token})}\n\n"
    yield f"data: {json.dumps({'done': True})}\n\n"
@fastapi_app.post("/stream")
async def stream_chat(request: ChatRequest):
    """SSE endpoint: relay OpenAI tokens to the client as they arrive."""
    token_stream = openai_stream_generator(request.message, request.system_prompt)
    return StreamingResponse(token_stream, media_type="text/event-stream")
Run with:
uvicorn your_file:fastapi_app --reload --port 8000
Step 5: Capture the Full Response While Streaming
Sometimes you need to stream to the user AND capture the full text for logging:
from typing import Generator
def stream_and_capture(prompt: str) -> Generator[str, None, str]:
    """Yield response tokens for streaming AND return the full text.

    The complete response is the generator's *return value* — i.e. the
    ``StopIteration.value`` — matching the ``Generator[str, None, str]``
    annotation. (The original never returned it, and its docstring suggested
    a nonexistent ``gen.send(None)`` retrieval.)

    Usage:
        gen = stream_and_capture("Your prompt")
        try:
            while True:
                print(next(gen), end="", flush=True)
        except StopIteration as stop:
            complete = stop.value
        # Or, inside another generator: complete = yield from gen

    Args:
        prompt: The user message to send.

    Returns:
        The full response text, via StopIteration.value.
    """
    messages = [{"role": "user", "content": prompt}]
    full_text = []
    with openai_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=True,
    ) as stream:
        for chunk in stream:
            token = chunk.choices[0].delta.content
            if token:
                full_text.append(token)
                yield token
    # After iteration completes, full_text contains everything.
    complete = "".join(full_text)
    # Log or process the complete response here.
    print(f"\n[Logged {len(complete)} chars]")
    return complete
What to Build Next
- Add a "stop generation" button on the frontend by aborting the fetch request
- Buffer partial sentences before displaying to avoid showing incomplete mid-word chunks
- Combine streaming with function calling so tool results appear inline as they complete
Related Reading
- How to Build a Multi-Turn Conversation with Claude - Streaming works naturally with conversation history
- How to Create Your First AI Chatbot with OpenAI - Apply streaming to the chatbot interface
- How to Handle AI API Rate Limits Gracefully - Streaming connections count against rate limits the same as regular calls
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment