Streaming UI: Building Responsive Interfaces on Top of LLM Token Streams
From SSE to React Suspense — how I build chat and agent UIs that feel instant, handle interruption gracefully, and don't fall apart when the network does.
A code snippet from this post was tested
Node.js v22.22.3 · Verified June 10, 2026
A code snippet from this post was tested
Node.js v22.22.3 · Verified June 10, 2026
Logic from this post, adapted into a runnable form and executed by the publishing pipeline.
node verify.mjs
Snippet
/**
 * Folds one streamed message into the list of rendered segments.
 *
 * - `token`: appended to the trailing text segment, or starts a new text segment.
 * - `status` / `tool_call`: appended as a new segment.
 * - `tool_result`: attached to the oldest pending `tool_call` for the same tool.
 * - anything else: ignored.
 *
 * The input array and its segments are never mutated; a new array is always returned.
 *
 * @param {Array<object>} initialSegments - Segments accumulated so far.
 * @param {{type: string, payload: *}} message - One decoded stream message.
 * @returns {Array<object>} The segment list after applying `message`.
 */
function processStreamedMessages(initialSegments, message) {
  // In a real app, crypto.randomUUID() would be used.
  // For deterministic testing, ids increment from the last segment's numeric id.
  const nextId = () => {
    const lastId = Number.parseInt(initialSegments.at(-1)?.id, 10);
    // An empty list or a non-numeric last id falls back to the position, never "NaN".
    return String(Number.isNaN(lastId) ? initialSegments.length + 1 : lastId + 1);
  };

  switch (message.type) {
    case 'token': {
      const lastSegment = initialSegments.at(-1);
      if (lastSegment?.type === 'text') {
        return [
          ...initialSegments.slice(0, -1),
          { ...lastSegment, content: (lastSegment.content ?? '') + message.payload },
        ];
      }
      return [...initialSegments, { id: nextId(), type: 'text', content: message.payload }];
    }

    case 'status':
      return [...initialSegments, { id: nextId(), type: 'status', content: message.payload }];

    case 'tool_call':
      return [
        ...initialSegments,
        { id: nextId(), type: 'tool_call', toolCall: message.payload, isPending: true },
      ];

    case 'tool_result': {
      // Resolve only the oldest pending call for this tool: when the same tool is
      // invoked twice before any result arrives, each result must settle one call.
      const pendingIndex = initialSegments.findIndex(
        (seg) =>
          seg.type === 'tool_call' && seg.isPending && seg.toolCall?.tool === message.payload.tool,
      );
      if (pendingIndex === -1) {
        return [...initialSegments];
      }
      return initialSegments.map((seg, index) =>
        index === pendingIndex ? { ...seg, toolResult: message.payload, isPending: false } : seg,
      );
    }

    default:
      return [...initialSegments];
  }
}
// --- Test Cases ---
// Each case feeds messages through processStreamedMessages and logs the resulting
// segment list as JSON. Cases 1-7 keep building on the same `currentSegments` value.
// Test case 1: Initial tokens
let currentSegments = [];
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: 'Hello' });
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: ' world' });
console.log('Test Case 1 (initial tokens):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"}]
// Test case 2: Status message after tokens
currentSegments = processStreamedMessages(currentSegments, { type: 'status', payload: 'Thinking about tools...' });
console.log('Test Case 2 (status after tokens):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."}]
// Test case 3: Tool call
currentSegments = processStreamedMessages(currentSegments, { type: 'tool_call', payload: { tool: 'search_web', input: 'weather in London' } });
console.log('Test Case 3 (tool call):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true}]
// Test case 4: More tokens (the last segment is a tool call, so a new text segment is started)
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: 'The current ' });
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: 'weather is' });
console.log('Test Case 4 (more tokens):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true},{"id":"4","type":"text","content":"The current weather is"}]
// Test case 5: Tool result for the pending tool call
currentSegments = processStreamedMessages(currentSegments, { type: 'tool_result', payload: { tool: 'search_web', output: 'Sunny with 20°C' } });
console.log('Test Case 5 (tool result):', JSON.stringify(currentSegments));
// Expected (isPending keeps its original key position; toolResult is added after it):
// [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is"}]
// Test case 6: More text after the tool result; the trailing segment is still text, so it is extended
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: ' Sunny and 20 degrees Celsius.' });
console.log('Test Case 6 (text after tool result):', JSON.stringify(currentSegments));
// Expected: Same as above for first three, then the fourth combines: "content":"The current weather is Sunny and 20 degrees Celsius."
// Test case 7: One more token on the trailing text segment.
// processStreamedMessages does no token filtering of its own (empty-payload filtering is left
// to the caller), so this '.' is appended verbatim and the content ends in "Celsius..".
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: '.' });
console.log('Test Case 7 (final token):', JSON.stringify(currentSegments));
// Test case 8: Handling multiple tool calls in sequence
// Note: the `id` and `isPending` fields on these messages are not read by
// processStreamedMessages (it only looks at `type` and `payload`); ids are generated internally.
let multipleToolsSegments = [];
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { id: "1", type: 'tool_call', payload: { tool: 'toolA', input: 'inputA' }, isPending: true });
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { id: "2", type: 'tool_call', payload: { tool: 'toolB', input: 'inputB' }, isPending: true });
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { type: 'tool_result', payload: { tool: 'toolA', output: 'outputA' } });
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { type: 'tool_result', payload: { tool: 'toolB', output: 'outputB' } });
console.log('Test Case 8 (multiple tool calls):', JSON.stringify(multipleToolsSegments));
// Expected: Both toolA and toolB to be marked as not pending with their respective results.
// Test case 9: Full flow from a fresh segment list (text, tool call, text, tool result, text)
let resetSegments = [];
resetSegments = processStreamedMessages(resetSegments, { type: 'token', payload: 'Start. ' });
resetSegments = processStreamedMessages(resetSegments, { type: 'tool_call', payload: { tool: 'fetch_data', input: 'id123' } });
resetSegments = processStreamedMessages(resetSegments, { type: 'token', payload: 'Processing data. ' });
resetSegments = processStreamedMessages(resetSegments, { type: 'tool_result', payload: { tool: 'fetch_data', output: 'Data for id123' } });
resetSegments = processStreamedMessages(resetSegments, { type: 'token', payload: 'Done.' });
console.log('Test Case 9 (full flow):', JSON.stringify(resetSegments));
Captured output
Test Case 1 (initial tokens): [{"id":"1","type":"text","content":"Hello world"}]
Test Case 2 (status after tokens): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."}]
Test Case 3 (tool call): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true}]
Test Case 4 (more tokens): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true},{"id":"4","type":"text","content":"The current weather is"}]
Test Case 5 (tool result): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is"}]
Test Case 6 (text after tool result): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is Sunny and 20 degrees Celsius."}]
Test Case 7 (final token): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is Sunny and 20 degrees Celsius.."}]
Test Case 8 (multiple tool calls): [{"id":"1","type":"tool_call","toolCall":{"tool":"toolA","input":"inputA"},"isPending":false,"toolResult":{"tool":"toolA","output":"outputA"}},{"id":"2","type":"tool_call","toolCall":{"tool":"toolB","input":"inputB"},"isPending":false,"toolResult":{"tool":"toolB","output":"outputB"}}]
Test Case 9 (full flow): [{"id":"1","type":"text","content":"Start. "},{"id":"2","type":"tool_call","toolCall":{"tool":"fetch_data","input":"id123"},"isPending":false,"toolResult":{"tool":"fetch_data","output":"Data for id123"}},{"id":"3","type":"text","content":"Processing data. Done."}]
Streaming UI: Building Responsive Interfaces on Top of LLM Token Streams
The world has changed. Large Language Models (LLMs) aren't just a backend curiosity anymore; they're the new foundation for interaction. Building UIs that can keep up with the real-time, token-by-token output of an LLM isn't just a nice-to-have – it's a necessity. As a Senior Software Engineer with a decade of experience bridging complex backends with intuitive frontend experiences, I've spent the better part of the last year refining how I build these "Streaming UIs."
This post is about sharing my practical, battle-tested approach to constructing chat and agent interfaces that feel instant, gracefully handle the inherent unpredictability of LLM responses, and stay robust even when the network wobbles. Forget janky loading spinners; we're talking about a fluid, dynamic user experience.
The Problem: LLMs Don't Speak HTTP/1.1 Fluently
Traditional web requests are "request-response." You ask, the server computes, and then eventually sends back a complete answer. LLMs, however, stream their output. They generate token by token, word by word. If your frontend waits for the entire response, your users are staring at a blank screen wondering if the AI is still "thinking" or just stuck. This latency is frustrating and antithetical to a good user experience.
Imagine a user asking your agent to "Summarize the last 5 emails and draft a reply for the third one." The agent first has to process that request, then fetch emails, then summarize, and then draft. Each step could involve LLM calls. If we wait for the final draft, the user is left in the dark for a long time. This is where streaming becomes critical.
The Foundation: Server-Sent Events (SSE)
My preferred mechanism for receiving LLM streams from the backend is Server-Sent Events (SSE). While WebSockets offer bi-directional communication, SSE is simpler for one-way server-to-client streaming, leveraging standard HTTP/1.1 and offering automatic reconnection. It's perfectly suited for receiving continuous token streams.
Here's a simplified example of how you might consume an SSE stream in TypeScript:
// utils/sse.ts
/** One JSON message decoded from the `data:` field of a server-sent event. */
interface SseMessage {
// Discriminator the consumer branches on, e.g. 'token', 'status', 'tool_call', 'tool_result', 'complete'.
type: string;
payload: string; // Or a more structured object if applicable
}
/**
 * Opens an EventSource on `url` and forwards each well-formed message to `onMessage`.
 *
 * A message is well-formed when its JSON body has a truthy `type` and a defined
 * `payload`; anything else is logged and skipped.
 *
 * The function is synchronous: it returns the disposer directly, so callers can
 * use the result as a cleanup function (e.g. from a React effect). Callers that
 * `await` the result keep working, since awaiting a non-Promise yields the value.
 *
 * @param url - SSE endpoint to connect to.
 * @param onMessage - Called with every well-formed message.
 * @param onError - Called when the EventSource reports an error; the connection is
 *   closed afterwards so the browser does not keep retrying a dead backend.
 * @param onClose - Called once when this helper closes the connection (after an
 *   error, on page unload, or through the returned disposer).
 * @returns A disposer that closes the connection. Calling it more than once is safe.
 */
export function consumeSseStream(
  url: string,
  onMessage: (message: SseMessage) => void,
  onError: (event: Event) => void,
  onClose: () => void
): () => void {
  const eventSource = new EventSource(url);
  let isClosed = false;

  // Single teardown path so the socket, the window listener and onClose are
  // each handled exactly once regardless of what triggered the close.
  const close = (): void => {
    if (isClosed) {
      return;
    }
    isClosed = true;
    eventSource.close();
    window.removeEventListener('beforeunload', close);
    onClose();
  };

  eventSource.onmessage = (event: MessageEvent) => {
    // Only the parse is guarded: an exception thrown by the consumer's onMessage
    // should not be reported as a malformed message.
    let data: Partial<SseMessage> | null;
    try {
      data = JSON.parse(event.data);
    } catch (e) {
      console.error("Failed to parse SSE message:", e);
      return;
    }

    if (data?.type && data.payload !== undefined) {
      onMessage(data as SseMessage);
    } else {
      console.warn("Received malformed SSE data:", data);
    }
  };

  eventSource.onerror = (event: Event) => {
    console.error("SSE Error:", event);
    try {
      onError(event);
    } finally {
      close(); // Important to close on error to prevent infinite retries if backend is down
    }
  };

  eventSource.onopen = () => {
    console.log("SSE connection opened.");
  };

  // Graceful shutdown on reload/close
  window.addEventListener('beforeunload', close);

  return () => {
    close();
    console.log("SSE connection closed manually.");
  };
}
On the backend, you'd configure your endpoint to send Content-Type: text/event-stream and then push data: {json_payload}\n\n messages as tokens become available.
Bringing it to Life: React and Streaming State
Consuming the stream is one thing; rendering it effectively in React is another. This is where reactive state management and a bit of suspense (pun intended) come into play.
Progressive Rendering with useState
The most straightforward way is to simply append received tokens to a piece of state.
// components/ChatResponse.tsx
import React, { useState, useEffect, useCallback } from 'react';
import { consumeSseStream } from '../utils/sse'; // Assume this utility is defined
/** Props accepted by the ChatResponse component. */
interface ChatResponseProps {
sessionId: string; // Interpolated into the stream URL: /api/chat/stream/${sessionId}
startStream: boolean; // Control when to start/stop the stream
}
/**
 * Renders the reply for one chat session, growing it token by token as SSE
 * messages arrive.
 *
 * While `startStream` is true the component keeps a stream open for
 * `sessionId`; when either prop changes (or the component unmounts) the stream
 * is closed and the accumulated text is cleared.
 */
const ChatResponse: React.FC<ChatResponseProps> = ({ sessionId, startStream }) => {
  const [response, setResponse] = useState<string>('');
  const [isStreaming, setIsStreaming] = useState<boolean>(false);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    if (!startStream) {
      setResponse(''); // Reset on new session/no stream
      setIsStreaming(false);
      return;
    }

    // Per-run flags: callbacks from a stream whose effect was already torn down
    // must not write into the state of the next session, and the connection
    // ending after a 'complete' message is the normal end of a stream.
    let isDisposed = false;
    let hasCompleted = false;

    setIsStreaming(true);
    setError(null);

    const cleanup = consumeSseStream(
      `/api/chat/stream/${sessionId}`, // Adjust your API endpoint
      (message) => {
        if (isDisposed) {
          return;
        }
        if (message.type === 'token') {
          // Skip empty tokens only. Newline tokens are real content: dropping
          // them would glue the last word of one line to the first of the next.
          if (message.payload) {
            setResponse(prev => prev + message.payload);
          }
        } else if (message.type === 'complete') {
          hasCompleted = true;
          setIsStreaming(false);
        }
        // Other message types ('status', 'tool_call', ...) are not rendered here.
      },
      (event) => {
        // EventSource reports the server closing the connection as an error;
        // after 'complete' that is expected and must not be shown as a failure.
        if (isDisposed || hasCompleted) {
          return;
        }
        setIsStreaming(false);
        setError('Failed to connect or stream received an error. Please try again.');
        console.error('Streaming error:', event);
      },
      () => {
        // Stream finished or closed by server
        if (isDisposed) {
          return;
        }
        setIsStreaming(false);
        console.log('Stream closed.');
      }
    );

    return () => {
      isDisposed = true;
      cleanup(); // Disconnect on unmount or when `startStream` becomes false
      setResponse(''); // Clear response on unmount or stream end
      setIsStreaming(false);
    };
  }, [sessionId, startStream]); // Dependency array

  return (
    <div className="chat-response">
      <div className="message-content">
        {response.length > 0 ? response : (isStreaming ? "Thinking..." : "Ready to chat.")}
        {isStreaming && <span className="streaming-cursor">█</span>}
      </div>
      {error && <div className="error-message">{error}</div>}
    </div>
  );
};
export default ChatResponse;
This basic pattern immediately gives users feedback. They see the text appearing word by word, which feels significantly faster than waiting for the entire response.
Enhanced Interactivity with Intermediate Steps
However, LLM operations are rarely just "text streaming." Complex agents perform tool calls, fetch data, and reason. We want to show these intermediate steps to the user. My SSE protocol often includes message types for these stages:
// Example SSE messages
data: {"type": "status", "payload": "Thinking about your request..."}\n\n
data: {"type": "tool_call", "payload": {"tool": "search_web", "input": "latest stock price of TSLA"}}\n\n
data: {"type": "token", "payload": "According"}\n\n
data: {"type": "token", "payload": " to "}\n\n
// ... more tokens
data: {"type": "tool_result", "payload": {"tool": "search_web", "output": "TSLA is trading at $180.50"}}\n\n
data: {"type": "token", "payload": "Tesla ("}\n\n
// ... more tokens
data: {"type": "complete", "payload": {"final_response": "..."}}\n\n
To render this, you'd maintain an array of "message segments" where each segment could be plain text, a tool call, or a status update.
// Inside ChatResponse, modifying state management
/** One renderable piece of an assistant turn: a run of text, a status line, or a tool invocation. */
interface MessageSegment {
id: string; // Unique ID for React keys
// NOTE(review): the handler shown with this interface never creates a 'tool_result' segment;
// results are stored on the matching 'tool_call' segment instead. Confirm whether the member is needed.
type: 'text' | 'status' | 'tool_call' | 'tool_result';
content?: string; // For 'text' or 'status'
toolCall?: { tool: string; input: string | object }; // For 'tool_call'
toolResult?: { tool: string; output: string | object }; // Set on a 'tool_call' segment once its result arrives
isPending?: boolean; // True on a 'tool_call' segment until its result arrives
}
const [segments, setSegments] = useState<MessageSegment[]>([]);

// ... inside onMessage callback
// Every update goes through the functional form of setSegments so that messages
// arriving in quick succession are applied to the latest list, not a stale one.
if (message.type === 'token') {
  setSegments(prevSegments => {
    const lastSegment = prevSegments[prevSegments.length - 1];
    // Consecutive tokens extend the trailing text segment; a token that follows
    // a status or tool call starts a new one.
    if (lastSegment && lastSegment.type === 'text') {
      return prevSegments.slice(0, -1).concat({ ...lastSegment, content: (lastSegment.content || '') + message.payload });
    }
    return prevSegments.concat({ id: crypto.randomUUID(), type: 'text', content: message.payload });
  });
} else if (message.type === 'status') {
  setSegments(prevSegments => prevSegments.concat({ id: crypto.randomUUID(), type: 'status', content: message.payload }));
} else if (message.type === 'tool_call') {
  // SseMessage types payload as string; tool messages carry a structured object.
  const toolCall = message.payload as unknown as NonNullable<MessageSegment['toolCall']>;
  setSegments(prevSegments => prevSegments.concat({ id: crypto.randomUUID(), type: 'tool_call', toolCall, isPending: true }));
} else if (message.type === 'tool_result') {
  const toolResult = message.payload as unknown as NonNullable<MessageSegment['toolResult']>;
  setSegments(prevSegments => {
    // Resolve only the oldest pending call for this tool. If the same tool is
    // called twice before a result arrives, each result settles one call
    // instead of the first result being copied onto both.
    const pendingIndex = prevSegments.findIndex(seg =>
      seg.type === 'tool_call' && seg.isPending && seg.toolCall?.tool === toolResult.tool
    );
    if (pendingIndex === -1) {
      return prevSegments; // Nothing to resolve; same reference lets React skip the re-render
    }
    return prevSegments.map((seg, index) =>
      index === pendingIndex
        ? { ...seg, toolResult, isPending: false }
        : seg
    );
  });
}
// Render segments array into UI components
This significantly improves transparency, letting users follow the agent's internal monologue and actions. They can see why the agent might be taking a moment, rather than just a generic "thinking" message.
Handling Interruption and Network Resilience
LLM streams can be long, and users might want to modify their prompt mid-stream, or the network might briefly drop.
User Interruption
Often, users want to stop a long-running generation. This requires a way to signal the backend. While SSE itself is unidirectional, you'd send a separate HTTP DELETE or POST request to your backend endpoint to signal cancellation. Your SSE consumer then gracefully cleans up.
// In ChatResponse or a parent component
/**
 * Stops the current generation: resets the local UI state right away, then asks
 * the backend to cancel the session. A failed cancellation is shown to the user
 * instead of being silently logged.
 */
const handleCancelStream = useCallback(async () => {
  setIsStreaming(false); // Optimistically update UI
  setResponse(''); // Clear current response
  setSegments([]); // Clear segments
  setError(null);
  console.log("Cancelling stream...");
  try {
    // Send cancellation to backend
    const cancelResponse = await fetch(`/api/chat/cancel/${sessionId}`, { method: 'POST' });
    // fetch only rejects on network failure; an HTTP error status still resolves,
    // so a rejected cancellation has to be detected explicitly.
    if (!cancelResponse.ok) {
      throw new Error(`Cancel request failed with status ${cancelResponse.status}`);
    }
    // NOTE(review): this does not close the SSE connection itself; that relies on
    // the streaming effect's cleanup running (e.g. the parent flipping
    // `startStream` to false). Confirm the caller does that.
  } catch (cancelError) {
    console.error("Failed to send cancellation signal:", cancelError);
    // The backend may still be generating, so tell the user the stop did not go through.
    setError('Could not stop the generation. Please try again.');
  }
}, [sessionId]);
// Add a button: <button onClick={handleCancelStream} disabled={!isStreaming}>Stop Generation</button>
Network Resilience
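SSE inherently offers reconnection: when a connection drops, the browser retries on its own, waiting for the interval set by the retry field in the SSE messages (or a browser default). It's still important to differentiate between transient network issues and a completely dead backend. My onError handler takes the conservative route: it closes the EventSource on the first error and sets an error message, which prevents endless retries against a truly unresponsive server. The trade-off is that closing the EventSource also switches off the browser's automatic reconnection, so a brief network blip ends the stream too. To ride out transient issues, either leave the EventSource open while its readyState is CONNECTING (the browser is already retrying) and close it only once it reports CLOSED, or keep the close-on-error behaviour and implement exponential backoff yourself around the EventSource instantiation within the useEffect.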
The Role of React Suspense (and its limitations)
While the title mentions React Suspense, streaming individual LLM tokens is not what Suspense was designed for. Suspense targets asynchronous data fetching that resolves to a complete value, allowing React to "suspend" rendering until that data is ready.
However, Suspense can be beneficial when integrating LLM responses with other asynchronous parts of your UI:
- Loading external components based on LLM output: Imagine the LLM suggests a specific chart type. You could lazy-load that chart component using React.lazy and Suspense.
- Fetching supporting data: If an agent's response contains references to data that needs to be fetched (e.g., "See details for user ID 123"), you could use a Suspense-enabled data fetching library (like React Query with its suspense: true option) to fetch User 123's profile after the LLM has generated the ID.
The direct-streaming of text is often best handled with useState and progressive rendering, as shown above, rather than trying to fit it into a Suspense boundary designed for "all or nothing" data loads. The goal is immediate feedback, not waiting for "readiness."
Architectural Flow for a Streaming LLM UI
Here's a simplified sequence of events for a typical prompt submission and streaming response.
sequenceDiagram
participant User
participant Frontend
participant BackendAPI
participant LLMService
User->>Frontend: Submits Prompt (e.g., "Summarize emails")
Frontend->>BackendAPI: POST /api/chat/session/start (gets sessionId)
BackendAPI->>Frontend: Response (sessionId)
Frontend->>BackendAPI: GET /api/chat/stream/{sessionId} (initiates SSE)
Note over Frontend: UI shows "Connecting..." or "Thinking..."
BackendAPI->>LLMService: Initial Prompt Call
LLMService-->>BackendAPI: Streams tokens/intermediate steps
loop Stream Tokens & Steps
BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'status', 'payload': 'Fetching emails...'}\n\n")
Frontend->>Frontend: Updates UI (shows status)
BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'tool_call', 'payload': {'tool': 'email_api', 'input': 'recent'}}\n\n")
Frontend->>Frontend: Updates UI (shows tool call)
BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'token', 'payload': 'Emails'}\n\n")
Frontend->>Frontend: Appends "Emails" to response
end
LLMService-->>BackendAPI: Final Response (or 'complete' signal)
BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'complete', 'payload': {'final_response': '...'}}\n\n")
Frontend->>Frontend: Marks stream as complete, possibly shows final state
Frontend->>User: Shows complete response
Conclusion
Building responsive UIs on top of LLM token streams is an exciting challenge that fundamentally changes how we think about frontend architecture. By leveraging Server-Sent Events for real-time delivery, stateful React components for progressive rendering, and carefully designed message protocols from the backend, we can create interfaces that are not only blazingly fast but also transparent and resilient. The key is to embrace the asynchronous, iterative nature of LLM generation from the ground up, rather than trying to shoehorn it into traditional request/response paradigms.
This approach ensures users are always informed, can react to the agent's progress, and never feel like they're waiting in the dark. It's about making the AI feel like a true co-pilot, communicating its thought process every step of the way.
Feel free to connect with me on LinkedIn or X (formerly Twitter) to discuss these patterns or anything else related to AI-powered frontend development!