Back to Blog
Frontend

Streaming UI: Building Responsive Interfaces on Top of LLM Token Streams

From SSE to React Suspense — how I build chat and agent UIs that feel instant, handle interruption gracefully, and don't fall apart when the network does.

Amit Shrivastava · June 10, 2026 · 10 min read

A code snippet from this post was tested

Node.js v22.22.3 · Verified June 10, 2026

Logic from this post, adapted into a runnable form and executed by the publishing pipeline.

node verify.mjs

Snippet

/**
 * Pure reducer that applies one streamed message to the list of UI segments.
 *
 * The input array and its segment objects are never mutated; a new array is
 * always returned (unchanged segments are reused by reference).
 *
 * Message handling:
 * - `token`: appended to the trailing `text` segment, or starts a new one.
 * - `status`: appended as a new `status` segment.
 * - `tool_call`: appended as a new pending `tool_call` segment.
 * - `tool_result`: attached to the OLDEST pending `tool_call` for the same
 *   tool. Only one call is resolved per result, so two in-flight calls to the
 *   same tool each receive their own result in arrival order.
 * - anything else: ignored (a shallow copy of the input is returned).
 *
 * @param {Array<object>} initialSegments - Current segments, in render order.
 * @param {{type: string, payload: *}} message - Streamed message. `payload` is
 *   a string for `token`/`status` and an object with a `tool` field for
 *   `tool_call`/`tool_result`.
 * @returns {Array<object>} The next list of segments.
 */
function processStreamedMessages(initialSegments, message) {
  const lastSegment = initialSegments[initialSegments.length - 1];

  // In a real app, crypto.randomUUID() would be used. For deterministic
  // testing the next id is one past the last segment's numeric id.
  const nextId = () =>
    String(lastSegment ? Number.parseInt(lastSegment.id, 10) + 1 : 1);

  switch (message.type) {
    case 'token': {
      if (lastSegment && lastSegment.type === 'text') {
        // Extend the trailing text segment instead of creating one per token.
        return [
          ...initialSegments.slice(0, -1),
          { ...lastSegment, content: (lastSegment.content || '') + message.payload },
        ];
      }
      return [...initialSegments, { id: nextId(), type: 'text', content: message.payload }];
    }

    case 'status':
      return [...initialSegments, { id: nextId(), type: 'status', content: message.payload }];

    case 'tool_call':
      return [
        ...initialSegments,
        { id: nextId(), type: 'tool_call', toolCall: message.payload, isPending: true },
      ];

    case 'tool_result': {
      // Resolve only the first pending call for this tool; later calls to the
      // same tool stay pending until their own result arrives.
      const targetIndex = initialSegments.findIndex(
        (seg) =>
          seg.type === 'tool_call' &&
          seg.isPending &&
          seg.toolCall.tool === message.payload.tool
      );
      return initialSegments.map((seg, index) =>
        index === targetIndex
          ? { ...seg, toolResult: message.payload, isPending: false }
          : seg
      );
    }

    default:
      return [...initialSegments];
  }
}


// --- Test Cases ---
// Each case feeds messages through processStreamedMessages and prints the
// resulting segments as JSON. Cases 1-7 build on the same `currentSegments`
// state, so their order matters.

// Test case 1: Initial tokens
let currentSegments = [];
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: 'Hello' });
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: ' world' });
console.log('Test Case 1 (initial tokens):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"}]

// Test case 2: Status message after tokens
currentSegments = processStreamedMessages(currentSegments, { type: 'status', payload: 'Thinking about tools...' });
console.log('Test Case 2 (status after tokens):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."}]

// Test case 3: Tool call
currentSegments = processStreamedMessages(currentSegments, { type: 'tool_call', payload: { tool: 'search_web', input: 'weather in London' } });
console.log('Test Case 3 (tool call):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true}]

// Test case 4: More tokens (the last segment is a tool call, so a new text segment is started)
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: 'The current ' });
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: 'weather is' });
console.log('Test Case 4 (more tokens):', JSON.stringify(currentSegments));
// Expected: [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true},{"id":"4","type":"text","content":"The current weather is"}]

// Test case 5: Tool result for the pending tool call
currentSegments = processStreamedMessages(currentSegments, { type: 'tool_result', payload: { tool: 'search_web', output: 'Sunny with 20°C' } });
console.log('Test Case 5 (tool result):', JSON.stringify(currentSegments));
// Expected (note "isPending" keeps its original key position, so it serializes before "toolResult"):
// [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is"}]

// Test case 6: Token after the tool result extends the trailing text segment
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: ' Sunny and 20 degrees Celsius.' });
console.log('Test Case 6 (text after tool result):', JSON.stringify(currentSegments));
// Expected: first three segments unchanged; the fourth becomes "content":"The current weather is Sunny and 20 degrees Celsius."

// Test case 7: One more (non-empty) token is appended to the same text segment.
// Note: the reducer itself does not filter empty payloads; in the component that
// filtering happens in the onMessage callback before state is updated.
currentSegments = processStreamedMessages(currentSegments, { type: 'token', payload: '.' });
console.log('Test Case 7 (final token):', JSON.stringify(currentSegments));
// Expected: fourth segment "content":"The current weather is Sunny and 20 degrees Celsius.."

// Test case 8: Handling multiple tool calls in sequence.
// Messages carry only `type` and `payload`; `id` and `isPending` are assigned by the reducer.
let multipleToolsSegments = [];
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { type: 'tool_call', payload: { tool: 'toolA', input: 'inputA' } });
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { type: 'tool_call', payload: { tool: 'toolB', input: 'inputB' } });
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { type: 'tool_result', payload: { tool: 'toolA', output: 'outputA' } });
multipleToolsSegments = processStreamedMessages(multipleToolsSegments, { type: 'tool_result', payload: { tool: 'toolB', output: 'outputB' } });
console.log('Test Case 8 (multiple tool calls):', JSON.stringify(multipleToolsSegments));
// Expected: Both toolA and toolB to be marked as not pending with their respective results.

// Test case 9: Full flow from a fresh state (text -> tool call -> text -> tool result -> text)
let resetSegments = [];
resetSegments = processStreamedMessages(resetSegments, { type: 'token', payload: 'Start. ' });
resetSegments = processStreamedMessages(resetSegments, { type: 'tool_call', payload: { tool: 'fetch_data', input: 'id123' } });
resetSegments = processStreamedMessages(resetSegments, { type: 'token', payload: 'Processing data. ' });
resetSegments = processStreamedMessages(resetSegments, { type: 'tool_result', payload: { tool: 'fetch_data', output: 'Data for id123' } });
resetSegments = processStreamedMessages(resetSegments, { type: 'token', payload: 'Done.' });
console.log('Test Case 9 (full flow):', JSON.stringify(resetSegments));
// Expected: text "Start. ", resolved fetch_data tool call, then text "Processing data. Done."

Captured output

Test Case 1 (initial tokens): [{"id":"1","type":"text","content":"Hello world"}]
Test Case 2 (status after tokens): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."}]
Test Case 3 (tool call): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true}]
Test Case 4 (more tokens): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":true},{"id":"4","type":"text","content":"The current weather is"}]
Test Case 5 (tool result): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is"}]
Test Case 6 (text after tool result): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is Sunny and 20 degrees Celsius."}]
Test Case 7 (final token): [{"id":"1","type":"text","content":"Hello world"},{"id":"2","type":"status","content":"Thinking about tools..."},{"id":"3","type":"tool_call","toolCall":{"tool":"search_web","input":"weather in London"},"isPending":false,"toolResult":{"tool":"search_web","output":"Sunny with 20°C"}},{"id":"4","type":"text","content":"The current weather is Sunny and 20 degrees Celsius.."}]
Test Case 8 (multiple tool calls): [{"id":"1","type":"tool_call","toolCall":{"tool":"toolA","input":"inputA"},"isPending":false,"toolResult":{"tool":"toolA","output":"outputA"}},{"id":"2","type":"tool_call","toolCall":{"tool":"toolB","input":"inputB"},"isPending":false,"toolResult":{"tool":"toolB","output":"outputB"}}]
Test Case 9 (full flow): [{"id":"1","type":"text","content":"Start. "},{"id":"2","type":"tool_call","toolCall":{"tool":"fetch_data","input":"id123"},"isPending":false,"toolResult":{"tool":"fetch_data","output":"Data for id123"}},{"id":"3","type":"text","content":"Processing data. Done."}]

Streaming UI: Building Responsive Interfaces on Top of LLM Token Streams

The world has changed. Large Language Models (LLMs) aren't just a backend curiosity anymore; they're the new foundation for interaction. Building UIs that can keep up with the real-time, token-by-token output of an LLM isn't just a nice-to-have – it's a necessity. As a Senior Software Engineer with a decade of experience bridging complex backends with intuitive frontend experiences, I've spent the better part of the last year refining how I build these "Streaming UIs."

This post is about sharing my practical, battle-tested approach to constructing chat and agent interfaces that feel instant, gracefully handle the inherent unpredictability of LLM responses, and stay robust even when the network wobbles. Forget janky loading spinners; we're talking about a fluid, dynamic user experience.

The Problem: LLMs Don't Speak HTTP/1.1 Fluently

Traditional web requests are "request-response." You ask, the server computes, and then eventually sends back a complete answer. LLMs, however, stream their output. They generate token by token, word by word. If your frontend waits for the entire response, your users are staring at a blank screen wondering if the AI is still "thinking" or just stuck. This latency is frustrating and antithetical to a good user experience.

Imagine a user asking your agent to "Summarize the last 5 emails and draft a reply for the third one." The agent first has to process that request, then fetch emails, then summarize, and then draft. Each step could involve LLM calls. If we wait for the final draft, the user is left in the dark for a long time. This is where streaming becomes critical.

The Foundation: Server-Sent Events (SSE)

My preferred mechanism for receiving LLM streams from the backend is Server-Sent Events (SSE). While WebSockets offer bi-directional communication, SSE is simpler for one-way server-to-client streaming, leveraging standard HTTP/1.1 and offering automatic reconnection. It's perfectly suited for receiving continuous token streams.

Here's a simplified example of how you might consume an SSE stream in TypeScript:

// utils/sse.ts

/** One decoded SSE message: a `type` discriminator plus its payload. */
interface SseMessage {
  type: string;
  payload: string; // Or a more structured object if applicable
}

/**
 * Opens an EventSource on `url` and forwards well-formed messages to the caller.
 *
 * This function is synchronous on purpose: callers need the returned cleanup
 * function immediately (e.g. as a React effect cleanup), not a Promise of it.
 *
 * @param url       SSE endpoint to connect to.
 * @param onMessage Called for every message whose JSON has a truthy `type` and a
 *                  defined `payload`; anything else is logged and dropped.
 * @param onError   Called when the EventSource reports an error. The connection
 *                  is closed first, so the browser will not auto-reconnect.
 * @param onClose   Called at most once, when the connection has been closed,
 *                  either after an error or via the returned cleanup function.
 * @returns A cleanup function that closes the connection and removes the
 *          `beforeunload` listener. Safe to call more than once.
 */
export function consumeSseStream(
  url: string,
  onMessage: (message: SseMessage) => void,
  onError: (event: Event) => void,
  onClose: () => void
) {
  const eventSource = new EventSource(url);
  let hasNotifiedClose = false;

  // Graceful shutdown on reload/close
  const handleBeforeUnload = () => {
    eventSource.close();
  };

  // Closes the connection and detaches the window listener so neither leaks.
  const shutdown = () => {
    eventSource.close();
    window.removeEventListener('beforeunload', handleBeforeUnload);
  };

  // Guarantees onClose fires only once, whichever path closes the stream.
  const notifyClosed = () => {
    if (hasNotifiedClose) {
      return;
    }
    hasNotifiedClose = true;
    onClose();
  };

  eventSource.onmessage = (event: MessageEvent) => {
    try {
      const data = JSON.parse(event.data);
      if (data.type && data.payload !== undefined) {
        onMessage(data as SseMessage);
      } else {
        console.warn("Received malformed SSE data:", data);
      }
    } catch (e) {
      console.error("Failed to parse SSE message:", e);
    }
  };

  eventSource.onerror = (event: Event) => {
    console.error("SSE Error:", event);
    shutdown(); // Important to close on error to prevent infinite retries if backend is down
    onError(event);
    notifyClosed();
  };

  eventSource.onopen = () => {
    console.log("SSE connection opened.");
  };

  window.addEventListener('beforeunload', handleBeforeUnload);

  return () => {
    shutdown();
    console.log("SSE connection closed manually.");
    notifyClosed();
  };
}

On the backend, you'd configure your endpoint to send Content-Type: text/event-stream and then push data: {json_payload}\n\n messages as tokens become available.

Bringing it to Life: React and Streaming State

Consuming the stream is one thing; rendering it effectively in React is another. This is where reactive state management and a bit of suspense (pun intended) come into play.

Progressive Rendering with useState

The most straightforward way is to simply append received tokens to a piece of state.

// components/ChatResponse.tsx
import React, { useState, useEffect, useCallback } from 'react';
import { consumeSseStream } from '../utils/sse'; // Assume this utility is defined

interface ChatResponseProps {
  sessionId: string;
  startStream: boolean; // Control when to start/stop the stream
}

/**
 * Renders the assistant's reply for one chat session as it streams in.
 *
 * While `startStream` is true the component subscribes to the session's SSE
 * endpoint, appends every non-empty `token` payload to the displayed text and
 * stops on a `complete` message. Changing `sessionId`, setting `startStream`
 * to false, or unmounting closes the connection and clears the text.
 */
const ChatResponse: React.FC<ChatResponseProps> = ({ sessionId, startStream }) => {
  const [response, setResponse] = useState<string>('');
  const [isStreaming, setIsStreaming] = useState<boolean>(false);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    if (!startStream) {
      setResponse(''); // Reset on new session/no stream
      setIsStreaming(false);
      return;
    }

    setIsStreaming(true);
    setError(null);

    // Set once this subscription is over (completed, failed, or torn down) so
    // late events from the connection are ignored.
    let isFinished = false;
    let closeStream: (() => void) | null = null;

    const finish = () => {
      isFinished = true;
      closeStream?.();
      closeStream = null;
    };

    const pendingClose = consumeSseStream(
      `/api/chat/stream/${sessionId}`, // Adjust your API endpoint
      (message) => {
        if (isFinished) {
          return;
        }
        // Assuming 'token' type messages carry the text
        if (message.type === 'token') {
          // LLMs can sometimes send empty tokens or just newlines; filter them
          if (message.payload && message.payload !== '\n') {
            setResponse(prev => prev + message.payload);
          }
        }
        // Handle other message types like 'status', 'tool_call', 'complete'
        if (message.type === 'complete') {
          setIsStreaming(false);
          // Close now: when the server ends the stream, EventSource reports it
          // as an `error` and would otherwise show a failure after a success.
          finish();
        }
      },
      (event) => {
        if (isFinished) {
          return;
        }
        setIsStreaming(false);
        setError('Failed to connect or stream received an error. Please try again.');
        console.error('Streaming error:', event);
      },
      () => {
        // Stream finished or closed by server
        setIsStreaming(false);
        console.log('Stream closed.');
      }
    );

    // consumeSseStream may hand back the close function directly or wrapped in
    // a Promise; Promise.resolve() normalises both so cleanup never calls a
    // Promise as if it were a function.
    Promise.resolve(pendingClose)
      .then((close) => {
        if (isFinished) {
          close(); // Torn down before the close function became available
        } else {
          closeStream = close;
        }
      })
      .catch((streamError) => {
        if (isFinished) {
          return;
        }
        setIsStreaming(false);
        setError('Failed to connect or stream received an error. Please try again.');
        console.error('Streaming error:', streamError);
      });

    return () => {
      finish(); // Disconnect on unmount or when `startStream` becomes false
      setResponse(''); // Clear response on unmount or stream end
      setIsStreaming(false);
    };
  }, [sessionId, startStream]); // Dependency array

  return (
    <div className="chat-response">
      <div className="message-content">
        {response.length > 0 ? response : (isStreaming ? "Thinking..." : "Ready to chat.")}
        {isStreaming && <span className="streaming-cursor">█</span>}
      </div>
      {error && <div className="error-message">{error}</div>}
    </div>
  );
};

export default ChatResponse;

This basic pattern immediately gives users feedback. They see the text appearing word by word, which feels significantly faster than waiting for the entire response.

Enhanced Interactivity with Intermediate Steps

However, LLM operations are rarely just "text streaming." Complex agents perform tool calls, fetch data, and reason. We want to show these intermediate steps to the user. My SSE protocol often includes message types for these stages:

// Example SSE messages
data: {"type": "status", "payload": "Thinking about your request..."}\n\n
data: {"type": "tool_call", "payload": {"tool": "search_web", "input": "latest stock price of TSLA"}}\n\n
data: {"type": "token", "payload": "According"}\n\n
data: {"type": "token", "payload": " to "}\n\n
// ... more tokens
data: {"type": "tool_result", "payload": {"tool": "search_web", "output": "TSLA is trading at $180.50"}}\n\n
data: {"type": "token", "payload": "Tesla ("}\n\n
// ... more tokens
data: {"type": "complete", "payload": {"final_response": "..."}}\n\n

To render this, you'd maintain an array of "message segments" where each segment could be plain text, a tool call, or a status update.

// Inside ChatResponse, modifying state management
// One renderable unit of the assistant's reply. Which optional fields are set
// depends on `type`. In the handler shown here a tool result is merged into the
// 'tool_call' segment it answers; no separate 'tool_result' segment is created.
interface MessageSegment {
  id: string; // Unique ID for React keys
  type: 'text' | 'status' | 'tool_call' | 'tool_result';
  content?: string; // For 'text' or 'status'
  toolCall?: { tool: string; input: string | object }; // For 'tool_call': the tool name and its input
  toolResult?: { tool: string; output: string | object }; // Result payload, attached to the 'tool_call' segment it answers
  isPending?: boolean; // For differentiating active tool calls: true until the result arrives
}

// Segments of the current response, in the order they should be rendered
const [segments, setSegments] = useState<MessageSegment[]>([]);

// ... inside onMessage callback
// Translate one streamed message into an update of the segment list.
if (message.type === 'token') {
  setSegments(prevSegments => {
    const lastSegment = prevSegments[prevSegments.length - 1];
    if (lastSegment && lastSegment.type === 'text') {
      // Extend the trailing text segment rather than creating one per token
      return prevSegments.slice(0, -1).concat({ ...lastSegment, content: (lastSegment.content || '') + message.payload });
    } else {
      return prevSegments.concat({ id: crypto.randomUUID(), type: 'text', content: message.payload });
    }
  });
} else if (message.type === 'status') {
  setSegments(prevSegments => prevSegments.concat({ id: crypto.randomUUID(), type: 'status', content: message.payload }));
} else if (message.type === 'tool_call') {
  setSegments(prevSegments => prevSegments.concat({ id: crypto.randomUUID(), type: 'tool_call', toolCall: message.payload as any, isPending: true }));
} else if (message.type === 'tool_result') {
  setSegments(prevSegments => {
    const result = message.payload as any;
    // Resolve only the OLDEST pending call for this tool. Matching every pending
    // call would give two in-flight calls to the same tool the first result and
    // leave the second result with nothing to attach to.
    const pendingIndex = prevSegments.findIndex(seg =>
      seg.type === 'tool_call' && seg.isPending && seg.toolCall?.tool === result.tool
    );
    if (pendingIndex === -1) {
      return prevSegments; // No matching pending call; keep state as-is
    }
    return prevSegments.map((seg, index) =>
      index === pendingIndex
        ? { ...seg, toolResult: result, isPending: false }
        : seg
    );
  });
}
// Render segments array into UI components

This significantly improves transparency, letting users follow the agent's internal monologue and actions. They can see why the agent might be taking a moment, rather than just a generic "thinking" message.

Handling Interruption and Network Resilience

LLM streams can be long, and users might want to modify their prompt mid-stream, or the network might briefly drop.

User Interruption

Often, users want to stop a long-running generation. This requires a way to signal the backend. While SSE itself is unidirectional, you'd send a separate HTTP DELETE or POST request to your backend endpoint to signal cancellation. Your SSE consumer then gracefully cleans up.

// In ChatResponse or a parent component
// Stops the current generation: resets the local UI state immediately, then
// asks the backend to cancel the session's stream.
// NOTE: this handler does not close the EventSource itself. The streaming
// effect's cleanup only runs when its dependencies change or the component
// unmounts, so the owner should also set `startStream` to false.
const handleCancelStream = useCallback(async () => {
  setIsStreaming(false); // Optimistically update UI
  setResponse('');          // Clear current response
  setSegments([]);          // Clear segments
  setError(null);
  console.log("Cancelling stream...");
  try {
    // Send cancellation to backend
    const cancelResponse = await fetch(`/api/chat/cancel/${sessionId}`, { method: 'POST' });
    // fetch only rejects on network failure; an HTTP error status still
    // resolves, so it has to be checked explicitly.
    if (!cancelResponse.ok) {
      throw new Error(`Cancel request failed with status ${cancelResponse.status}`);
    }
  } catch (cancelError) {
    console.error("Failed to send cancellation signal:", cancelError);
    // Potentially re-enable streaming if cancellation failed unexpectedly
  }
}, [sessionId]);

// Add a button: <button onClick={handleCancelStream} disabled={!isStreaming}>Stop Generation</button>

Network Resilience

SSE offers built-in reconnection: when a connection drops, the browser's EventSource retries automatically, with the delay controlled by EventSource's internal logic and the retry field in SSE messages. It's important, though, to differentiate between transient network issues and a completely dead backend. The onError handler shown above closes the EventSource on the first error and sets an error message, which prevents endless retries against a truly unresponsive server — but it also switches off that automatic reconnection, so a brief network blip ends the stream too. If you want to ride out transient issues, either leave the EventSource open for a few error events before giving up, or, for more robust control, implement exponential backoff yourself around the EventSource instantiation within the useEffect.

The Role of React Suspense (and its limitations)

While the title mentions React Suspense, streaming individual LLM tokens is not what Suspense was designed for. Suspense is built for asynchronous data fetching that resolves to a complete value, allowing React to "suspend" rendering until that data is ready.

However, Suspense can be beneficial when integrating LLM responses with other asynchronous parts of your UI:

  1. Loading external components based on LLM output: Imagine the LLM suggests a specific chart type. You could lazy load that chart component using React.lazy and Suspense.
  2. Fetching supporting data: If an agent's response contains references to data that needs to be fetched (e.g., "See details for user ID 123"), you could use a Suspense-enabled data fetching library (like React Query's useSuspenseQuery hook, or the suspense: true option in older versions) to fetch User 123's profile after the LLM has generated the ID.

The direct-streaming of text is often best handled with useState and progressive rendering, as shown above, rather than trying to fit it into a Suspense boundary designed for "all or nothing" data loads. The goal is immediate feedback, not waiting for "readiness."

Architectural Flow for a Streaming LLM UI

Here's a simplified sequence of events for a typical prompt submission and streaming response.

sequenceDiagram
    participant User
    participant Frontend
    participant BackendAPI
    participant LLMService

    User->>Frontend: Submits Prompt (e.g., "Summarize emails")
    Frontend->>BackendAPI: POST /api/chat/session/start (gets sessionId)
    BackendAPI->>Frontend: Response (sessionId)
    Frontend->>BackendAPI: GET /api/chat/stream/{sessionId} (initiates SSE)
    Note over Frontend: UI shows "Connecting..." or "Thinking..."
    BackendAPI->>LLMService: Initial Prompt Call
    LLMService-->>BackendAPI: Streams tokens/intermediate steps
    loop Stream Tokens & Steps
        BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'status', 'payload': 'Fetching emails...'}\n\n")
        Frontend->>Frontend: Updates UI (shows status)
        BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'tool_call', 'payload': {'tool': 'email_api', 'input': 'recent'}}\n\n")
        Frontend->>Frontend: Updates UI (shows tool call)
        BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'token', 'payload': 'Emails'}\n\n")
        Frontend->>Frontend: Appends "Emails" to response
    end
    LLMService-->>BackendAPI: Final Response (or 'complete' signal)
    BackendAPI->>Frontend: SSE Message (e.g., "data: {'type': 'complete', 'payload': {'final_response': '...'}}\n\n")
    Frontend->>Frontend: Marks stream as complete, possibly shows final state
    Frontend->>User: Shows complete response

Conclusion

Building responsive UIs on top of LLM token streams is an exciting challenge that fundamentally changes how we think about frontend architecture. By leveraging Server-Sent Events for real-time delivery, stateful React components for progressive rendering, and carefully designed message protocols from the backend, we can create interfaces that are not only blazingly fast but also transparent and resilient. The key is to embrace the asynchronous, iterative nature of LLM generation from the ground up, rather than trying to shoehorn it into traditional request/response paradigms.

This approach ensures users are always informed, can react to the agent's progress, and never feel like they're waiting in the dark. It's about making the AI feel like a true co-pilot, communicating its thought process every step of the way.

Feel free to connect with me on LinkedIn or X (formerly Twitter) to discuss these patterns or anything else related to AI-powered frontend development!

React
Streaming
LLMs
UX