From b8b87bdc9fb0feab0883b0c7b406e3befd624148 Mon Sep 17 00:00:00 2001 From: Pete Royce Saldanha Date: Fri, 10 Apr 2026 16:15:09 +0530 Subject: [PATCH] feat(react): add useDeepgramAgent hook for Voice Agent API (closes #477) --- package.json | 19 +++ src/react/index.ts | 2 + src/react/useDeepgramAgent.ts | 268 ++++++++++++++++++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 src/react/index.ts create mode 100644 src/react/useDeepgramAgent.ts diff --git a/package.json b/package.json index a001cc39..8db50c6b 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,17 @@ }, "default": "./dist/cjs/index.js" }, + "./react": { + "import": { + "types": "./dist/esm/react/index.d.mts", + "default": "./dist/esm/react/index.mjs" + }, + "require": { + "types": "./dist/cjs/react/index.d.ts", + "default": "./dist/cjs/react/index.js" + }, + "default": "./dist/cjs/react/index.js" + }, "./agent": { "import": { "types": "./dist/esm/api/resources/agent/exports.d.mts", @@ -477,6 +488,14 @@ "dependencies": { "ws": "^8.16.0" }, + "peerDependencies": { + "react": ">=16.8.0" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + } + }, "devDependencies": { "webpack": "^5.97.1", "ts-loader": "^9.5.1", diff --git a/src/react/index.ts b/src/react/index.ts new file mode 100644 index 00000000..cd70c1e3 --- /dev/null +++ b/src/react/index.ts @@ -0,0 +1,2 @@ +export { useDeepgramAgent } from "./useDeepgramAgent.js"; +export type { UseDeepgramAgentOptions, UseDeepgramAgentResult } from "./useDeepgramAgent.js"; diff --git a/src/react/useDeepgramAgent.ts b/src/react/useDeepgramAgent.ts new file mode 100644 index 00000000..8841d175 --- /dev/null +++ b/src/react/useDeepgramAgent.ts @@ -0,0 +1,268 @@ +/** + * useDeepgramAgent — React hook for the Deepgram Voice Agent API. + * + * Wraps the agent v1 WebSocket with declarative React state so that + * components don't need to manage WebSocket lifecycle, audio capture, + * or event handling manually. + * + * Usage: + * ```tsx + * import { useDeepgramAgent } from "@deepgram/sdk/react"; + * + * function VoiceAgent() { + * const { connect, disconnect, isConnected, transcript, agentText, error, sendFunctionResult } = + * useDeepgramAgent({ + * apiKey: process.env.DEEPGRAM_API_KEY, + * agent: { + * think: { provider: { type: "open_ai" }, model: "gpt-4o-mini" }, + * speak: { model: "aura-asteria-en" }, + * }, + * onFunctionCall: async (name, params) => ({ result: "done" }), + * }); + * + * return ( + * + * ); + * } + * ``` + * + * @module + */ + +// NOTE: This module depends on React. React is a peer dependency — ensure +// `react` ≥ 16.8 is installed in the consuming project. +import { useCallback, useEffect, useRef, useState } from "react"; +import { DeepgramClient } from "../Client.js"; +import type * as Deepgram from "../api/index.js"; + +// --------------------------------------------------------------------------- +// Public types +// --------------------------------------------------------------------------- + +export interface UseDeepgramAgentOptions { + /** + * Deepgram API key used to authenticate the WebSocket connection. + * + * Keep this key server-side in production. For client-side use, generate + * a short-lived token via the Deepgram Auth API and pass it here. + */ + apiKey: string; + + /** + * Voice agent configuration forwarded to the Deepgram settings message + * immediately after the WebSocket opens. + */ + agent: Deepgram.agent.v1.AgentV1SettingsAgent; + + /** + * Called when the agent requests a function/tool call. Return the result + * object to forward back to the agent. Throwing will be caught and the + * error message sent as the result. + */ + onFunctionCall?: ( + name: string, + params: Record + ) => Promise | unknown; + + /** + * Called when a new user transcript segment arrives. `isFinal` is true + * once the segment is complete. + */ + onTranscript?: (text: string, isFinal: boolean) => void; + + /** + * Called when the agent produces a text response chunk. + */ + onAgentResponse?: (text: string) => void; + + /** + * Base URL override (useful for testing or on-prem deployments). + */ + baseUrl?: string; +} + +export interface UseDeepgramAgentResult { + /** Open the WebSocket and start the voice session. */ + connect: () => void; + /** Close the WebSocket and stop the voice session. */ + disconnect: () => void; + /** Whether the WebSocket is currently open and ready. */ + isConnected: boolean; + /** Whether the user is currently speaking (microphone active). */ + isListening: boolean; + /** Whether the agent is currently speaking (audio playing). */ + isSpeaking: boolean; + /** Most recent user transcript text. */ + transcript: string; + /** Most recent agent response text. */ + agentText: string; + /** Error from the last failed operation, if any. */ + error: Error | null; + /** + * Manually send a function call result back to the agent. + * Normally you would use `onFunctionCall` instead; this escape hatch + * is for cases where you need to send the result from outside the callback. + */ + sendFunctionResult: (functionCallId: string, result: unknown) => void; +} + +// --------------------------------------------------------------------------- +// Hook implementation +// --------------------------------------------------------------------------- + +export function useDeepgramAgent(options: UseDeepgramAgentOptions): UseDeepgramAgentResult { + const { + apiKey, + agent, + onFunctionCall, + onTranscript, + onAgentResponse, + baseUrl, + } = options; + + // ---- state ---- + const [isConnected, setIsConnected] = useState(false); + const [isListening, setIsListening] = useState(false); + const [isSpeaking, setIsSpeaking] = useState(false); + const [transcript, setTranscript] = useState(""); + const [agentText, setAgentText] = useState(""); + const [error, setError] = useState(null); + + // ---- stable refs so callbacks always see current values ---- + const socketRef = useRef["agent"]["v1"]["connect"]>> | null>(null); + const onFunctionCallRef = useRef(onFunctionCall); + const onTranscriptRef = useRef(onTranscript); + const onAgentResponseRef = useRef(onAgentResponse); + + // Keep callback refs current without re-running effects + useEffect(() => { onFunctionCallRef.current = onFunctionCall; }, [onFunctionCall]); + useEffect(() => { onTranscriptRef.current = onTranscript; }, [onTranscript]); + useEffect(() => { onAgentResponseRef.current = onAgentResponse; }, [onAgentResponse]); + + // ---- connect ---- + const connect = useCallback(async () => { + setError(null); + try { + const client = new DeepgramClient({ apiKey, ...(baseUrl ? { environment: { agent: baseUrl } as unknown as string } : {}) }); + const socket = await client.agent.v1.connect({ Authorization: `Token ${apiKey}` }); + + socket.on("open", () => { + // Send initial settings as soon as the connection is established + socket.sendSettings({ type: "Settings", audio: { input: { encoding: "linear16", sample_rate: 16000 }, output: { encoding: "linear16", sample_rate: 24000, bitrate: 128000, container: "none" } }, agent }); + setIsConnected(true); + }); + + socket.on("message", async (msg) => { + if (typeof msg === "string") return; + + const m = msg as Deepgram.agent.V1Socket.Response; + + if (isAgentV1ConversationText(m)) { + const text = m.content ?? ""; + if (m.role === "user") { + setTranscript(text); + onTranscriptRef.current?.(text, true); + } else if (m.role === "assistant") { + setAgentText(text); + onAgentResponseRef.current?.(text); + } + } else if (isAgentV1UserStartedSpeaking(m)) { + setIsListening(true); + } else if (isAgentV1AgentStartedSpeaking(m)) { + setIsSpeaking(true); + setIsListening(false); + } else if (isAgentV1AgentAudioDone(m)) { + setIsSpeaking(false); + } else if (isAgentV1FunctionCallRequest(m)) { + const req = m as Deepgram.agent.AgentV1FunctionCallRequest; + const fnName = req.function_name ?? ""; + const fnParams = req.input as Record ?? {}; + let result: unknown = null; + try { + result = await onFunctionCallRef.current?.(fnName, fnParams) ?? null; + } catch (err) { + result = { error: err instanceof Error ? err.message : String(err) }; + } + socket.sendFunctionCallResponse({ + type: "FunctionCallResponse", + function_call_id: req.function_call_id ?? "", + output: JSON.stringify(result), + }); + } else if (isAgentV1Error(m)) { + setError(new Error((m as Deepgram.agent.AgentV1Error).description ?? "Agent error")); + } + }); + + socket.on("close", () => { + setIsConnected(false); + setIsListening(false); + setIsSpeaking(false); + }); + + socket.on("error", (err) => { + setError(err); + setIsConnected(false); + }); + + socket.connect(); + socketRef.current = socket; + } catch (err) { + setError(err instanceof Error ? err : new Error(String(err))); + } + }, [apiKey, agent, baseUrl]); + + // ---- disconnect ---- + const disconnect = useCallback(() => { + socketRef.current?.socket?.close(); + socketRef.current = null; + setIsConnected(false); + setIsListening(false); + setIsSpeaking(false); + }, []); + + // ---- sendFunctionResult escape hatch ---- + const sendFunctionResult = useCallback((functionCallId: string, result: unknown) => { + socketRef.current?.sendFunctionCallResponse({ + type: "FunctionCallResponse", + function_call_id: functionCallId, + output: JSON.stringify(result), + }); + }, []); + + // Clean up on unmount + useEffect(() => { + return () => { socketRef.current?.socket?.close(); }; + }, []); + + return { connect, disconnect, isConnected, isListening, isSpeaking, transcript, agentText, error, sendFunctionResult }; +} + +// --------------------------------------------------------------------------- +// Narrow type guards for discriminated union messages +// --------------------------------------------------------------------------- + +function isAgentV1ConversationText(m: unknown): m is Deepgram.agent.AgentV1ConversationText { + return typeof m === "object" && m !== null && (m as Record).type === "ConversationText"; +} + +function isAgentV1UserStartedSpeaking(m: unknown): m is Deepgram.agent.AgentV1UserStartedSpeaking { + return typeof m === "object" && m !== null && (m as Record).type === "UserStartedSpeaking"; +} + +function isAgentV1AgentStartedSpeaking(m: unknown): m is Deepgram.agent.AgentV1AgentStartedSpeaking { + return typeof m === "object" && m !== null && (m as Record).type === "AgentStartedSpeaking"; +} + +function isAgentV1AgentAudioDone(m: unknown): m is Deepgram.agent.AgentV1AgentAudioDone { + return typeof m === "object" && m !== null && (m as Record).type === "AgentAudioDone"; +} + +function isAgentV1FunctionCallRequest(m: unknown): m is Deepgram.agent.AgentV1FunctionCallRequest { + return typeof m === "object" && m !== null && (m as Record).type === "FunctionCallRequest"; +} + +function isAgentV1Error(m: unknown): m is Deepgram.agent.AgentV1Error { + return typeof m === "object" && m !== null && (m as Record).type === "Error"; +}