Package agents
Sub-modules
agents.a2a
agents.agent
agents.agent_session
agents.avatar
agents.backend - Backend communication module for VideoSDK Agents …
agents.background_audio
agents.console_mode
agents.content_generation
agents.debug
agents.denoise
agents.dtmf_handler
agents.eou
agents.event_bus
agents.event_emitter
agents.execution - Execution module for VideoSDK Agents …
agents.fallback_base
agents.images
agents.inference - VideoSDK Inference Gateway Plugins …
agents.init_config - Agent initialization configuration utilities …
agents.job
agents.knowledge_base
agents.llm
agents.mcp
agents.metrics
agents.pipeline
agents.pipeline_hooks
agents.pipeline_orchestrator
agents.pipeline_utils - Helper functions for pipeline component management.
agents.playground_manager
agents.realtime_base_model
agents.realtime_llm_adapter
agents.room
agents.speech_generation
agents.speech_understanding
agents.stt
agents.transports
agents.tts
agents.utils
agents.utterance_handle
agents.vad
agents.voice_mail_detector
agents.worker
Functions
def build_gemini_schema(function_tool: FunctionTool) ‑> google.genai.types.FunctionDeclaration-
Expand source code
def build_gemini_schema(function_tool: FunctionTool) -> types.FunctionDeclaration:
    """Build Gemini-compatible schema from a function tool"""
    info = get_tool_info(function_tool)
    gemini_params: Optional[dict[str, Any]] = None
    if info.parameters_schema is not None:
        schema = info.parameters_schema
        # Simplify only when the schema is non-empty and does not carry an
        # explicit ``"properties": None`` entry (a missing key is fine).
        if schema and schema.get("properties", True) is not None:
            gemini_params = simplify_gemini_schema(schema)
    else:
        # No explicit schema: derive one from the OpenAI-style representation.
        openai_schema = build_openai_schema(function_tool)
        params = openai_schema.get("parameters")
        if params and params.get("properties", True) is not None:
            gemini_params = simplify_gemini_schema(params)
    return types.FunctionDeclaration(
        name=info.name,
        description=info.description or "",
        parameters=gemini_params,
    )
def build_nova_sonic_schema(function_tool: FunctionTool) ‑> dict[str, typing.Any]-
Expand source code
def build_nova_sonic_schema(function_tool: FunctionTool) -> dict[str, Any]:
    """Build Amazon Nova Sonic-compatible schema from a function tool"""
    info = get_tool_info(function_tool)
    params: Optional[dict] = info.parameters_schema
    if params is None:
        # No explicit schema: derive one from the function's signature.
        params = build_pydantic_args_model(function_tool).model_json_schema()
    if params is None:
        params = {"type": "object", "properties": {}}
    return {
        "toolSpec": {
            "name": info.name,
            "description": info.description or info.name,
            # Nova Sonic expects the JSON schema serialized as a string.
            "inputSchema": {
                "json": json.dumps(params)
            }
        }
    }
def build_openai_schema(function_tool: FunctionTool) ‑> dict[str, typing.Any]-
Expand source code
def build_openai_schema(function_tool: FunctionTool) -> dict[str, Any]:
    """Build OpenAI-compatible schema from a function tool"""
    info = get_tool_info(function_tool)
    params: Optional[dict] = info.parameters_schema
    if params is None:
        # No explicit schema: derive one from the function's signature.
        params = build_pydantic_args_model(function_tool).model_json_schema()
    if params is None:
        params = {"type": "object", "properties": {}}
    return {
        "name": info.name,
        "description": info.description or "",
        "parameters": params,
        "type": "function",
    }
def encode(frame: av.VideoFrame,
options: EncodeOptions) ‑> bytes-
Expand source code
def encode(frame: av.VideoFrame, options: EncodeOptions) -> bytes:
    """Encode an av.VideoFrame into bytes by resizing and compressing it according to the given options."""
    image = frame.to_image()
    resize = options.resize_options
    if resize:
        # High-quality downscale before compression.
        image = image.resize(
            (resize.width, resize.height),
            resample=PILImage.Resampling.LANCZOS
        )
    out = io.BytesIO()
    image.save(out,
        format=options.format,
        quality=options.quality,
        optimize=True,
        subsampling=0,
        qtables="web_high"
    )
    return out.getvalue()
def function_tool(func: Optional[Callable] = None, *, name: Optional[str] = None)-
Expand source code
def function_tool(func: Optional[Callable] = None, *, name: Optional[str] = None):
    """Decorator to mark a function as a tool. Can be used with or without parentheses."""
    def create_wrapper(fn: Callable) -> FunctionTool:
        tool_info = FunctionToolInfo(
            name=name or fn.__name__,
            description=fn.__doc__
        )
        if asyncio.iscoroutinefunction(fn):
            @wraps(fn)
            async def async_wrapper(*args, **kwargs):
                return await fn(*args, **kwargs)
            wrapper = async_wrapper
        else:
            @wraps(fn)
            def sync_wrapper(*args, **kwargs):
                return fn(*args, **kwargs)
            wrapper = sync_wrapper
        # The attached marker is what is_function_tool()/get_tool_info() read.
        setattr(wrapper, "_tool_info", tool_info)
        return wrapper

    # Bare ``@function_tool`` receives the function directly;
    # ``@function_tool(...)`` must return the decorator for a later call.
    if func is None:
        return create_wrapper
    return create_wrapper(func)
def get_tool_info(tool: FunctionTool) ‑> FunctionToolInfo-
Expand source code
def get_tool_info(tool: FunctionTool) -> FunctionToolInfo:
    """Get the tool info from a function tool"""
    if not is_function_tool(tool):
        raise ValueError("Object is not a function tool")
    # Bound methods carry the marker on the underlying function object.
    target = tool.__func__ if inspect.ismethod(tool) else tool
    return getattr(target, "_tool_info")
def is_function_tool(obj: Any) ‑> bool-
Expand source code
def is_function_tool(obj: Any) -> bool:
    """Check if an object is a function tool"""
    # Unwrap bound methods so the marker set by @function_tool is visible.
    target = obj.__func__ if inspect.ismethod(obj) else obj
    return hasattr(target, "_tool_info")
async def run_stt(audio_stream: AsyncIterator[bytes]) ‑> AsyncIterator[Any]-
Expand source code
async def run_stt(audio_stream: AsyncIterator[bytes]) -> AsyncIterator[Any]:
    """
    Run STT on an audio stream.

    Delegates to the STT component's stream_transcribe method.

    Args:
        audio_stream: Async iterator of audio bytes

    Yields:
        SpeechEvent objects (with text, etc.)
    """
    # Imported lazily to avoid a circular import with the job module.
    from .job import get_current_job_context

    ctx = get_current_job_context()
    if not (ctx and ctx._pipeline and ctx._pipeline.stt):
        raise RuntimeError("No STT component available in current context")
    async for event in ctx._pipeline.stt.stream_transcribe(audio_stream):
        yield event
Delegates to the STT component's stream_transcribe method.
Args
audio_stream- Async iterator of audio bytes
Yields
SpeechEvent objects (with text, etc.)
async def run_tts(text_stream: AsyncIterator[str]) ‑> AsyncIterator[bytes]-
Expand source code
async def run_tts(text_stream: AsyncIterator[str]) -> AsyncIterator[bytes]:
    """
    Run TTS on a text stream.

    Delegates to the TTS component's stream_synthesize method.

    Args:
        text_stream: Async iterator of text

    Yields:
        Audio bytes
    """
    # Imported lazily to avoid a circular import with the job module.
    from .job import get_current_job_context

    ctx = get_current_job_context()
    if not (ctx and ctx._pipeline and ctx._pipeline.tts):
        raise RuntimeError("No TTS component available in current context")
    async for frame in ctx._pipeline.tts.stream_synthesize(text_stream):
        yield frame
Delegates to the TTS component's stream_synthesize method.
Args
text_stream- Async iterator of text
Yields
Audio bytes
async def segment_text(chunks: AsyncIterator[str],
delimiters: str = '.?!,;:\n',
keep_delimiter: bool = True,
min_chars: int = 50,
min_words: int = 12,
max_buffer: int = 600) ‑> AsyncIterator[str]-
Expand source code
async def segment_text(
    chunks: AsyncIterator[str],
    delimiters: str = ".?!,;:\n",
    keep_delimiter: bool = True,
    min_chars: int = 50,
    min_words: int = 12,
    max_buffer: int = 600,
) -> AsyncIterator[str]:
    """
    Segment an async stream of text on delimiters or soft boundaries to reduce TTS latency.
    Yields segments while keeping the delimiter if requested.
    """
    pending = ""

    def earliest_delimiter(text: str) -> int:
        # Index of the first delimiter character in text, or -1 if none occur.
        hits = [pos for ch in delimiters if (pos := text.find(ch)) != -1]
        return min(hits, default=-1)

    async for piece in chunks:
        if not piece:
            continue
        pending += piece
        while True:
            cut = earliest_delimiter(pending)
            if cut != -1:
                # Hard boundary: emit up to (and optionally including) the delimiter.
                yield pending[: cut + 1] if keep_delimiter else pending[:cut]
                pending = pending[cut + 1 :].lstrip()
                continue
            # Soft boundary: flush once the buffer gets long enough.
            if len(pending) >= max_buffer or len(pending.split()) >= min_words * 2:
                limit = max(min_chars, min(len(pending), max_buffer))
                # Prefer breaking on the last space before the limit.
                split_at = pending.rfind(" ", 0, limit)
                if split_at == -1:
                    split_at = limit
                segment = pending[:split_at].rstrip()
                if segment:
                    yield segment
                pending = pending[split_at:].lstrip()
                continue
            break
    # Flush whatever is left after the stream ends.
    if pending:
        yield pending
def setup_logging(level=20)-
Expand source code
def setup_logging(level=logging.INFO):
    """Setup logging configuration for videosdk-agents."""
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    logger = logging.getLogger("videosdk.agents")
    logger.setLevel(level)
    # Replace any previously-installed handlers so repeated calls do not
    # produce duplicate log lines.
    for old in list(logger.handlers):
        logger.removeHandler(old)
    logger.addHandler(handler)
    # Keep records out of the root logger to avoid double emission.
    logger.propagate = False
    return logger
Classes
class A2AMessage (from_agent: str,
to_agent: str,
type: str,
content: Dict[str, Any],
id: str = <factory>,
timestamp: float = <factory>,
metadata: Dict[str, Any] | None = None)-
Expand source code
@dataclass
class A2AMessage:
    """
    Message format for agent-to-agent communication.

    Attributes:
        from_agent (str): ID of the agent sending the message.
        to_agent (str): ID of the agent receiving the message.
        type (str): Type/category of the message (e.g., "specialist_query", "model_response").
        content (Dict[str, Any]): The actual message content and data.
        id (str): Unique identifier for the message. Auto-generated if not provided.
        timestamp (float): Unix timestamp when the message was created.
        metadata (Optional[Dict[str, Any]]): Additional message metadata.
    """
    from_agent: str
    to_agent: str
    type: str
    content: Dict[str, Any]
    # A fresh UUID string per message unless the caller supplies one.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Creation time captured at construction.
    timestamp: float = field(default_factory=time.time)
    metadata: Optional[Dict[str, Any]] = None
Attributes
from_agent:str- ID of the agent sending the message.
to_agent:str- ID of the agent receiving the message.
type:str- Type/category of the message (e.g., "specialist_query", "model_response").
content:Dict[str, Any]- The actual message content and data.
id:str- Unique identifier for the message. Auto-generated if not provided.
timestamp:float- Unix timestamp when the message was created.
metadata:Optional[Dict[str, Any]]- Additional message metadata.
Instance variables
var content : Dict[str, Any]var from_agent : strvar id : strvar metadata : Dict[str, Any] | Nonevar timestamp : floatvar to_agent : strvar type : str
class Agent (instructions: str,
tools: List[FunctionTool] = None,
agent_id: str = None,
mcp_servers: List[MCPServiceProvider] = None,
inherit_context: bool = False,
knowledge_base: KnowledgeBase | None = None)-
Expand source code
class Agent(EventEmitter[Literal["agent_started"]], ABC):
    """
    Abstract base class for creating custom agents.
    Inherits from EventEmitter to handle agent events and state updates.
    """
    def __init__(self, instructions: str, tools: List[FunctionTool] = None, agent_id: str = None, mcp_servers: List[MCPServiceProvider] = None, inherit_context: bool = False, knowledge_base: KnowledgeBase | None = None):
        super().__init__()
        # NOTE(review): _tools is assigned again a few lines below (normalizing
        # None to []); this first assignment is effectively dead.
        self._tools = tools
        self._llm = None
        self._stt = None
        self._tts = None
        # Conversation history; the instructions setter below appends the
        # system message into it.
        self.chat_context = ChatContext.empty()
        self.instructions = instructions
        self._tools = tools if tools else []
        self._mcp_servers = mcp_servers if mcp_servers else []
        self._mcp_initialized = False
        # Pick up @function_tool-decorated methods declared on the subclass,
        # then validate the externally supplied tools.
        self._register_class_tools()
        self.register_tools()
        self.a2a = A2AProtocol(self)
        self._agent_card = None
        self.id = agent_id or str(uuid.uuid4())
        self.mcp_manager = MCPToolManager()
        # Set by AgentSession when the agent is attached to a session.
        self.session: AgentSession | None = None
        self._thinking_background_config: Optional[BackgroundAudioHandlerConfig] = None
        self.knowledge_base = knowledge_base
        self.inherit_context = inherit_context

    def _register_class_tools(self) -> None:
        """Internal Method: Register all function tools defined in the class"""
        for name, attr in inspect.getmembers(self):
            if is_function_tool(attr):
                self._tools.append(attr)

    @property
    def instructions(self) -> str:
        """Get the instructions for the agent"""
        return self._instructions

    @instructions.setter
    def instructions(self, value: str) -> None:
        """Set the instructions for the agent"""
        self._instructions = value
        # Every assignment appends a new SYSTEM message to the chat context.
        self.chat_context.add_message(
            role=ChatRole.SYSTEM,
            content=value
        )

    @property
    def tools(self) -> ToolList[FunctionTool]:
        """Get the tools for the agent"""
        return ToolList(self._tools)

    def register_tools(self) -> None:
        """Internal Method: Register external function tools for the agent"""
        for tool in self._tools:
            if not is_function_tool(tool):
                raise ValueError(f"Tool {tool.__name__ if hasattr(tool, '__name__') else tool} is not a valid FunctionTool")

    def update_tools(self, tools: List[FunctionTool]) -> None:
        """Update the tools for the agent"""
        # Replaces the tool list, then re-adds class-level tools and validates.
        self._tools = tools
        self._register_class_tools()
        self.register_tools()

    async def hangup(self) -> None:
        """Hang up the agent"""
        # Delegates to the owning session; assumes a session is attached.
        await self.session.hangup("manual_hangup")

    def set_thinking_audio(self, file: str = None, volume: float = 0.3):
        """Set the thinking background for the agent"""
        if file is None:
            # Default to the bundled keyboard-typing sound.
            file = os.path.join(os.path.dirname(__file__), 'resources', 'agent_keyboard.wav')
        self._thinking_background_config = BackgroundAudioHandlerConfig(file_path=file, volume=volume, looping=True, enabled=True)

    async def play_background_audio(self, file: str = None, volume: float = 1.0, looping: bool = False, override_thinking: bool = True) -> None:
        """Play background audio on demand"""
        if file is None:
            # Default to the bundled classical-music track.
            file = os.path.join(os.path.dirname(__file__), 'resources', 'classical.wav')
        config = BackgroundAudioHandlerConfig(file_path=file, volume=volume, looping=looping, enabled=True, mode='mixing')
        await self.session.play_background_audio(config, override_thinking)

    async def stop_background_audio(self) -> None:
        """Stop background audio on demand"""
        await self.session.stop_background_audio()

    async def initialize_mcp(self) -> None:
        """Internal Method: Initialize the agent, including any MCP server if provided."""
        if self._mcp_servers and not self._mcp_initialized:
            for server in self._mcp_servers:
                await self.add_server(server)
            self._mcp_initialized = True

    async def add_server(self, mcp_server: MCPServiceProvider) -> None:
        """Internal Method: Initialize the MCP server and register the tools"""
        # Remember the count beforehand so only tools added by this server
        # are appended to the agent's list.
        existing_tool_count = len(self.mcp_manager.tools)
        await self.mcp_manager.add_mcp_server(mcp_server)
        new_tools = self.mcp_manager.tools[existing_tool_count:]
        self._tools.extend(new_tools)

    @abstractmethod
    async def on_enter(self) -> None:
        """Called when session starts, to be implemented in your custom agent implementation."""
        pass

    async def register_a2a(self, card: AgentCard) -> None:
        """ Register the agent for A2A communication"""
        self._agent_card = card
        await self.a2a.register(card)

    async def unregister_a2a(self) -> None:
        """Unregister the agent from A2A communication"""
        await self.a2a.unregister()
        self._agent_card = None

    async def cleanup(self) -> None:
        """Internal Method: Cleanup agent resources"""
        logger.info("Cleaning up agent resources")
        if self.mcp_manager:
            try:
                await self.mcp_manager.cleanup()
                logger.info("MCP manager cleaned up")
            except Exception as e:
                # Best-effort cleanup: log and continue tearing down state.
                logger.error(f"Error cleaning up MCP manager: {e}")
            self.mcp_manager = None
        self._tools = []
        self._mcp_servers = []
        self.chat_context = None
        self._agent_card = None
        if hasattr(self, 'session'):
            self.session = None
        logger.info("Agent cleanup completed")

    @abstractmethod
    async def on_exit(self) -> None:
        """Called when session ends, to be implemented in your custom agent implementation."""
        pass

    def capture_frames(self, num_of_frames: int = 1) -> list[av.VideoFrame]:
        """Capture the latest video frames from the pipeline (max 5)."""
        if num_of_frames > 5:
            raise ValueError("num_of_frames cannot exceed 5")
        # Session/pipeline may not exist yet; fall back to an empty result.
        pipeline = getattr(getattr(self, 'session', None), 'pipeline', None)
        if not pipeline:
            logger.warning("Pipeline not available")
            return []
        return pipeline.get_latest_frames(num_of_frames)
Ancestors
- EventEmitter
- typing.Generic
- abc.ABC
Instance variables
prop instructions : str-
Expand source code
@property
def instructions(self) -> str:
    """Get the instructions for the agent"""
    # Backed by _instructions, which the corresponding setter maintains.
    return self._instructions
prop tools : ToolList[FunctionTool]-
Expand source code
@property
def tools(self) -> ToolList[FunctionTool]:
    """Get the tools for the agent"""
    # Expose the internal tool list through the ToolList wrapper.
    return ToolList(self._tools)
Methods
async def add_server(self, mcp_server: MCPServiceProvider) ‑> None-
Expand source code
async def add_server(self, mcp_server: MCPServiceProvider) -> None:
    """Internal Method: Initialize the MCP server and register the tools"""
    # Remember the count beforehand so only tools added by this server
    # are appended to the agent's list.
    existing_tool_count = len(self.mcp_manager.tools)
    await self.mcp_manager.add_mcp_server(mcp_server)
    new_tools = self.mcp_manager.tools[existing_tool_count:]
    self._tools.extend(new_tools)
def capture_frames(self, num_of_frames: int = 1) ‑> list[av.video.frame.VideoFrame]-
Expand source code
def capture_frames(self, num_of_frames: int = 1) -> list[av.VideoFrame]:
    """Capture the latest video frames from the pipeline (max 5)."""
    if num_of_frames > 5:
        raise ValueError("num_of_frames cannot exceed 5")
    # Session/pipeline may not exist yet; fall back to an empty result.
    pipeline = getattr(getattr(self, 'session', None), 'pipeline', None)
    if not pipeline:
        logger.warning("Pipeline not available")
        return []
    return pipeline.get_latest_frames(num_of_frames)
async def cleanup(self) ‑> None-
Expand source code
async def cleanup(self) -> None:
    """Internal Method: Cleanup agent resources"""
    logger.info("Cleaning up agent resources")
    if self.mcp_manager:
        try:
            await self.mcp_manager.cleanup()
            logger.info("MCP manager cleaned up")
        except Exception as e:
            # Best-effort cleanup: log and continue tearing down state.
            logger.error(f"Error cleaning up MCP manager: {e}")
        self.mcp_manager = None
    self._tools = []
    self._mcp_servers = []
    self.chat_context = None
    self._agent_card = None
    if hasattr(self, 'session'):
        self.session = None
    logger.info("Agent cleanup completed")
async def hangup(self) ‑> None-
Expand source code
async def hangup(self) -> None:
    """Hang up the agent"""
    # Delegates to the owning session; assumes a session is attached.
    await self.session.hangup("manual_hangup")
async def initialize_mcp(self) ‑> None-
Expand source code
async def initialize_mcp(self) -> None:
    """Internal Method: Initialize the agent, including any MCP server if provided."""
    # Idempotent: servers are added only once per agent.
    if self._mcp_servers and not self._mcp_initialized:
        for server in self._mcp_servers:
            await self.add_server(server)
        self._mcp_initialized = True
async def on_enter(self) ‑> None-
Expand source code
@abstractmethod
async def on_enter(self) -> None:
    """Called when session starts, to be implemented in your custom agent implementation."""
    pass
async def on_exit(self) ‑> None-
Expand source code
@abstractmethod
async def on_exit(self) -> None:
    """Called when session ends, to be implemented in your custom agent implementation."""
    pass
async def play_background_audio(self,
file: str = None,
volume: float = 1.0,
looping: bool = False,
override_thinking: bool = True) ‑> None-
Expand source code
async def play_background_audio(self, file: str = None, volume: float = 1.0, looping: bool = False, override_thinking: bool = True) -> None:
    """Play background audio on demand"""
    if file is None:
        # Default to the bundled classical-music track.
        file = os.path.join(os.path.dirname(__file__), 'resources', 'classical.wav')
    config = BackgroundAudioHandlerConfig(file_path=file, volume=volume, looping=looping, enabled=True, mode='mixing')
    await self.session.play_background_audio(config, override_thinking)
async def register_a2a(self,
card: AgentCard) ‑> None-
Expand source code
async def register_a2a(self, card: AgentCard) -> None:
    """ Register the agent for A2A communication"""
    # Keep the card for later unregistration, then register via the protocol.
    self._agent_card = card
    await self.a2a.register(card)
def register_tools(self) ‑> None-
Expand source code
def register_tools(self) -> None:
    """Internal Method: Register external function tools for the agent"""
    # Validation only: rejects anything not marked by @function_tool.
    for tool in self._tools:
        if not is_function_tool(tool):
            raise ValueError(f"Tool {tool.__name__ if hasattr(tool, '__name__') else tool} is not a valid FunctionTool")
def set_thinking_audio(self, file: str = None, volume: float = 0.3)-
Expand source code
def set_thinking_audio(self, file: str = None, volume: float = 0.3):
    """Set the thinking background for the agent"""
    if file is None:
        # Default to the bundled keyboard-typing sound.
        file = os.path.join(os.path.dirname(__file__), 'resources', 'agent_keyboard.wav')
    self._thinking_background_config = BackgroundAudioHandlerConfig(file_path=file, volume=volume, looping=True, enabled=True)
async def stop_background_audio(self) ‑> None-
Expand source code
async def stop_background_audio(self) -> None:
    """Stop background audio on demand"""
    # Delegates to the owning session; assumes a session is attached.
    await self.session.stop_background_audio()
async def unregister_a2a(self) ‑> None-
Expand source code
async def unregister_a2a(self) -> None:
    """Unregister the agent from A2A communication"""
    await self.a2a.unregister()
    # Drop the stored card once unregistration succeeds.
    self._agent_card = None
def update_tools(self, tools: List[FunctionTool]) -> None:
    """Update the tools for the agent"""
    # Replaces the tool list, then re-adds class-level tools and validates.
    self._tools = tools
    self._register_class_tools()
    self.register_tools()
Inherited members
class AgentCard (id: str,
name: str,
domain: str,
capabilities: List[str],
description: str,
metadata: Dict[str, Any] | None = None)-
Expand source code
@dataclass
class AgentCard:
    """
    Represents an agent's capabilities and identity for agent-to-agent communication.

    Attributes:
        id (str): Unique identifier for the agent. Auto-generated if not provided.
        name (str): Human-readable name of the agent.
        domain (str): The domain or category this agent specializes in.
        capabilities (List[str]): List of capabilities this agent can perform.
        description (str): Detailed description of the agent's purpose and functionality.
        metadata (Optional[Dict[str, Any]]): Additional custom metadata for the agent.
    """
    id: str
    name: str
    domain: str
    capabilities: List[str]
    description: str
    metadata: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """
        Internal method: Automatically generates a UUID if no ID is provided.
        """
        # A falsy id (empty string / None) is replaced with a fresh UUID.
        self.id = self.id or str(uuid.uuid4())
Attributes
id:str- Unique identifier for the agent. Auto-generated if not provided.
name:str- Human-readable name of the agent.
domain:str- The domain or category this agent specializes in.
capabilities:List[str]- List of capabilities this agent can perform.
description:str- Detailed description of the agent's purpose and functionality.
metadata:Optional[Dict[str, Any]]- Additional custom metadata for the agent.
Instance variables
var capabilities : List[str]var description : strvar domain : strvar id : strvar metadata : Dict[str, Any] | Nonevar name : str
class AgentSession (agent: Agent,
pipeline: Pipeline,
wake_up: Optional[int] = None,
background_audio: Optional[BackgroundAudioHandlerConfig] = None,
dtmf_handler: Optional[DTMFHandler] = None,
voice_mail_detector: Optional[VoiceMailDetector] = None)-
Expand source code
class AgentSession(EventEmitter[Literal["user_state_changed", "agent_state_changed"]]): """ Manages an agent session with its associated conversation flow and pipeline. """ def __init__( self, agent: Agent, pipeline: Pipeline, wake_up: Optional[int] = None, background_audio: Optional[BackgroundAudioHandlerConfig] = None, dtmf_handler: Optional[DTMFHandler] = None, voice_mail_detector: Optional[VoiceMailDetector] = None, ) -> None: """ Initialize an agent session. Args: agent: Instance of an Agent class that handles the core logic pipeline: Pipeline instance to process the agent's operations wake_up: Time in seconds after which to trigger wake-up callback if no speech detected background_audio: Configuration for background audio (optional) dtmf_handler: DTMF handler for phone number input (optional) voice_mail_detector: Voicemail detector (optional) """ super().__init__() self.agent = agent self.pipeline = pipeline self.agent.session = self self.wake_up = wake_up self.on_wake_up: Optional[Callable[[], None] | Callable[[], Any]] = None self._wake_up_task: Optional[asyncio.Task] = None self._wake_up_timer_active = False self._closed: bool = False self._reply_in_progress: bool = False self._user_state: UserState = UserState.IDLE self._agent_state: AgentState = AgentState.IDLE self.current_utterance: Optional[UtteranceHandle] = None self._thinking_audio_player: Optional[BackgroundAudioHandler] = None self._background_audio_player: Optional[BackgroundAudioHandler] = None self._thinking_was_playing = False self.background_audio_config = background_audio self._is_executing_tool = False self._job_context = None self.dtmf_handler = dtmf_handler self.voice_mail_detector = voice_mail_detector self._is_voice_mail_detected = False self._playground_manager = None self._playground = False self._send_analytics_to_pubsub = False # Set agent on pipeline (pipeline handles all internal wiring) if hasattr(self.pipeline, 'set_agent'): self.pipeline.set_agent(self.agent) # Setup voicemail 
detection if self.voice_mail_detector: if hasattr(self.pipeline, "set_voice_mail_detector"): self.pipeline.set_voice_mail_detector(self.voice_mail_detector) if hasattr(self.pipeline, "on"): self.pipeline.on("voicemail_result", self._handle_voicemail_result) # Setup wake-up callback if hasattr(self.pipeline, 'set_wake_up_callback'): self.pipeline.set_wake_up_callback(self._reset_wake_up_timer) # Get job context try: job_ctx = get_current_job_context() if job_ctx: self._job_context = job_ctx job_ctx.add_shutdown_callback(self.close) self._playground = job_ctx.room_options.playground self._send_analytics_to_pubsub = job_ctx.room_options.send_analytics_to_pubsub except Exception as e: logger.error(f"AgentSession: Error in session initialization: {e}") self._job_context = None @property def is_voicemail_detected(self) -> bool: """Returns True if voicemail was detected in this session.""" return self._is_voicemail_detected def _handle_voicemail_result(self, data: dict) -> None: """ Handler for the voicemail_result event from ConversationFlow. Updates session state and executes callback if needed. """ is_vm = data.get("is_voicemail", False) self._is_voicemail_detected = is_vm if is_vm: logger.info("AgentSession: Voicemail confirmed. 
Executing callback.") if self.voice_mail_detector.callback: asyncio.create_task(self._safe_execute_vmd_callback()) async def _safe_execute_vmd_callback(self) -> None: try: if self.voice_mail_detector.callback: await self.voice_mail_detector.callback() except Exception as e: logger.error(f"Error executing voicemail callback: {e}") def _start_wake_up_timer(self) -> None: if self.wake_up is not None and self.on_wake_up is not None: self._wake_up_timer_active = True self._wake_up_task = asyncio.create_task(self._wake_up_timer_loop()) def _reset_wake_up_timer(self) -> None: if self.wake_up is not None and self.on_wake_up is not None: if self._reply_in_progress: return if self._wake_up_task and not self._wake_up_task.done(): self._wake_up_task.cancel() self._wake_up_timer_active = True self._wake_up_task = asyncio.create_task(self._wake_up_timer_loop()) def _pause_wake_up_timer(self) -> None: if self._wake_up_task and not self._wake_up_task.done(): self._wake_up_task.cancel() def _cancel_wake_up_timer(self) -> None: if self._wake_up_task and not self._wake_up_task.done(): self._wake_up_task.cancel() self._wake_up_timer_active = False async def _wake_up_timer_loop(self) -> None: try: await asyncio.sleep(self.wake_up) if self._wake_up_timer_active and self.on_wake_up and not self._reply_in_progress: if asyncio.iscoroutinefunction(self.on_wake_up): asyncio.create_task(self.on_wake_up()) else: self.on_wake_up() except asyncio.CancelledError: pass def _emit_user_state(self, state: UserState, data: dict | None = None) -> None: if state != self._user_state: self._user_state = state payload = {"state": state.value, **(data or {})} self.emit("user_state_changed", payload) def _emit_agent_state(self, state: AgentState, data: dict | None = None) -> None: if state != self._agent_state: self._agent_state = state payload = {"state": state.value, **(data or {})} self.emit("agent_state_changed", payload) global_event_emitter.emit("AGENT_STATE_CHANGED", {"state": state.value}) @property 
def user_state(self) -> UserState: return self._user_state @property def agent_state(self) -> AgentState: return self._agent_state @property def is_background_audio_enabled(self) -> bool: """Check if background audio is enabled in the pipeline""" audio_track = self._get_audio_track() return hasattr(audio_track, 'add_background_bytes') async def start( self, wait_for_participant: bool = False, run_until_shutdown: bool = False, **kwargs: Any ) -> None: """ Start the agent session. This will: 1. Initialize the agent (including MCP tools if configured) 2. Call the agent's on_enter hook 3. Start the pipeline processing 4. Start wake-up timer if configured (but only if callback is set) 5. Optionally handle full lifecycle management (connect, wait, shutdown) Args: wait_for_participant: If True, wait for a participant to join before starting run_until_shutdown: If True, manage the full lifecycle including connection, waiting for shutdown signals, and cleanup. This is a convenience that internally calls ctx.run_until_shutdown() with this session. 
**kwargs: Additional arguments to pass to the pipeline start method Examples: Simple start (manual lifecycle management): ```python await session.start() ``` Full lifecycle management (recommended): ```python await session.start(wait_for_participant=True, run_until_shutdown=True) ``` """ if run_until_shutdown: try: ctx = get_current_job_context() if ctx: logger.info("Starting session with full lifecycle management") await ctx.run_until_shutdown( session=self, wait_for_participant=wait_for_participant ) return else: logger.warning( "run_until_shutdown=True requires a JobContext, " "falling back to normal start()" ) except Exception as e: logger.warning( f"Failed to get JobContext for run_until_shutdown: {e}, " "falling back to normal start()" ) self._emit_agent_state(AgentState.STARTING) if self.agent._mcp_servers: await self.agent.initialize_mcp() if self.dtmf_handler: await self.dtmf_handler.start() if self._playground or self._send_analytics_to_pubsub: job_ctx = get_current_job_context() self.playground_manager = PlaygroundManager(job_ctx) metrics_collector.set_playground_manager(self.playground_manager) # Configure metrics with session info metrics_collector.set_system_instructions(self.agent.instructions) # Set provider info based on pipeline components if not self.pipeline.config.is_realtime: if self.pipeline.stt: p_class, p_model = self._get_provider_info(self.pipeline.stt, 'stt') metrics_collector.set_provider_info("stt", p_class, p_model) if self.pipeline.llm: p_class, p_model = self._get_provider_info(self.pipeline.llm, 'llm') metrics_collector.set_provider_info("llm", p_class, p_model) if self.pipeline.tts: p_class, p_model = self._get_provider_info(self.pipeline.tts, 'tts') metrics_collector.set_provider_info("tts", p_class, p_model) if hasattr(self.pipeline, 'vad') and self.pipeline.vad: p_class, p_model = self._get_provider_info(self.pipeline.vad, 'vad') metrics_collector.set_provider_info("vad", p_class, p_model) if hasattr(self.pipeline, 
'turn_detector') and self.pipeline.turn_detector: p_class, p_model = self._get_provider_info(self.pipeline.turn_detector, 'eou') metrics_collector.set_provider_info("eou", p_class, p_model) else: if self.pipeline._realtime_model: metrics_collector.set_provider_info("realtime", self.pipeline._realtime_model.__class__.__name__, getattr(self.pipeline._realtime_model, 'model', '')) if self.pipeline.stt: p_class, p_model = self._get_provider_info(self.pipeline.stt, 'stt') metrics_collector.set_provider_info("stt", p_class, p_model) if self.pipeline.tts: p_class, p_model = self._get_provider_info(self.pipeline.tts, 'tts') metrics_collector.set_provider_info("tts", p_class, p_model) # Traces flow manager setup traces_flow_manager = metrics_collector.traces_flow_manager if traces_flow_manager: config_attributes = { "system_instructions": self.agent.instructions, "function_tools": [ get_tool_info(tool).name for tool in ( [tool for tool in self.agent.tools if tool not in self.agent.mcp_manager.tools] if self.agent.mcp_manager else self.agent.tools ) ] if self.agent.tools else [], "mcp_tools": [ tool._tool_info.name for tool in self.agent.mcp_manager.tools ] if self.agent.mcp_manager else [], "pipeline": self.pipeline.__class__.__name__, "pipeline_mode": self.pipeline.config.pipeline_mode.value, "transport_mode": metrics_collector.transport_mode } start_time = time.perf_counter() config_attributes["start_time"] = start_time await traces_flow_manager.start_agent_session_config(config_attributes) await traces_flow_manager.start_agent_session({"start_time": start_time}) if hasattr(self.pipeline, 'set_agent'): self.pipeline.set_agent(self.agent) await self.pipeline.start() if hasattr(self.agent, 'a2a'): self.agent.a2a._attach_deferred_listeners() if self._should_delay_for_sip_user(): logger.info("SIP user detected, waiting for audio stream to be enabled before calling on_enter") audio_stream_enabled = asyncio.Event() def on_audio_stream_enabled(data): stream = data.get("stream") 
participant = data.get("participant") if stream and stream.kind == "audio" and participant and participant.meta_data.get("sipUser"): logger.info(f"SIP user audio stream enabled for participant {participant.id}") audio_stream_enabled.set() global_event_emitter.on("AUDIO_STREAM_ENABLED", on_audio_stream_enabled) async def wait_and_start(): try: await audio_stream_enabled.wait() logger.info("SIP user audio stream enabled, proceeding with on_enter") await self.agent.on_enter() global_event_emitter.emit("AGENT_STARTED", {"session": self}) if self.on_wake_up is not None: self._start_wake_up_timer() self._emit_agent_state(AgentState.IDLE) except Exception as e: logger.error(f"Error in wait_and_start: {e}") finally: global_event_emitter.off("AUDIO_STREAM_ENABLED", on_audio_stream_enabled) asyncio.create_task(wait_and_start()) return await self.agent.on_enter() global_event_emitter.emit("AGENT_STARTED", {"session": self}) if self.on_wake_up is not None: self._start_wake_up_timer() self._emit_agent_state(AgentState.IDLE) def _get_provider_info(self, component, comp_name): configs = self.pipeline.get_component_configs() if hasattr(self.pipeline, 'get_component_configs') else {} if not component: return "", "" default_model = configs.get(comp_name, {}).get('model', '') if hasattr(component, 'active_provider') and component.active_provider is not None: provider_class = component.active_provider.__class__.__name__ model = getattr(component.active_provider, 'model', getattr(component.active_provider, 'model_id', getattr(component.active_provider, 'speech_model', getattr(component.active_provider, 'voice_id', getattr(component.active_provider, 'voice', getattr(component.active_provider, 'speaker', default_model)))))) else: provider_class = component.__class__.__name__ model = getattr(component, 'model', getattr(component, 'model_id', getattr(component, 'speech_model', getattr(component, 'voice_id', getattr(component, 'voice', getattr(component, 'speaker', default_model)))))) 
return provider_class, str(model) async def say(self, message: str, interruptible: bool = True) -> UtteranceHandle: """ Send an initial message to the agent and return a handle to track it. When called from inside a function tool (_is_executing_tool), the current turn's utterance is not interrupted or replaced, so the LLM stream can continue after the tool returns. """ handle = UtteranceHandle(utterance_id=f"utt_{uuid.uuid4().hex[:8]}", interruptible=interruptible) if not self._is_executing_tool: if self.current_utterance and not self.current_utterance.done(): self.current_utterance.interrupt() self.current_utterance = handle traces_flow_manager = metrics_collector.traces_flow_manager if traces_flow_manager: traces_flow_manager.agent_say_called(message) self.agent.chat_context.add_message(role=ChatRole.ASSISTANT, content=message) if hasattr(self.pipeline, 'send_message'): await self.pipeline.send_message(message, handle=handle) return handle async def play_background_audio(self, config: BackgroundAudioHandlerConfig, override_thinking: bool) -> None: """Play background audio on demand""" if override_thinking and self._thinking_audio_player and self._thinking_audio_player.is_playing: await self.stop_thinking_audio() self._thinking_was_playing = True audio_track = self._get_audio_track() if not hasattr(audio_track, 'add_background_bytes'): logger.warning( "Cannot play background audio. This feature requires the mixing audio track. " "Enable it by setting `background_audio=True` in RoomOptions." 
) return if audio_track: self._background_audio_player = BackgroundAudioHandler(config, audio_track) await self._background_audio_player.start() # Track background audio start for metrics metrics_collector.on_background_audio_start( file_path=config.file_path, looping=config.looping ) async def stop_background_audio(self) -> None: """Stop background audio on demand""" if self._background_audio_player: await self._background_audio_player.stop() self._background_audio_player = None # Track background audio stop for metrics metrics_collector.on_background_audio_stop() if self._thinking_was_playing: await self.start_thinking_audio() self._thinking_was_playing = False def _get_audio_track(self): """Get audio track from pipeline""" if self.pipeline is None: return None if self.pipeline.config.is_realtime: model = getattr(self.pipeline, '_realtime_model', None) if model and hasattr(model, 'audio_track'): return model.audio_track if self.pipeline.tts and hasattr(self.pipeline.tts, 'audio_track'): return self.pipeline.tts.audio_track return None async def start_thinking_audio(self): """Start thinking audio""" if self._background_audio_player and self._background_audio_player.is_playing: return audio_track = self._get_audio_track() if not hasattr(audio_track, 'add_background_bytes'): logger.warning( "Cannot play 'thinking' audio. This feature requires the mixing audio track. " "Enable it by setting `background_audio=True` in RoomOptions." 
) return if self.agent._thinking_background_config and audio_track: self._thinking_audio_player = BackgroundAudioHandler(self.agent._thinking_background_config, audio_track) await self._thinking_audio_player.start() # Track thinking audio start for metrics metrics_collector.on_thinking_audio_start( file_path=self.agent._thinking_background_config.file_path, looping=self.agent._thinking_background_config.looping ) async def stop_thinking_audio(self): """Stop thinking audio""" if self._thinking_audio_player: await self._thinking_audio_player.stop() self._thinking_audio_player = None # Track thinking audio stop for metrics metrics_collector.on_thinking_audio_stop() async def reply(self, instructions: str, wait_for_playback: bool = True, frames: list[av.VideoFrame] | None = None, interruptible: bool = True) -> UtteranceHandle: """ Generate a response from agent using instructions and current chat context. This method is safe to call from function tools - it will automatically detect re-entrant calls and schedule them as background tasks. 
Args: instructions: Instructions to add to chat context wait_for_playback: If True, wait for playback to complete frames: Optional list of VideoFrame objects to include in the reply Returns: UtteranceHandle: A handle to track the utterance lifecycle """ if self._reply_in_progress: if self.current_utterance: return self.current_utterance handle = UtteranceHandle(utterance_id="placeholder", interruptible=interruptible) handle._mark_done() return handle handle = UtteranceHandle(utterance_id=f"utt_{uuid.uuid4().hex[:8]}", interruptible=interruptible) self.current_utterance = handle if self._is_executing_tool: asyncio.create_task( self._internal_blocking_reply(instructions, wait_for_playback, handle, frames) ) return handle else: await self._internal_blocking_reply(instructions, wait_for_playback, handle, frames) return handle async def _internal_blocking_reply(self, instructions: str, wait_for_playback: bool, handle: UtteranceHandle, frames: list[av.VideoFrame] | None = None) -> None: """ The original, blocking logic of the reply method. """ if not instructions: handle._mark_done() return self._reply_in_progress = True self._pause_wake_up_timer() try: # Call pipeline's reply_with_context if hasattr(self.pipeline, 'reply_with_context'): await self.pipeline.reply_with_context(instructions, wait_for_playback, handle=handle, frames=frames) if wait_for_playback: await handle finally: self._reply_in_progress = False if not handle.done(): handle._mark_done() def interrupt(self, *, force: bool = False) -> None: """ Interrupt the agent's current speech. """ if self.current_utterance and not self.current_utterance.interrupted: try: self.current_utterance.interrupt(force=force) except RuntimeError as e: logger.warning(f"Could not interrupt utterance: {e}") return if hasattr(self.pipeline, 'interrupt'): self.pipeline.interrupt() async def close(self) -> None: """ Close the agent session. 
""" logger.info("Closing agent session") if self._closed: logger.info("Agent session already closed") return self._closed = True self._emit_agent_state(AgentState.CLOSING) metrics_collector.finalize_session() traces_flow_manager = metrics_collector.traces_flow_manager if traces_flow_manager: start_time = time.perf_counter() await traces_flow_manager.start_agent_session_closed({"start_time": start_time}) traces_flow_manager.end_agent_session_closed() self._cancel_wake_up_timer() logger.info("Cleaning up agent session") try: await self.agent.on_exit() except Exception as e: logger.error(f"Error in agent.on_exit(): {e}") if self._thinking_audio_player: await self._thinking_audio_player.stop() if self._background_audio_player: await self._background_audio_player.stop() try: await self.pipeline.cleanup() except Exception as e: logger.error(f"Error cleaning up pipeline: {e}") try: await self.agent.cleanup() except Exception as e: logger.error(f"Error cleaning up agent: {e}") self.agent = None self.pipeline = None self.on_wake_up = None self._wake_up_task = None logger.info("Agent session cleaned up") async def leave(self) -> None: """ Leave the agent session. """ self._emit_agent_state(AgentState.CLOSING) await self.pipeline.leave() async def hangup(self, reason: str = "manual_hangup") -> None: """ Hang up the session, leaving the room immediately if possible. """ job_ctx = self._job_context if not job_ctx: try: job_ctx = get_current_job_context() except Exception: job_ctx = None room = getattr(job_ctx, "room", None) if job_ctx else None if room and hasattr(room, "force_end_session"): try: await room.force_end_session(reason) return except Exception as exc: logger.error(f"Error forcing room to end session: {exc}") await self.close() async def call_transfer(self,token: str, transfer_to: str) -> None: """ Transfer the call to a provided Phone number or SIP endpoint. Args: token: VideoSDK auth token. transfer_to: Phone number or SIP endpoint to transfer the call to. 
""" job_ctx = self._job_context if not job_ctx: try: job_ctx = get_current_job_context() except Exception: job_ctx = None room = getattr(job_ctx, "room", None) if job_ctx else None if room and hasattr(room, "call_transfer"): try: await room.call_transfer(token, transfer_to) return except Exception as exc: logger.error(f"Error calling call_transfer: {exc}") def _should_delay_for_sip_user(self) -> bool: """Check if there are SIP users in the room that need audio stream initialization""" job_ctx = self._job_context if not job_ctx: try: job_ctx = get_current_job_context() except Exception: job_ctx = None room = getattr(job_ctx, "room", None) if job_ctx else None if room and hasattr(room, "participants_data"): participants = room.participants_data for participant_info in participants.values(): # SIP-specific on_enter logic is currently limited to outbound calls. if participant_info.get("sipUser") and participant_info.get("sipCallType") == "outbound": return True return FalseManages an agent session with its associated conversation flow and pipeline.
Initialize an agent session.
Args
agent: Instance of an Agent class that handles the core logic.
pipeline: Pipeline instance to process the agent's operations.
wake_up: Time in seconds after which to trigger the wake-up callback if no speech is detected.
background_audio: Configuration for background audio (optional).
dtmf_handler: DTMF handler for phone number input (optional).
voice_mail_detector: Voicemail detector (optional).
Ancestors
- EventEmitter
- typing.Generic
Instance variables
prop agent_state : AgentState-
Expand source code
@property def agent_state(self) -> AgentState: return self._agent_state prop is_background_audio_enabled : bool-
Expand source code
@property
def is_background_audio_enabled(self) -> bool:
    """True when the pipeline's audio track supports background-audio mixing."""
    # Only the mixing track exposes `add_background_bytes`; a missing track
    # (None) naturally reports False via hasattr.
    track = self._get_audio_track()
    return hasattr(track, 'add_background_bytes')
prop is_voicemail_detected : bool-
Expand source code
@property
def is_voicemail_detected(self) -> bool:
    """Whether voicemail was detected at any point during this session."""
    return self._is_voicemail_detected
prop user_state : UserState-
Expand source code
@property
def user_state(self) -> UserState:
    """Current `UserState` of the session's user, as last emitted."""
    return self._user_state
Methods
async def call_transfer(self, token: str, transfer_to: str) ‑> None-
Expand source code
async def call_transfer(self, token: str, transfer_to: str) -> None:
    """
    Transfer the call to a provided Phone number or SIP endpoint.

    Args:
        token: VideoSDK auth token.
        transfer_to: Phone number or SIP endpoint to transfer the call to.
    """
    # Prefer the context captured at construction; fall back to the ambient
    # job context, tolerating its absence entirely.
    ctx = self._job_context
    if not ctx:
        try:
            ctx = get_current_job_context()
        except Exception:
            ctx = None
    room = getattr(ctx, "room", None) if ctx else None
    # Best-effort: only rooms that implement call_transfer can do this;
    # failures are logged, not raised.
    if room and hasattr(room, "call_transfer"):
        try:
            await room.call_transfer(token, transfer_to)
            return
        except Exception as exc:
            logger.error(f"Error calling call_transfer: {exc}")
Args
token: VideoSDK auth token.
transfer_to: Phone number or SIP endpoint to transfer the call to.
async def close(self) ‑> None-
Expand source code
async def close(self) -> None: """ Close the agent session. """ logger.info("Closing agent session") if self._closed: logger.info("Agent session already closed") return self._closed = True self._emit_agent_state(AgentState.CLOSING) metrics_collector.finalize_session() traces_flow_manager = metrics_collector.traces_flow_manager if traces_flow_manager: start_time = time.perf_counter() await traces_flow_manager.start_agent_session_closed({"start_time": start_time}) traces_flow_manager.end_agent_session_closed() self._cancel_wake_up_timer() logger.info("Cleaning up agent session") try: await self.agent.on_exit() except Exception as e: logger.error(f"Error in agent.on_exit(): {e}") if self._thinking_audio_player: await self._thinking_audio_player.stop() if self._background_audio_player: await self._background_audio_player.stop() try: await self.pipeline.cleanup() except Exception as e: logger.error(f"Error cleaning up pipeline: {e}") try: await self.agent.cleanup() except Exception as e: logger.error(f"Error cleaning up agent: {e}") self.agent = None self.pipeline = None self.on_wake_up = None self._wake_up_task = None logger.info("Agent session cleaned up")Close the agent session.
async def hangup(self, reason: str = 'manual_hangup') ‑> None-
Expand source code
async def hangup(self, reason: str = "manual_hangup") -> None:
    """
    Hang up the session, leaving the room immediately if possible.
    """
    ctx = self._job_context
    if not ctx:
        try:
            ctx = get_current_job_context()
        except Exception:
            ctx = None
    room = getattr(ctx, "room", None) if ctx else None
    # A transport room that can hard-terminate takes precedence; a failure
    # there (or no such room) falls through to a graceful close().
    if room and hasattr(room, "force_end_session"):
        try:
            await room.force_end_session(reason)
            return
        except Exception as exc:
            logger.error(f"Error forcing room to end session: {exc}")
    await self.close()
def interrupt(self, *, force: bool = False) ‑> None-
Expand source code
def interrupt(self, *, force: bool = False) -> None:
    """
    Interrupt the agent's current speech.
    """
    utterance = self.current_utterance
    if utterance and not utterance.interrupted:
        try:
            utterance.interrupt(force=force)
        except RuntimeError as e:
            logger.warning(f"Could not interrupt utterance: {e}")
        return
    # No live utterance to cancel; fall back to the pipeline if it supports
    # direct interruption.
    if hasattr(self.pipeline, 'interrupt'):
        self.pipeline.interrupt()
async def leave(self) ‑> None-
Expand source code
async def leave(self) -> None:
    """
    Leave the agent session.
    """
    # Announce teardown to state listeners before detaching the pipeline.
    self._emit_agent_state(AgentState.CLOSING)
    await self.pipeline.leave()
async def play_background_audio(self, config: BackgroundAudioHandlerConfig, override_thinking: bool) ‑> None-
Expand source code
async def play_background_audio(self, config: BackgroundAudioHandlerConfig, override_thinking: bool) -> None: """Play background audio on demand""" if override_thinking and self._thinking_audio_player and self._thinking_audio_player.is_playing: await self.stop_thinking_audio() self._thinking_was_playing = True audio_track = self._get_audio_track() if not hasattr(audio_track, 'add_background_bytes'): logger.warning( "Cannot play background audio. This feature requires the mixing audio track. " "Enable it by setting `background_audio=True` in RoomOptions." ) return if audio_track: self._background_audio_player = BackgroundAudioHandler(config, audio_track) await self._background_audio_player.start() # Track background audio start for metrics metrics_collector.on_background_audio_start( file_path=config.file_path, looping=config.looping )Play background audio on demand
async def reply(self,
instructions: str,
wait_for_playback: bool = True,
frames: list[av.VideoFrame] | None = None,
interruptible: bool = True) ‑> UtteranceHandle-
Expand source code
async def reply(self, instructions: str, wait_for_playback: bool = True, frames: list[av.VideoFrame] | None = None, interruptible: bool = True) -> UtteranceHandle: """ Generate a response from agent using instructions and current chat context. This method is safe to call from function tools - it will automatically detect re-entrant calls and schedule them as background tasks. Args: instructions: Instructions to add to chat context wait_for_playback: If True, wait for playback to complete frames: Optional list of VideoFrame objects to include in the reply Returns: UtteranceHandle: A handle to track the utterance lifecycle """ if self._reply_in_progress: if self.current_utterance: return self.current_utterance handle = UtteranceHandle(utterance_id="placeholder", interruptible=interruptible) handle._mark_done() return handle handle = UtteranceHandle(utterance_id=f"utt_{uuid.uuid4().hex[:8]}", interruptible=interruptible) self.current_utterance = handle if self._is_executing_tool: asyncio.create_task( self._internal_blocking_reply(instructions, wait_for_playback, handle, frames) ) return handle else: await self._internal_blocking_reply(instructions, wait_for_playback, handle, frames) return handleGenerate a response from agent using instructions and current chat context.
This method is safe to call from function tools - it will automatically detect re-entrant calls and schedule them as background tasks.
Args
instructions: Instructions to add to the chat context.
wait_for_playback: If True, wait for playback to complete.
frames: Optional list of VideoFrame objects to include in the reply.
Returns
UtteranceHandle- A handle to track the utterance lifecycle
async def say(self, message: str, interruptible: bool = True) ‑> UtteranceHandle-
Expand source code
async def say(self, message: str, interruptible: bool = True) -> UtteranceHandle: """ Send an initial message to the agent and return a handle to track it. When called from inside a function tool (_is_executing_tool), the current turn's utterance is not interrupted or replaced, so the LLM stream can continue after the tool returns. """ handle = UtteranceHandle(utterance_id=f"utt_{uuid.uuid4().hex[:8]}", interruptible=interruptible) if not self._is_executing_tool: if self.current_utterance and not self.current_utterance.done(): self.current_utterance.interrupt() self.current_utterance = handle traces_flow_manager = metrics_collector.traces_flow_manager if traces_flow_manager: traces_flow_manager.agent_say_called(message) self.agent.chat_context.add_message(role=ChatRole.ASSISTANT, content=message) if hasattr(self.pipeline, 'send_message'): await self.pipeline.send_message(message, handle=handle) return handleSend an initial message to the agent and return a handle to track it. When called from inside a function tool (_is_executing_tool), the current turn's utterance is not interrupted or replaced, so the LLM stream can continue after the tool returns.
async def start(self,
wait_for_participant: bool = False,
run_until_shutdown: bool = False,
**kwargs: Any) ‑> None-
Expand source code
async def start( self, wait_for_participant: bool = False, run_until_shutdown: bool = False, **kwargs: Any ) -> None: """ Start the agent session. This will: 1. Initialize the agent (including MCP tools if configured) 2. Call the agent's on_enter hook 3. Start the pipeline processing 4. Start wake-up timer if configured (but only if callback is set) 5. Optionally handle full lifecycle management (connect, wait, shutdown) Args: wait_for_participant: If True, wait for a participant to join before starting run_until_shutdown: If True, manage the full lifecycle including connection, waiting for shutdown signals, and cleanup. This is a convenience that internally calls ctx.run_until_shutdown() with this session. **kwargs: Additional arguments to pass to the pipeline start method Examples: Simple start (manual lifecycle management): ```python await session.start() ``` Full lifecycle management (recommended): ```python await session.start(wait_for_participant=True, run_until_shutdown=True) ``` """ if run_until_shutdown: try: ctx = get_current_job_context() if ctx: logger.info("Starting session with full lifecycle management") await ctx.run_until_shutdown( session=self, wait_for_participant=wait_for_participant ) return else: logger.warning( "run_until_shutdown=True requires a JobContext, " "falling back to normal start()" ) except Exception as e: logger.warning( f"Failed to get JobContext for run_until_shutdown: {e}, " "falling back to normal start()" ) self._emit_agent_state(AgentState.STARTING) if self.agent._mcp_servers: await self.agent.initialize_mcp() if self.dtmf_handler: await self.dtmf_handler.start() if self._playground or self._send_analytics_to_pubsub: job_ctx = get_current_job_context() self.playground_manager = PlaygroundManager(job_ctx) metrics_collector.set_playground_manager(self.playground_manager) # Configure metrics with session info metrics_collector.set_system_instructions(self.agent.instructions) # Set provider info based on pipeline components if 
not self.pipeline.config.is_realtime: if self.pipeline.stt: p_class, p_model = self._get_provider_info(self.pipeline.stt, 'stt') metrics_collector.set_provider_info("stt", p_class, p_model) if self.pipeline.llm: p_class, p_model = self._get_provider_info(self.pipeline.llm, 'llm') metrics_collector.set_provider_info("llm", p_class, p_model) if self.pipeline.tts: p_class, p_model = self._get_provider_info(self.pipeline.tts, 'tts') metrics_collector.set_provider_info("tts", p_class, p_model) if hasattr(self.pipeline, 'vad') and self.pipeline.vad: p_class, p_model = self._get_provider_info(self.pipeline.vad, 'vad') metrics_collector.set_provider_info("vad", p_class, p_model) if hasattr(self.pipeline, 'turn_detector') and self.pipeline.turn_detector: p_class, p_model = self._get_provider_info(self.pipeline.turn_detector, 'eou') metrics_collector.set_provider_info("eou", p_class, p_model) else: if self.pipeline._realtime_model: metrics_collector.set_provider_info("realtime", self.pipeline._realtime_model.__class__.__name__, getattr(self.pipeline._realtime_model, 'model', '')) if self.pipeline.stt: p_class, p_model = self._get_provider_info(self.pipeline.stt, 'stt') metrics_collector.set_provider_info("stt", p_class, p_model) if self.pipeline.tts: p_class, p_model = self._get_provider_info(self.pipeline.tts, 'tts') metrics_collector.set_provider_info("tts", p_class, p_model) # Traces flow manager setup traces_flow_manager = metrics_collector.traces_flow_manager if traces_flow_manager: config_attributes = { "system_instructions": self.agent.instructions, "function_tools": [ get_tool_info(tool).name for tool in ( [tool for tool in self.agent.tools if tool not in self.agent.mcp_manager.tools] if self.agent.mcp_manager else self.agent.tools ) ] if self.agent.tools else [], "mcp_tools": [ tool._tool_info.name for tool in self.agent.mcp_manager.tools ] if self.agent.mcp_manager else [], "pipeline": self.pipeline.__class__.__name__, "pipeline_mode": 
self.pipeline.config.pipeline_mode.value, "transport_mode": metrics_collector.transport_mode } start_time = time.perf_counter() config_attributes["start_time"] = start_time await traces_flow_manager.start_agent_session_config(config_attributes) await traces_flow_manager.start_agent_session({"start_time": start_time}) if hasattr(self.pipeline, 'set_agent'): self.pipeline.set_agent(self.agent) await self.pipeline.start() if hasattr(self.agent, 'a2a'): self.agent.a2a._attach_deferred_listeners() if self._should_delay_for_sip_user(): logger.info("SIP user detected, waiting for audio stream to be enabled before calling on_enter") audio_stream_enabled = asyncio.Event() def on_audio_stream_enabled(data): stream = data.get("stream") participant = data.get("participant") if stream and stream.kind == "audio" and participant and participant.meta_data.get("sipUser"): logger.info(f"SIP user audio stream enabled for participant {participant.id}") audio_stream_enabled.set() global_event_emitter.on("AUDIO_STREAM_ENABLED", on_audio_stream_enabled) async def wait_and_start(): try: await audio_stream_enabled.wait() logger.info("SIP user audio stream enabled, proceeding with on_enter") await self.agent.on_enter() global_event_emitter.emit("AGENT_STARTED", {"session": self}) if self.on_wake_up is not None: self._start_wake_up_timer() self._emit_agent_state(AgentState.IDLE) except Exception as e: logger.error(f"Error in wait_and_start: {e}") finally: global_event_emitter.off("AUDIO_STREAM_ENABLED", on_audio_stream_enabled) asyncio.create_task(wait_and_start()) return await self.agent.on_enter() global_event_emitter.emit("AGENT_STARTED", {"session": self}) if self.on_wake_up is not None: self._start_wake_up_timer() self._emit_agent_state(AgentState.IDLE)Start the agent session. This will: 1. Initialize the agent (including MCP tools if configured) 2. Call the agent's on_enter hook 3. Start the pipeline processing 4. Start wake-up timer if configured (but only if callback is set) 5. 
Optionally handle full lifecycle management (connect, wait, shutdown)
Args
wait_for_participant- If True, wait for a participant to join before starting
run_until_shutdown- If True, manage the full lifecycle including connection, waiting for shutdown signals, and cleanup. This is a convenience that internally calls ctx.run_until_shutdown() with this session.
**kwargs- Additional arguments to pass to the pipeline start method
Examples
Simple start (manual lifecycle management):
await session.start()Full lifecycle management (recommended):
await session.start(wait_for_participant=True, run_until_shutdown=True) async def start_thinking_audio(self)-
Expand source code
async def start_thinking_audio(self):
    """Start the 'thinking' filler audio, unless background audio is already playing."""
    # Explicit background audio always wins over the thinking filler.
    background = self._background_audio_player
    if background and background.is_playing:
        return
    track = self._get_audio_track()
    if not hasattr(track, 'add_background_bytes'):
        logger.warning(
            "Cannot play 'thinking' audio. This feature requires the mixing audio track. "
            "Enable it by setting `background_audio=True` in RoomOptions."
        )
        return
    if self.agent._thinking_background_config and track:
        self._thinking_audio_player = BackgroundAudioHandler(self.agent._thinking_background_config, track)
        await self._thinking_audio_player.start()
        # Track thinking audio start for metrics
        metrics_collector.on_thinking_audio_start(
            file_path=self.agent._thinking_background_config.file_path,
            looping=self.agent._thinking_background_config.looping
        )
async def stop_background_audio(self) ‑> None-
Expand source code
async def stop_background_audio(self) -> None:
    """Stop background audio on demand.

    If thinking audio had been playing before the background audio took
    over, it is restarted once the background audio stops.
    """
    player = self._background_audio_player
    if player:
        await player.stop()
        self._background_audio_player = None
        # Record the stop event for metrics collection.
        metrics_collector.on_background_audio_stop()

    if self._thinking_was_playing:
        # Background audio had displaced thinking audio; bring it back.
        await self.start_thinking_audio()
        self._thinking_was_playing = False
async def stop_thinking_audio(self)-
Expand source code
async def stop_thinking_audio(self):
    """Stop the thinking audio player, if one is active."""
    player = self._thinking_audio_player
    if not player:
        return
    await player.stop()
    self._thinking_audio_player = None
    # Record the stop event for metrics collection.
    metrics_collector.on_thinking_audio_stop()
Inherited members
class AgentState (*args, **kwds)-
Expand source code
@enum.unique
class AgentState(enum.Enum):
    """Lifecycle states an agent moves through during a conversation session."""

    STARTING = "starting"    # session is booting up
    IDLE = "idle"            # waiting for something to do
    SPEAKING = "speaking"    # agent is producing speech output
    LISTENING = "listening"  # agent is receiving user input
    THINKING = "thinking"    # agent is generating a response
    CLOSING = "closing"      # session is shutting down
Ancestors
- enum.Enum
Class variables
var CLOSING · var IDLE · var LISTENING · var SPEAKING · var STARTING · var THINKING
class ChatContext (items: Optional[List[ChatItem]] = None)-
Expand source code
class ChatContext:
    """
    Manages a conversation context for LLM interactions.
    """

    def __init__(self, items: Optional[List[ChatItem]] = None):
        """
        Initialize the chat context.

        Args:
            items (Optional[List[ChatItem]]): Initial list of chat items. If None, starts with empty context.
        """
        self._items: List[ChatItem] = items or []

    @classmethod
    def empty(cls) -> ChatContext:
        """
        Create an empty chat context.

        Returns:
            ChatContext: A new empty chat context instance.
        """
        return cls([])

    @property
    def items(self) -> List[ChatItem]:
        """
        Get all items in the context.

        Returns:
            List[ChatItem]: List of all conversation items (messages, function calls, outputs).
        """
        return self._items

    def add_message(
        self,
        role: ChatRole,
        content: Union[str, List[ChatContent]],
        message_id: Optional[str] = None,
        created_at: Optional[float] = None,
        replace: bool = False,
    ) -> ChatMessage:
        """
        Add a new message to the context.

        Args:
            role (ChatRole): The role of the message sender.
            content (Union[str, List[ChatContent]]): The message content as text or content items.
            message_id (Optional[str], optional): Custom message ID. Auto-generated if not provided.
            created_at (Optional[float], optional): Custom creation timestamp. Uses current time if not provided.
            replace (bool, optional): If True and role is SYSTEM, replaces the existing system message. Defaults to False.

        Returns:
            ChatMessage: The newly created and added message.
        """
        if replace and role == ChatRole.SYSTEM:
            # Drop any existing system messages so only one is ever present.
            self._items = [
                item for item in self._items
                if not (isinstance(item, ChatMessage) and item.role == ChatRole.SYSTEM)
            ]
        # Normalize a bare string into the list-of-content form.
        if isinstance(content, str):
            content = [content]

        message = ChatMessage(
            role=role,
            content=content,
            # NOTE(review): second-resolution IDs can collide for messages
            # added within the same second — confirm whether uniqueness matters.
            id=message_id or f"msg_{int(time.time())}",
            created_at=created_at or time.time(),
        )
        self._items.append(message)
        return message

    def add_function_call(
        self, name: str, arguments: str, call_id: Optional[str] = None
    ) -> FunctionCall:
        """
        Add a function call to the context.

        Args:
            name (str): Name of the function to be called.
            arguments (str): JSON string containing the function arguments.
            call_id (Optional[str], optional): Custom call ID. Auto-generated if not provided.

        Returns:
            FunctionCall: The newly created and added function call.
        """
        call = FunctionCall(
            name=name,
            arguments=arguments,
            call_id=call_id or f"call_{int(time.time())}"
        )
        self._items.append(call)
        return call

    def add_function_output(
        self, name: str, output: str, call_id: str, is_error: bool = False
    ) -> FunctionCallOutput:
        """
        Add a function output to the context.

        Args:
            name (str): Name of the function that was executed.
            output (str): The result or output from the function execution.
            call_id (str): ID linking this output to the original function call.
            is_error (bool, optional): Flag indicating if the function execution failed. Defaults to False.

        Returns:
            FunctionCallOutput: The newly created and added function output.
        """
        function_output = FunctionCallOutput(
            name=name,
            output=output,
            call_id=call_id,
            is_error=is_error
        )
        self._items.append(function_output)
        return function_output

    def get_by_id(self, item_id: str) -> Optional[ChatItem]:
        """
        Find an item by its ID.

        Args:
            item_id (str): The ID of the item to find.

        Returns:
            Optional[ChatItem]: The found item or None if not found.
        """
        return next(
            (item for item in self._items if item.id == item_id),
            None
        )

    def copy(
        self,
        *,
        exclude_function_calls: bool = False,
        exclude_system_messages: bool = False,
        tools: Optional[List[FunctionTool]] = None,
    ) -> ChatContext:
        """
        Create a filtered copy of the chat context.

        Args:
            exclude_function_calls (bool, optional): Whether to exclude function calls and outputs. Defaults to False.
            exclude_system_messages (bool, optional): Whether to exclude system messages. Defaults to False.
            tools (Optional[List[FunctionTool]], optional): List of available tools to filter function calls by. Defaults to None.

        Returns:
            ChatContext: A new ChatContext instance with the filtered items.
        """
        items = []
        valid_tool_names = {get_tool_info(tool).name for tool in (
            tools or []) if is_function_tool(tool)}

        for item in self._items:
            # Skip function calls if excluded
            if exclude_function_calls and isinstance(item, (FunctionCall, FunctionCallOutput)):
                continue
            # Skip system messages if excluded
            if exclude_system_messages and isinstance(item, ChatMessage) and item.role == ChatRole.SYSTEM:
                continue
            # Filter by valid tools if tools are provided
            if tools and isinstance(item, (FunctionCall, FunctionCallOutput)):
                if item.name not in valid_tool_names:
                    continue
            items.append(item)

        return ChatContext(items)

    def truncate(self, max_items: int) -> ChatContext:
        """
        Truncate the context to the last N items while preserving system message.

        Args:
            max_items (int): Maximum number of items to keep in the context.

        Returns:
            ChatContext: The current context instance after truncation.
        """
        system_msg = next(
            (item for item in self._items
             if isinstance(item, ChatMessage) and item.role == ChatRole.SYSTEM),
            None
        )

        new_items = self._items[-max_items:]
        # Never let the window start with an orphaned function call/output.
        while new_items and isinstance(new_items[0], (FunctionCall, FunctionCallOutput)):
            new_items.pop(0)

        if system_msg and system_msg not in new_items:
            new_items.insert(0, system_msg)

        self._items = new_items
        return self

    def to_dict(self) -> dict:
        """
        Convert the context to a dictionary representation.

        Returns:
            dict: Dictionary representation of the chat context.
        """
        # Each record carries "type"/"id" plus type-specific keys via
        # conditional dict unpacking.
        return {
            "items": [
                {
                    "type": item.type,
                    "id": item.id,
                    **({"role": item.role.value, "content": item.content}
                       if isinstance(item, ChatMessage) else {}),
                    **({"name": item.name, "arguments": item.arguments, "call_id": item.call_id, "metadata": item.metadata}
                       if isinstance(item, FunctionCall) else {}),
                    **({"name": item.name, "output": item.output, "call_id": item.call_id, "is_error": item.is_error}
                       if isinstance(item, FunctionCallOutput) else {})
                }
                for item in self._items
            ]
        }

    @classmethod
    def from_dict(cls, data: dict) -> ChatContext:
        """
        Create a ChatContext from a dictionary representation.

        Args:
            data (dict): Dictionary containing the serialized chat context data.

        Returns:
            ChatContext: A new ChatContext instance reconstructed from the data.
        """
        items = []
        for item_data in data["items"]:
            if item_data["type"] == "message":
                # NOTE(review): created_at is not serialized by to_dict, so
                # round-tripped messages get a fresh timestamp.
                items.append(ChatMessage(
                    role=ChatRole(item_data["role"]),
                    content=item_data["content"],
                    id=item_data["id"]
                ))
            elif item_data["type"] == "function_call":
                items.append(FunctionCall(
                    name=item_data["name"],
                    arguments=item_data["arguments"],
                    call_id=item_data["call_id"],
                    id=item_data["id"],
                    metadata=item_data.get("metadata")
                ))
            elif item_data["type"] == "function_call_output":
                items.append(FunctionCallOutput(
                    name=item_data["name"],
                    output=item_data["output"],
                    call_id=item_data["call_id"],
                    is_error=item_data.get("is_error", False),
                    id=item_data["id"]
                ))
        return cls(items)

    def cleanup(self) -> None:
        """
        Clear all chat context items and references to free memory.
        """
        logger.info(f"Cleaning up ChatContext with {len(self._items)} items")
        for item in self._items:
            if isinstance(item, ChatMessage):
                if isinstance(item.content, list):
                    # Null out image payloads individually; they dominate memory use.
                    for content_item in item.content:
                        if isinstance(content_item, ImageContent):
                            content_item.image = None
                item.content = None
            elif isinstance(item, FunctionCall):
                item.arguments = None
            elif isinstance(item, FunctionCallOutput):
                item.output = None
        self._items.clear()
        try:
            import gc
            gc.collect()
            logger.info("ChatContext garbage collection completed")
        except Exception as e:
            logger.error(f"Error during ChatContext garbage collection: {e}")
        logger.info("ChatContext cleanup completed")
Initialize the chat context.
Args
items:Optional[List[ChatItem]]- Initial list of chat items. If None, starts with empty context.
Static methods
def empty() ‑> ChatContextdef from_dict(data: dict) ‑> ChatContext-
Create a ChatContext from a dictionary representation.
Args
data:dict- Dictionary containing the serialized chat context data.
Returns
ChatContext- A new ChatContext instance reconstructed from the data.
Instance variables
prop items : List[ChatItem]-
Expand source code
@property
def items(self) -> List[ChatItem]:
    """All conversation items currently held by this context.

    Returns:
        List[ChatItem]: Every message, function call, and function
        output, in insertion order.
    """
    return self._items
Returns
List[ChatItem]- List of all conversation items (messages, function calls, outputs).
Methods
def add_function_call(self, name: str, arguments: str, call_id: Optional[str] = None) ‑> FunctionCall-
Expand source code
def add_function_call(
    self, name: str, arguments: str, call_id: Optional[str] = None
) -> FunctionCall:
    """Append a function-call record to the conversation.

    Args:
        name (str): Name of the function to be called.
        arguments (str): JSON string containing the function arguments.
        call_id (Optional[str], optional): Custom call ID. Auto-generated if not provided.

    Returns:
        FunctionCall: The newly created and added function call.
    """
    # Fall back to a timestamp-derived ID when the caller supplies none.
    resolved_id = call_id if call_id else f"call_{int(time.time())}"
    call = FunctionCall(name=name, arguments=arguments, call_id=resolved_id)
    self._items.append(call)
    return call
Args
name:str- Name of the function to be called.
arguments:str- JSON string containing the function arguments.
call_id:Optional[str], optional- Custom call ID. Auto-generated if not provided.
Returns
FunctionCall- The newly created and added function call.
def add_function_output(self, name: str, output: str, call_id: str, is_error: bool = False) ‑> FunctionCallOutput-
Expand source code
def add_function_output(
    self, name: str, output: str, call_id: str, is_error: bool = False
) -> FunctionCallOutput:
    """Append the result of a tool execution to the conversation.

    Args:
        name (str): Name of the function that was executed.
        output (str): The result or output from the function execution.
        call_id (str): ID linking this output to the original function call.
        is_error (bool, optional): Flag indicating if the function execution failed. Defaults to False.

    Returns:
        FunctionCallOutput: The newly created and added function output.
    """
    record = FunctionCallOutput(
        name=name,
        output=output,
        call_id=call_id,
        is_error=is_error,
    )
    self._items.append(record)
    return record
Args
name:str- Name of the function that was executed.
output:str- The result or output from the function execution.
call_id:str- ID linking this output to the original function call.
is_error:bool, optional- Flag indicating if the function execution failed. Defaults to False.
Returns
FunctionCallOutput- The newly created and added function output.
def add_message(self,
role: ChatRole,
content: Union[str, List[ChatContent]],
message_id: Optional[str] = None,
created_at: Optional[float] = None,
replace: bool = False) ‑> ChatMessage-
Expand source code
def add_message(
    self,
    role: ChatRole,
    content: Union[str, List[ChatContent]],
    message_id: Optional[str] = None,
    created_at: Optional[float] = None,
    replace: bool = False,
) -> ChatMessage:
    """Create a ChatMessage and append it to the context.

    Args:
        role (ChatRole): The role of the message sender.
        content (Union[str, List[ChatContent]]): The message content as text or content items.
        message_id (Optional[str], optional): Custom message ID. Auto-generated if not provided.
        created_at (Optional[float], optional): Custom creation timestamp. Uses current time if not provided.
        replace (bool, optional): If True and role is SYSTEM, replaces the existing system message. Defaults to False.

    Returns:
        ChatMessage: The newly created and added message.
    """
    if replace and role == ChatRole.SYSTEM:
        # Keep at most one system message: purge existing ones first.
        retained = []
        for existing in self._items:
            is_system = isinstance(existing, ChatMessage) and existing.role == ChatRole.SYSTEM
            if not is_system:
                retained.append(existing)
        self._items = retained

    # Normalize a bare string into the list-of-content form.
    normalized = [content] if isinstance(content, str) else content

    message = ChatMessage(
        role=role,
        content=normalized,
        id=message_id or f"msg_{int(time.time())}",
        created_at=created_at or time.time(),
    )
    self._items.append(message)
    return message
Args
role:ChatRole- The role of the message sender.
content:Union[str, List[ChatContent]]- The message content as text or content items.
message_id:Optional[str], optional- Custom message ID. Auto-generated if not provided.
created_at:Optional[float], optional- Custom creation timestamp. Uses current time if not provided.
replace:bool, optional- If True and role is SYSTEM, replaces the existing system message. Defaults to False.
Returns
ChatMessage- The newly created and added message.
def cleanup(self) ‑> None-
Expand source code
def cleanup(self) -> None:
    """Release all stored conversation items and their payloads.

    Nulls out the heavy payload references (images, message content,
    function arguments/outputs) before clearing the list, then runs a
    garbage-collection pass to reclaim memory promptly.
    """
    logger.info(f"Cleaning up ChatContext with {len(self._items)} items")
    for entry in self._items:
        if isinstance(entry, ChatMessage):
            if isinstance(entry.content, list):
                # Drop image payloads individually; they dominate memory use.
                for part in entry.content:
                    if isinstance(part, ImageContent):
                        part.image = None
            entry.content = None
        elif isinstance(entry, FunctionCall):
            entry.arguments = None
        elif isinstance(entry, FunctionCallOutput):
            entry.output = None
    self._items.clear()
    try:
        import gc
        gc.collect()
        logger.info("ChatContext garbage collection completed")
    except Exception as e:
        logger.error(f"Error during ChatContext garbage collection: {e}")
    logger.info("ChatContext cleanup completed")
def copy(self,
*,
exclude_function_calls: bool = False,
exclude_system_messages: bool = False,
tools: Optional[List[FunctionTool]] = None) ‑> ChatContext-
Expand source code
def copy(
    self,
    *,
    exclude_function_calls: bool = False,
    exclude_system_messages: bool = False,
    tools: Optional[List[FunctionTool]] = None,
) -> ChatContext:
    """Return a new ChatContext containing a filtered view of this one.

    Args:
        exclude_function_calls (bool, optional): Whether to exclude function calls and outputs. Defaults to False.
        exclude_system_messages (bool, optional): Whether to exclude system messages. Defaults to False.
        tools (Optional[List[FunctionTool]], optional): List of available tools to filter function calls by. Defaults to None.

    Returns:
        ChatContext: A new ChatContext instance with the filtered items.
    """
    allowed_names = {
        get_tool_info(tool).name
        for tool in (tools or [])
        if is_function_tool(tool)
    }

    def _keep(entry) -> bool:
        # One predicate per filter keeps the rules easy to audit.
        if exclude_function_calls and isinstance(entry, (FunctionCall, FunctionCallOutput)):
            return False
        if exclude_system_messages and isinstance(entry, ChatMessage) and entry.role == ChatRole.SYSTEM:
            return False
        if tools and isinstance(entry, (FunctionCall, FunctionCallOutput)) and entry.name not in allowed_names:
            return False
        return True

    return ChatContext([entry for entry in self._items if _keep(entry)])
Args
exclude_function_calls:bool, optional- Whether to exclude function calls and outputs. Defaults to False.
exclude_system_messages:bool, optional- Whether to exclude system messages. Defaults to False.
tools:Optional[List[FunctionTool]], optional- List of available tools to filter function calls by. Defaults to None.
Returns
ChatContext- A new ChatContext instance with the filtered items.
def get_by_id(self, item_id: str) ‑> ChatMessage | FunctionCall | FunctionCallOutput | None-
Expand source code
def get_by_id(self, item_id: str) -> Optional[ChatItem]:
    """Look up a conversation item by its unique ID.

    Args:
        item_id (str): The ID of the item to find.

    Returns:
        Optional[ChatItem]: The matching item, or None when no item
        carries that ID.
    """
    for candidate in self._items:
        if candidate.id == item_id:
            return candidate
    return None
Args
item_id:str- The ID of the item to find.
Returns
Optional[ChatItem]- The found item or None if not found.
def to_dict(self) ‑> dict-
Expand source code
def to_dict(self) -> dict:
    """Serialize the context into a plain dictionary.

    Returns:
        dict: Dictionary representation of the chat context.
    """
    serialized = []
    for entry in self._items:
        # Every record carries the common keys; type-specific keys follow.
        record = {"type": entry.type, "id": entry.id}
        if isinstance(entry, ChatMessage):
            record["role"] = entry.role.value
            record["content"] = entry.content
        elif isinstance(entry, FunctionCall):
            record["name"] = entry.name
            record["arguments"] = entry.arguments
            record["call_id"] = entry.call_id
            record["metadata"] = entry.metadata
        elif isinstance(entry, FunctionCallOutput):
            record["name"] = entry.name
            record["output"] = entry.output
            record["call_id"] = entry.call_id
            record["is_error"] = entry.is_error
        serialized.append(record)
    return {"items": serialized}
Returns
dict- Dictionary representation of the chat context.
def truncate(self, max_items: int) ‑> ChatContext-
Expand source code
def truncate(self, max_items: int) -> ChatContext:
    """Trim the context to its most recent items, keeping the system message.

    Args:
        max_items (int): Maximum number of items to keep in the context.

    Returns:
        ChatContext: The current context instance after truncation.
    """
    preserved_system = None
    for entry in self._items:
        if isinstance(entry, ChatMessage) and entry.role == ChatRole.SYSTEM:
            preserved_system = entry
            break

    window = self._items[-max_items:]
    # Never let the window start with an orphaned function call/output.
    while window and isinstance(window[0], (FunctionCall, FunctionCallOutput)):
        window.pop(0)

    if preserved_system and preserved_system not in window:
        window.insert(0, preserved_system)

    self._items = window
    return self
Args
max_items:int- Maximum number of items to keep in the context.
Returns
ChatContext- The current context instance after truncation.
class ChatMessage (**data: Any)-
Expand source code
class ChatMessage(BaseModel):
    """
    Represents a single message in the chat context.

    Attributes:
        role (ChatRole): The role of the message sender (system, user, or assistant).
        content (Union[str, List[ChatContent]]): The message content as text or list of content items.
        id (str): Unique identifier for the message. Auto-generated if not provided.
        type (Literal["message"]): Type identifier, always "message".
        created_at (float): Unix timestamp when the message was created.
        interrupted (bool): Flag indicating if the message was interrupted during generation.
    """
    role: ChatRole
    content: Union[str, List[ChatContent]]
    # NOTE(review): second-resolution default IDs can collide for messages
    # created within the same second — confirm whether uniqueness is required.
    id: str = Field(default_factory=lambda: f"msg_{int(time.time())}")
    type: Literal["message"] = "message"
    created_at: float = Field(default_factory=time.time)
    interrupted: bool = False
Attributes
role:ChatRole- The role of the message sender (system, user, or assistant).
content:Union[str, List[ChatContent]]- The message content as text or list of content items.
id:str- Unique identifier for the message. Auto-generated if not provided.
- type (Literal["message"]): Type identifier, always "message".
created_at:float- Unix timestamp when the message was created.
interrupted:bool- Flag indicating if the message was interrupted during generation.
Create a new model by parsing and validating input data from keyword arguments.
Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var content : str | List[str | ImageContent] · var created_at : float · var id : str · var interrupted : bool · var model_config · var role : ChatRole · var type : Literal['message']
class ChatRole (*args, **kwds)-
Expand source code
class ChatRole(str, Enum):
    """String-valued enumeration of the standard chat conversation roles.

    Members:
    - SYSTEM: Instructions and context for the AI assistant
    - USER: Messages from the human user
    - ASSISTANT: Responses from the AI assistant
    """

    SYSTEM = "system"        # instructions/context for the assistant
    USER = "user"            # messages from the human user
    ASSISTANT = "assistant"  # responses produced by the assistant
Defines the three standard roles used in chat conversations: - SYSTEM: Instructions and context for the AI assistant - USER: Messages from the human user - ASSISTANT: Responses from the AI assistant
Ancestors
- builtins.str
- enum.Enum
Class variables
var ASSISTANT · var SYSTEM · var USER
class ConversationalGraphResponse (**data: Any)-
Expand source code
class ConversationalGraphResponse(BaseModel):
    """ Data model to hold Conversational Graph response data."""
    response_to_user: str = Field(..., description="Response to the user by agent")
    extracted_values: List[ExtractedField] = Field(default_factory=list, description="List of extracted values from the user input")
    move_forward: bool = Field(False, description="If we want to Move forward to the next state")
    reasoning: str = Field("", description="Reasoning for the response")
    # Optional: None until the model commits to a branch.
    chosen_branch: Optional[str] = Field(None, description="Chosen branch for the move forward")
    is_off_topic: bool = Field(False, description="Is the user input off topic")
    # Optional: only set when the model decides to backtrack.
    backtrack_to_state: Optional[str] = Field(None, description="Backtrack to the state")
    # Optional: populated with the graph's current state id when known.
    current_state_id: Optional[str] = Field(None, description="exact state_id of current state")
Create a new model by parsing and validating input data from keyword arguments.
Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var backtrack_to_state : str · var chosen_branch : str · var current_state_id : str · var extracted_values : List[ExtractedField] · var is_off_topic : bool · var model_config · var move_forward : bool · var reasoning : str · var response_to_user : str
class CustomAudioStreamTrack (loop)-
Expand source code
class CustomAudioStreamTrack(CustomAudioTrack):
    """
    Base audio track implementation using a frame buffer.
    Audio frames are created as soon as audio data is received.

    Supports optional pause/resume for false-interrupt detection while
    maintaining compatibility with avatar plugins that need simple audio flow.
    """

    def __init__(self, loop):
        super().__init__()
        self.loop = loop
        self._start = None
        self._timestamp = 0
        self.frame_buffer = []
        self.audio_data_buffer = bytearray()
        self.frame_time = 0
        self.sample_rate = 24000
        self.channels = 1
        self.sample_width = 2
        self.time_base_fraction = Fraction(1, self.sample_rate)
        self.samples = int(AUDIO_PTIME * self.sample_rate)
        # Bytes per frame: samples * channels * bytes-per-sample (s16 -> 2).
        self.chunk_size = int(self.samples * self.channels * self.sample_width)
        self._synthesis_complete = False
        self._needs_last_audio_callback = False
        self._last_speaking_time = 0.0
        self._speaking_grace_period = 0.5  # 500ms grace period for jitter
        # Pause/resume support - simple flag-based (no blocking)
        self._is_paused = False
        self._paused_frames = []  # Separate buffer for paused content
        self._accepting_audio = True
        self._manual_audio_control = False

    @property
    def can_pause(self) -> bool:
        """Returns True if this track supports pause/resume operations"""
        return True

    def interrupt(self):
        """Clear all buffers and reset state"""
        logger.info("Audio track interrupted, clearing buffers.")
        self.frame_buffer.clear()
        self.audio_data_buffer.clear()
        self._paused_frames.clear()
        self._is_paused = False
        self._last_speaking_time = 0.0
        self._synthesis_complete = False
        self._needs_last_audio_callback = False
        # Handle manual audio control mode
        if self._manual_audio_control:
            self._accepting_audio = False
        else:
            self._accepting_audio = True

    async def pause(self) -> None:
        """
        Pause audio playback.

        Instead of blocking recv(), we move remaining frames to a separate buffer
        so they can be resumed later. This approach keeps the audio flow simple
        for avatars.
        """
        if self._is_paused:
            logger.warning("Audio track already paused")
            return
        logger.info("Audio track paused - preserving current buffer state.")
        self._is_paused = True
        # Move current frames to paused buffer for later resume
        self._paused_frames = self.frame_buffer.copy()
        self.frame_buffer.clear()

    async def resume(self) -> None:
        """
        Resume audio playback from paused position.
        Restores frames that were saved when paused.
        """
        if not self._is_paused:
            logger.warning("Audio track not paused, nothing to resume")
            return
        logger.info("Audio track resumed - restoring paused buffer.")
        self._is_paused = False
        # Restore frames from paused buffer
        self.frame_buffer = self._paused_frames.copy()
        self._paused_frames.clear()

    def enable_audio_input(self, manual_control: bool = False):
        """
        Allow fresh audio data to be buffered.

        When manual_control is True, future interrupts will pause intake until
        this method is called again. This is useful for preventing old audio
        from bleeding into new responses.
        """
        self._manual_audio_control = manual_control
        self._accepting_audio = True
        logger.debug(f"Audio input enabled (manual_control={manual_control})")

    def on_last_audio_byte(self, callback: Callable[[], Awaitable[None]]) -> None:
        """Set callback for when the final audio byte of synthesis is produced"""
        logger.info("on last audio callback")
        self._last_audio_callback = callback
        # Reset state so a previous synthesis cycle can't fire the new callback.
        self._synthesis_complete = False
        self._needs_last_audio_callback = False

    @property
    def is_speaking(self) -> bool:
        """
        True if the track is currently playing audio or has buffered data,
        including a small grace period to bridge gaps in streaming TTS.
        """
        has_data = len(self.frame_buffer) > 0 or len(self.audio_data_buffer) > 0
        if has_data:
            return True
        return (time() - self._last_speaking_time) < self._speaking_grace_period

    def mark_synthesis_complete(self) -> None:
        """
        Mark that TTS synthesis has finished sending all audio data.

        If the buffer is already empty (all audio consumed), fires the
        on_last_audio_byte callback immediately. Otherwise, the callback will
        fire when recv() drains the remaining buffer.
        """
        self._synthesis_complete = True
        # If we're not currently speaking (grace period passed) and buffer is empty, fire immediately
        if not self.is_speaking and self._needs_last_audio_callback:
            self._needs_last_audio_callback = False
            self._synthesis_complete = False
            logger.info("[AudioTrack] Synthesis complete and buffer already empty — triggering last_audio_callback.")
            if hasattr(self, "_last_audio_callback") and self._last_audio_callback:
                asyncio.create_task(self._last_audio_callback())

    async def add_new_bytes(self, audio_data: bytes):
        """
        Add new audio bytes to the buffer.
        Respects _accepting_audio flag for manual audio control mode.
        """
        if not self._accepting_audio:
            logger.debug("Audio input currently disabled, dropping audio data")
            return
        self.audio_data_buffer += audio_data
        while len(self.audio_data_buffer) >= self.chunk_size:
            chunk = self.audio_data_buffer[: self.chunk_size]
            self.audio_data_buffer = self.audio_data_buffer[self.chunk_size :]
            try:
                audio_frame = self.buildAudioFrames(chunk)
                # If paused, add to paused buffer instead
                if self._is_paused:
                    self._paused_frames.append(audio_frame)
                    logger.debug("Added frame to paused buffer")
                else:
                    self.frame_buffer.append(audio_frame)
                    logger.debug(
                        f"Added audio frame to buffer, total frames: {len(self.frame_buffer)}"
                    )
            except Exception as e:
                logger.error(f"Error building audio frame: {e}")
                break

    def buildAudioFrames(self, chunk: bytes) -> AudioFrame:
        # Build a single av.AudioFrame from one raw PCM chunk. Size mismatches
        # are logged but not rejected.
        if len(chunk) != self.chunk_size:
            logger.warning(
                f"Incorrect chunk size received {len(chunk)}, expected {self.chunk_size}"
            )
        data = np.frombuffer(chunk, dtype=np.int16)
        expected_samples = self.samples * self.channels
        if len(data) != expected_samples:
            logger.warning(
                f"Incorrect number of samples in chunk {len(data)}, expected {expected_samples}"
            )
        data = data.reshape(-1, self.channels)
        layout = "mono" if self.channels == 1 else "stereo"
        audio_frame = AudioFrame.from_ndarray(data.T, format="s16", layout=layout)
        return audio_frame

    def next_timestamp(self):
        # Monotonic PTS counter advanced by one frame's worth of samples.
        pts = int(self.frame_time)
        time_base = self.time_base_fraction
        self.frame_time += self.samples
        return pts, time_base

    async def recv(self) -> AudioFrame:
        """
        Receive next audio frame.
        When paused, produces silence frames but keeps timing synchronized.
        This ensures smooth resume without audio glitches.
        """
        try:
            if self.readyState != "live":
                raise MediaStreamError
            if self._start is None:
                self._start = time()
                self._timestamp = 0
            else:
                self._timestamp += self.samples
            # Sleep until this frame's wall-clock slot to pace real-time output.
            wait = self._start + (self._timestamp / self.sample_rate) - time()
            if wait > 0:
                await asyncio.sleep(wait)
            pts, time_base = self.next_timestamp()
            # When paused, always produce silence but keep timing
            # This allows smooth resume without timing jumps
            if self._is_paused:
                frame = AudioFrame(format="s16", layout="mono", samples=self.samples)
                for p in frame.planes:
                    p.update(bytes(p.buffer_size))
            elif len(self.frame_buffer) > 0:
                frame = self.frame_buffer.pop(0)
                self._is_speaking = True
                self._last_speaking_time = time()
            else:
                # No audio data available — silence
                if getattr(self, "_is_speaking", False):
                    # Only declare we've stopped speaking if the grace period has passed
                    # This bridges gaps in streaming TTS (like Sarvam jitter)
                    if (time() - self._last_speaking_time) >= self._speaking_grace_period:
                        self._is_speaking = False
                        if self._synthesis_complete:
                            # TTS finished and buffer drained — fire callback
                            self._synthesis_complete = False
                            self._needs_last_audio_callback = False
                            logger.info("[AudioTrack] Agent finished speaking — triggering last_audio_callback.")
                            if hasattr(self, "_last_audio_callback") and self._last_audio_callback:
                                asyncio.create_task(self._last_audio_callback())
                        else:
                            # Buffer temporarily empty but synthesis still in progress
                            logger.debug("[AudioTrack] Buffer empty — waiting for more TTS audio.")
                            self._needs_last_audio_callback = True
                # Produce silence frame
                frame = AudioFrame(format="s16", layout="mono", samples=self.samples)
                for p in frame.planes:
                    p.update(bytes(p.buffer_size))
            frame.pts = pts
            frame.time_base = time_base
            frame.sample_rate = self.sample_rate
            return frame
        except MediaStreamError:
            raise
        except Exception as e:
            traceback.print_exc()
            logger.error(f"Error while creating tts->rtc frame: {e}")

    async def cleanup(self):
        # Flush all buffers and stop the underlying media track.
        self.interrupt()
        self.stop()
Supports optional pause/resume for false-interrupt detection while maintaining compatibility with avatar plugins that need simple audio flow.
Ancestors
- videosdk.custom_audio_track.CustomAudioTrack
- vsaiortc.mediastreams.MediaStreamTrack
- pyee.asyncio.AsyncIOEventEmitter
- pyee.base.EventEmitter
Subclasses
Instance variables
prop can_pause : bool-
Expand source code
@property def can_pause(self) -> bool: """Returns True if this track supports pause/resume operations""" return TrueReturns True if this track supports pause/resume operations
prop is_speaking : bool-
Expand source code
@property
def is_speaking(self) -> bool:
    """
    Whether audio is currently playing or pending.

    Buffered frames or raw bytes count as speaking; otherwise a short
    grace window after the last delivered frame bridges gaps in
    streaming TTS output.
    """
    if self.frame_buffer or self.audio_data_buffer:
        return True
    idle_for = time() - self._last_speaking_time
    return idle_for < self._speaking_grace_period
Methods
async def add_new_bytes(self, audio_data: bytes)-
Expand source code
async def add_new_bytes(self, audio_data: bytes):
    """
    Append raw PCM bytes and slice them into fixed-size frames.

    Honors the ``_accepting_audio`` gate used by manual audio control.
    While paused, completed frames are parked in the paused buffer
    instead of the live one so they can be replayed on resume.
    """
    if not self._accepting_audio:
        logger.debug("Audio input currently disabled, dropping audio data")
        return

    self.audio_data_buffer += audio_data

    # Slice off as many complete chunks as we have bytes for.
    while len(self.audio_data_buffer) >= self.chunk_size:
        next_chunk = self.audio_data_buffer[: self.chunk_size]
        self.audio_data_buffer = self.audio_data_buffer[self.chunk_size :]
        try:
            frame = self.buildAudioFrames(next_chunk)
        except Exception as e:
            logger.error(f"Error building audio frame: {e}")
            break
        if self._is_paused:
            self._paused_frames.append(frame)
            logger.debug("Added frame to paused buffer")
        else:
            self.frame_buffer.append(frame)
            logger.debug(
                f"Added audio frame to buffer, total frames: {len(self.frame_buffer)}"
            )
def buildAudioFrames(self, chunk: bytes) ‑> av.audio.frame.AudioFrame-
Expand source code
def buildAudioFrames(self, chunk: bytes) -> AudioFrame:
    """Convert one fixed-size interleaved PCM16 chunk into an av.AudioFrame."""
    if len(chunk) != self.chunk_size:
        logger.warning(
            f"Incorrect chunk size received {len(chunk)}, expected {self.chunk_size}"
        )
    pcm = np.frombuffer(chunk, dtype=np.int16)
    expected = self.samples * self.channels
    if len(pcm) != expected:
        logger.warning(
            f"Incorrect number of samples in chunk {len(pcm)}, expected {expected}"
        )
    # Interleaved samples -> (n, channels); av expects channel-major, hence .T
    pcm = pcm.reshape(-1, self.channels)
    layout = "mono" if self.channels == 1 else "stereo"
    return AudioFrame.from_ndarray(pcm.T, format="s16", layout=layout)
Expand source code
async def cleanup(self):
    """Interrupt playback (flushing all buffers) and stop the track."""
    self.interrupt()
    self.stop()
Expand source code
def enable_audio_input(self, manual_control: bool = False):
    """
    Allow fresh audio data to be buffered again.

    When ``manual_control`` is True, future interrupts will pause intake
    until this method is called once more — useful for preventing stale
    audio from bleeding into a new response.
    """
    self._accepting_audio = True
    self._manual_audio_control = manual_control
    logger.debug(f"Audio input enabled (manual_control={manual_control})")
This is useful for preventing old audio from bleeding into new responses.
def interrupt(self)-
Expand source code
def interrupt(self):
    """Drop all buffered audio and reset playback/synthesis state."""
    logger.info("Audio track interrupted, clearing buffers.")
    self.frame_buffer.clear()
    self.audio_data_buffer.clear()
    self._paused_frames.clear()
    self._is_paused = False
    self._last_speaking_time = 0.0
    self._synthesis_complete = False
    self._needs_last_audio_callback = False
    # Manual audio control keeps intake closed until explicitly re-enabled.
    self._accepting_audio = not self._manual_audio_control
def mark_synthesis_complete(self) ‑> None-
Expand source code
def mark_synthesis_complete(self) -> None:
    """
    Mark that TTS synthesis has sent its final audio bytes.

    If everything has already been played out (grace period passed and a
    callback is pending), fires the on_last_audio_byte callback now;
    otherwise recv() fires it once the remaining buffer drains.
    """
    self._synthesis_complete = True
    if self.is_speaking or not self._needs_last_audio_callback:
        return
    self._needs_last_audio_callback = False
    self._synthesis_complete = False
    logger.info("[AudioTrack] Synthesis complete and buffer already empty — triggering last_audio_callback.")
    if getattr(self, "_last_audio_callback", None):
        asyncio.create_task(self._last_audio_callback())
def next_timestamp(self)-
Expand source code
def next_timestamp(self):
    """Return (pts, time_base) for the next frame and advance the frame clock."""
    current_pts = int(self.frame_time)
    self.frame_time += self.samples
    return current_pts, self.time_base_fraction
Expand source code
def on_last_audio_byte(self, callback: Callable[[], Awaitable[None]]) -> None:
    """Register the coroutine to run when the final synthesized byte plays out."""
    logger.info("on last audio callback")
    self._synthesis_complete = False
    self._needs_last_audio_callback = False
    self._last_audio_callback = callback
async def pause(self) ‑> None-
Expand source code
async def pause(self) -> None:
    """
    Pause playback by parking the remaining frames in a side buffer.

    recv() keeps running (emitting silence), which keeps timing simple
    for avatar plugins and allows a glitch-free resume.
    """
    if self._is_paused:
        logger.warning("Audio track already paused")
        return
    logger.info("Audio track paused - preserving current buffer state.")
    self._is_paused = True
    self._paused_frames = list(self.frame_buffer)
    self.frame_buffer.clear()
async def recv(self) ‑> av.audio.frame.AudioFrame-
Expand source code
async def recv(self) -> AudioFrame:
    """
    Receive next audio frame.
    When paused, produces silence frames but keeps timing synchronized.
    This ensures smooth resume without audio glitches.
    """
    try:
        if self.readyState != "live":
            raise MediaStreamError
        # Pace delivery against wall-clock so frames go out in real time.
        if self._start is None:
            self._start = time()
            self._timestamp = 0
        else:
            self._timestamp += self.samples
        wait = self._start + (self._timestamp / self.sample_rate) - time()
        if wait > 0:
            await asyncio.sleep(wait)
        pts, time_base = self.next_timestamp()
        # When paused, always produce silence but keep timing
        # This allows smooth resume without timing jumps
        if self._is_paused:
            frame = AudioFrame(format="s16", layout="mono", samples=self.samples)
            for p in frame.planes:
                p.update(bytes(p.buffer_size))
        elif len(self.frame_buffer) > 0:
            # Real audio available: play oldest frame and note speaking activity.
            frame = self.frame_buffer.pop(0)
            self._is_speaking = True
            self._last_speaking_time = time()
        else:
            # No audio data available — silence
            if getattr(self, "_is_speaking", False):
                # Only declare we've stopped speaking if the grace period has passed
                # This bridges gaps in streaming TTS (like Sarvam jitter)
                if (time() - self._last_speaking_time) >= self._speaking_grace_period:
                    self._is_speaking = False
                    if self._synthesis_complete:
                        # TTS finished and buffer drained — fire callback
                        self._synthesis_complete = False
                        self._needs_last_audio_callback = False
                        logger.info("[AudioTrack] Agent finished speaking — triggering last_audio_callback.")
                        if hasattr(self, "_last_audio_callback") and self._last_audio_callback:
                            asyncio.create_task(self._last_audio_callback())
                    else:
                        # Buffer temporarily empty but synthesis still in progress
                        logger.debug("[AudioTrack] Buffer empty — waiting for more TTS audio.")
                        self._needs_last_audio_callback = True
            # Produce silence frame
            frame = AudioFrame(format="s16", layout="mono", samples=self.samples)
            for p in frame.planes:
                p.update(bytes(p.buffer_size))
        frame.pts = pts
        frame.time_base = time_base
        frame.sample_rate = self.sample_rate
        return frame
    except MediaStreamError:
        raise
    except Exception as e:
        # NOTE(review): on unexpected errors this logs and implicitly returns
        # None instead of a frame — confirm callers tolerate a None frame.
        traceback.print_exc()
        logger.error(f"Error while creating tts->rtc frame: {e}")
async def resume(self) ‑> None-
Expand source code
async def resume(self) -> None:
    """Resume playback by restoring the frames parked by pause()."""
    if not self._is_paused:
        logger.warning("Audio track not paused, nothing to resume")
        return
    logger.info("Audio track resumed - restoring paused buffer.")
    self._is_paused = False
    self.frame_buffer = list(self._paused_frames)
    self._paused_frames.clear()
class DTMFHandler (callback: Callable | None = None)-
Expand source code
class DTMFHandler:
    """
    Listens for DTMF events on PubSub and forwards each digit to a
    user-supplied callback (sync or async).
    """

    def __init__(self, callback: Optional[Callable] = None):
        self.ctx = get_current_job_context()
        self._callback = callback
        self._subscribed = False

    async def start(self):
        """Subscribe to the DTMF_EVENT topic (idempotent); AgentSession calls this."""
        if self._subscribed:
            return
        cfg = PubSubSubscribeConfig(
            topic="DTMF_EVENT",
            cb=lambda msg: asyncio.create_task(self._on_pubsub_event(msg)),
        )
        await self.ctx.room.subscribe_to_pubsub(cfg)
        self._subscribed = True

    def set_callback(self, callback: Callable):
        """Attach or replace the digit callback."""
        self._callback = callback

    async def _on_pubsub_event(self, message):
        """Extract the pressed digit from a PubSub message and dispatch it."""
        try:
            payload = message.get("payload", {})
            digit = payload.get("number")
            logger.info(f"[DTMFHandler] Received: {digit}")
            cb = self._callback
            if not digit or not cb:
                return
            if asyncio.iscoroutinefunction(cb):
                await cb(digit)
            else:
                cb(digit)
        except Exception as e:
            logger.error(f"[DTMFHandler] Error processing message: {e}")
Methods
def set_callback(self, callback: Callable)-
Expand source code
def set_callback(self, callback: Callable):
    """Attach or replace the DTMF digit callback."""
    self._callback = callback
async def start(self)-
Expand source code
async def start(self):
    """Subscribe to the DTMF_EVENT topic (idempotent); AgentSession calls this."""
    if self._subscribed:
        return
    cfg = PubSubSubscribeConfig(
        topic="DTMF_EVENT",
        cb=lambda msg: asyncio.create_task(self._on_pubsub_event(msg)),
    )
    await self.ctx.room.subscribe_to_pubsub(cfg)
    self._subscribed = True
class DedicatedInferenceResource (resource_id: str, config: Dict[str, Any])-
Expand source code
class DedicatedInferenceResource(BaseResource):
    """
    Dedicated inference resource that runs AI models in a separate process.
    This mimics the old IPC system's single shared inference process that
    handles all STT, LLM, TTS, and VAD tasks for all agent jobs.

    Communication with the child process is a simple typed-dict protocol over
    a multiprocessing Pipe: {"type": "ready"|"error"|"inference"|"result"|
    "ping"|"ping_response"|"shutdown"|"shutdown_ack", ...}.
    """

    def __init__(self, resource_id: str, config: Dict[str, Any]):
        super().__init__(resource_id, config)
        self.process: Optional[Process] = None
        self.parent_conn: Optional[Connection] = None
        self.child_conn: Optional[Connection] = None
        self._process_ready = False
        self._models_cache: Dict[str, Any] = {}
        # Inference-specific configuration
        self.initialize_timeout = config.get("inference_process_timeout", 30.0)
        self.memory_warn_mb = config.get("inference_memory_warn_mb", 1000.0)
        self.ping_interval = config.get("ping_interval", 30.0)

    @property
    def resource_type(self) -> ResourceType:
        # Always process-backed: this resource's whole point is isolation.
        return ResourceType.PROCESS

    async def _initialize_impl(self) -> None:
        """Initialize the dedicated inference process."""
        logger.info(f"Initializing dedicated inference process: {self.resource_id}")
        # Create pipe for communication
        self.parent_conn, self.child_conn = Pipe()
        # Start the inference process
        self.process = Process(
            target=self._run_inference_process,
            args=(self.resource_id, self.child_conn, self.config),
            daemon=True,
        )
        self.process.start()
        # Wait for process to be ready
        start_time = time.time()
        while (
            not self._process_ready
            and (time.time() - start_time) < self.initialize_timeout
        ):
            try:
                if self.parent_conn.poll():
                    message = self.parent_conn.recv()
                    if message.get("type") == "ready":
                        self._process_ready = True
                        break
                    elif message.get("type") == "error":
                        # NOTE(review): this raise is caught by the enclosing
                        # `except` below and only logged, so an explicit child
                        # error still surfaces as the TimeoutError — confirm
                        # whether fail-fast was intended.
                        raise Exception(
                            f"Inference process error: {message.get('error')}"
                        )
                await asyncio.sleep(0.1)
            except Exception as e:
                logger.warning(f"Error checking inference process readiness: {e}")
        if not self._process_ready:
            raise TimeoutError(
                f"Inference process {self.resource_id} failed to initialize within {self.initialize_timeout}s"
            )
        logger.info(
            f"Dedicated inference process initialized: {self.resource_id} (PID: {self.process.pid})"
        )

    async def _execute_task_impl(
        self, task_id: str, config, entrypoint: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Execute inference task in the dedicated process.

        Sends one "inference" message and busy-polls the pipe (0.1s steps)
        for a matching "result" until config.timeout elapses.
        """
        if not self._process_ready:
            raise RuntimeError(f"Inference process {self.resource_id} is not ready")
        # Prepare inference data
        inference_data = {
            "task_id": task_id,
            "task_type": config.task_type.value,
            "model_config": config.data.get("model_config", {}),
            "input_data": config.data.get("input_data", {}),
            "timeout": config.timeout,
        }
        # Send inference request to process
        self.parent_conn.send({"type": "inference", "data": inference_data})
        # Wait for result
        start_time = time.time()
        while (time.time() - start_time) < config.timeout:
            try:
                if self.parent_conn.poll():
                    message = self.parent_conn.recv()
                    if (
                        message.get("type") == "result"
                        and message.get("task_id") == task_id
                    ):
                        if message.get("status") == "success":
                            return message.get("result")
                        else:
                            # NOTE(review): this RuntimeError (and the one
                            # below) is swallowed by the in-loop `except`
                            # and merely logged, so callers only ever see
                            # the TimeoutError — verify intent.
                            raise RuntimeError(
                                message.get("error", "Unknown inference error")
                            )
                    elif message.get("type") == "error":
                        raise RuntimeError(
                            message.get("error", "Inference process error")
                        )
                await asyncio.sleep(0.1)
            except Exception as e:
                logger.warning(f"Error checking inference result: {e}")
        raise TimeoutError(
            f"Inference task {task_id} timed out after {config.timeout}s"
        )

    async def _shutdown_impl(self) -> None:
        """Shutdown the dedicated inference process.

        Graceful shutdown message first, then terminate(), then kill().
        """
        if self.process and self.process.is_alive():
            # Send shutdown signal
            self.parent_conn.send({"type": "shutdown"})
            # Wait for graceful shutdown
            timeout = self.config.get("close_timeout", 60.0)
            start_time = time.time()
            while self.process.is_alive() and (time.time() - start_time) < timeout:
                await asyncio.sleep(0.1)
            # Force terminate if still alive
            if self.process.is_alive():
                logger.warning(
                    f"Force terminating inference process {self.resource_id}"
                )
                self.process.terminate()
                self.process.join(timeout=5.0)
                if self.process.is_alive():
                    self.process.kill()

    async def health_check(self) -> bool:
        """Perform a health check on the dedicated inference process.

        Sends a ping and waits up to 5s for "ping_response"; updates
        last_heartbeat on success.
        """
        try:
            if self._shutdown or not self.process or not self.process.is_alive():
                return False
            # Send ping to inference process
            self.parent_conn.send({"type": "ping"})
            # Wait for ping response
            start_time = time.time()
            timeout = 5.0  # 5 second timeout for health check
            while (time.time() - start_time) < timeout:
                try:
                    if self.parent_conn.poll():
                        message = self.parent_conn.recv()
                        if message.get("type") == "ping_response":
                            # Update last heartbeat
                            self.last_heartbeat = time.time()
                            return True
                        elif message.get("type") == "error":
                            logger.error(
                                f"Inference process error: {message.get('error')}"
                            )
                            return False
                    await asyncio.sleep(0.1)
                except Exception as e:
                    logger.warning(f"Error checking inference process health: {e}")
            # Timeout - process is unresponsive
            logger.warning(f"Inference process {self.resource_id} health check timeout")
            return False
        except Exception as e:
            logger.error(
                f"Health check failed for inference process {self.resource_id}: {e}"
            )
            return False

    @staticmethod
    def _run_inference_process(
        resource_id: str, conn: Connection, config: Dict[str, Any]
    ):
        """Run the inference process in a separate process.

        Child-side entry point: installs signal handlers, signals "ready",
        then services inference/ping/shutdown messages in an asyncio loop.
        `_handle_inference` / `_handle_ping` are module-level helpers defined
        elsewhere in this package.
        """
        try:
            # Set up logging
            logging.basicConfig(level=logging.INFO)
            logger.info(
                f"Inference process started: {resource_id} (PID: {os.getpid()})"
            )
            # Set up signal handlers
            def signal_handler(signum, frame):
                logger.info("Received shutdown signal")
                conn.send({"type": "shutdown_ack"})
                sys.exit(0)
            signal.signal(signal.SIGTERM, signal_handler)
            signal.signal(signal.SIGINT, signal_handler)
            # Send ready signal
            conn.send({"type": "ready"})
            # Model cache for reuse
            models_cache: Dict[str, Any] = {}
            async def main_loop():
                while True:
                    try:
                        if conn.poll(timeout=1.0):
                            message = conn.recv()
                            message_type = message.get("type")
                            if message_type == "inference":
                                await _handle_inference(
                                    conn, message.get("data", {}), models_cache
                                )
                            elif message_type == "ping":
                                await _handle_ping(conn)
                            elif message_type == "shutdown":
                                logger.info("Received shutdown request")
                                conn.send({"type": "shutdown_ack"})
                                break
                            else:
                                logger.warning(f"Unknown message type: {message_type}")
                    except Exception as e:
                        logger.error(f"Error in inference process main loop: {e}")
                        conn.send({"type": "error", "error": str(e)})
            asyncio.run(main_loop())
        except Exception as e:
            logger.error(f"Fatal error in inference process: {e}")
            conn.send({"type": "error", "error": str(e)})
            sys.exit(1)
        finally:
            logger.info("Inference process shutting down")
            conn.close()
This mimics the old IPC system's single shared inference process that handles all STT, LLM, TTS, and VAD tasks for all agent jobs.
Ancestors
- BaseResource
- abc.ABC
Methods
async def health_check(self) ‑> bool-
Expand source code
async def health_check(self) -> bool:
    """Ping the dedicated inference process and wait briefly for a reply."""
    try:
        if self._shutdown or not self.process or not self.process.is_alive():
            return False
        # Send ping to inference process
        self.parent_conn.send({"type": "ping"})
        deadline = 5.0  # seconds to wait for the ping reply
        began = time.time()
        while (time.time() - began) < deadline:
            try:
                if self.parent_conn.poll():
                    reply = self.parent_conn.recv()
                    kind = reply.get("type")
                    if kind == "ping_response":
                        # Healthy: record the heartbeat timestamp.
                        self.last_heartbeat = time.time()
                        return True
                    if kind == "error":
                        logger.error(
                            f"Inference process error: {reply.get('error')}"
                        )
                        return False
                await asyncio.sleep(0.1)
            except Exception as e:
                logger.warning(f"Error checking inference process health: {e}")
        # No reply in time — process is unresponsive.
        logger.warning(f"Inference process {self.resource_id} health check timeout")
        return False
    except Exception as e:
        logger.error(
            f"Health check failed for inference process {self.resource_id}: {e}"
        )
        return False
Inherited members
class EOU (threshold: float = 0.7)-
Expand source code
class EOU(EventEmitter[Literal["error"]]):
    """Base class for End of Utterance Detection implementations."""

    def __init__(self, threshold: float = 0.7) -> None:
        super().__init__()
        self._label = f"{type(self).__module__}.{type(self).__name__}"
        self._threshold = threshold

    @property
    def label(self) -> str:
        """Identifier of this EOU implementation (module.ClassName)."""
        return self._label

    @property
    def threshold(self) -> float:
        """Current probability threshold for EOU decisions."""
        return self._threshold

    @abstractmethod
    def get_eou_probability(self, chat_context: ChatContext) -> float:
        """
        Score the given chat context for end-of-utterance likelihood.

        Args:
            chat_context: Chat context to analyze

        Returns:
            float: Probability score (0.0 to 1.0)
        """
        raise NotImplementedError

    def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
        """
        Decide whether the chat context marks an end of utterance.

        Args:
            chat_context: Chat context to analyze
            threshold: Optional threshold override

        Returns:
            bool: True if end of utterance is detected, False otherwise
        """
        effective = self._threshold if threshold is None else threshold
        return self.get_eou_probability(chat_context) >= effective

    def set_threshold(self, threshold: float) -> None:
        """Replace the default detection threshold."""
        self._threshold = threshold

    async def aclose(self) -> None:
        """Cleanup resources - should be overridden by subclasses to cleanup models"""
        logger.info(f"Cleaning up EOU: {self._label}")
        try:
            import gc
            gc.collect()
            logger.info(f"EOU garbage collection completed: {self._label}")
        except Exception as e:
            logger.error(f"Error during EOU garbage collection: {e}")
        logger.info(f"EOU cleanup completed: {self._label}")

    async def cleanup(self) -> None:
        """Cleanup resources - calls aclose for compatibility"""
        await self.aclose()
Ancestors
- EventEmitter
- typing.Generic
Instance variables
prop label : str-
Expand source code
@property
def label(self) -> str:
    """Identifier of this EOU implementation (module.ClassName)."""
    return self._label
prop threshold : float-
Expand source code
@property
def threshold(self) -> float:
    """Current probability threshold for EOU decisions."""
    return self._threshold
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None:
    """Release EOU resources; subclasses should override to free their models."""
    logger.info(f"Cleaning up EOU: {self._label}")
    try:
        import gc
        # Base implementation can only encourage a GC pass.
        gc.collect()
        logger.info(f"EOU garbage collection completed: {self._label}")
    except Exception as e:
        logger.error(f"Error during EOU garbage collection: {e}")
    logger.info(f"EOU cleanup completed: {self._label}")
async def cleanup(self) ‑> None-
Expand source code
async def cleanup(self) -> None:
    """Compatibility alias: delegates to aclose()."""
    await self.aclose()
def detect_end_of_utterance(self,
chat_context: ChatContext,
threshold: Optional[float] = None) ‑> bool-
Expand source code
def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
    """
    Decide whether the chat context marks an end of utterance.

    Args:
        chat_context: Chat context to analyze
        threshold: Optional threshold override

    Returns:
        bool: True if end of utterance is detected, False otherwise
    """
    effective = self._threshold if threshold is None else threshold
    return self.get_eou_probability(chat_context) >= effective
Args
chat_context- Chat context to analyze
threshold- Optional threshold override
Returns
bool- True if end of utterance is detected, False otherwise
def get_eou_probability(self,
chat_context: ChatContext) ‑> float-
Expand source code
@abstractmethod
def get_eou_probability(self, chat_context: ChatContext) -> float:
    """
    Score the given chat context for end-of-utterance likelihood.

    Args:
        chat_context: Chat context to analyze

    Returns:
        float: Probability score (0.0 to 1.0)
    """
    raise NotImplementedError
Args
chat_context- Chat context to analyze
Returns
float- Probability score (0.0 to 1.0)
def set_threshold(self, threshold: float) ‑> None-
Expand source code
def set_threshold(self, threshold: float) -> None:
    """Replace the default EOU detection threshold."""
    self._threshold = threshold
Inherited members
class EOUConfig (mode: Literal['ADAPTIVE', 'DEFAULT'] = 'DEFAULT',
min_max_speech_wait_timeout: List[float] | Tuple[float, float] = <factory>)-
Expand source code
@dataclass class EOUConfig: """Configuration for end-of-utterance detection behavior and speech wait timeouts.""" mode: Literal["ADAPTIVE", "DEFAULT"] = "DEFAULT" min_max_speech_wait_timeout: List[float] | Tuple[float, float] = field(default_factory=lambda: [0.5, 0.8]) def __post_init__(self): if not (isinstance(self.min_max_speech_wait_timeout, (list, tuple)) and len(self.min_max_speech_wait_timeout) == 2): raise ValueError("min_max_speech_wait_timeout must be a list or tuple of two floats") min_val, max_val = self.min_max_speech_wait_timeout if not (isinstance(min_val, (int, float)) and isinstance(max_val, (int, float))): raise ValueError("min_max_speech_wait_timeout values must be numbers") if min_val <= 0 or max_val <= 0: raise ValueError("min_max_speech_wait_timeout values must be greater than 0") if min_val >= max_val: raise ValueError("min_speech_wait_timeout must be less than max_speech_wait_timeout")Configuration for end-of-utterance detection behavior and speech wait timeouts.
Instance variables
var min_max_speech_wait_timeout : List[float] | Tuple[float, float]var mode : Literal['ADAPTIVE', 'DEFAULT']
class EncodeOptions (format: "Literal['JPEG', 'PNG']" = 'JPEG',
resize_options: ResizeOptions = <factory>,
quality: int = 90)-
Expand source code
@dataclass
class EncodeOptions:
    """Configuration settings for converting av.VideoFrame into standard image formats."""

    format: Literal["JPEG", "PNG"] = "JPEG"
    """The encoding format for the image."""

    resize_options: ResizeOptions = field(
        default_factory=lambda: ResizeOptions(width=320, height=240)
    )
    """Settings for adjusting the image size."""

    quality: int = 90
    """Compression level for the image, ranging from 0 to 100. Applicable only to JPEG."""
Instance variables
var format : Literal['JPEG', 'PNG']-
The encoding format for the image.
var quality : int-
Compression level for the image, ranging from 0 to 100. Applicable only to JPEG.
var resize_options : ResizeOptions-
Settings for adjusting the image size.
class EventEmitter-
Expand source code
class EventEmitter(Generic[T]):
    """A generic synchronous event emitter: register, remove, and invoke handlers."""

    def __init__(self) -> None:
        self._handlers: Dict[T, List[Callable[..., Any]]] = {}

    def on(
        self, event: T, callback: Callable[..., Any] | None = None
    ) -> Callable[..., Any]:
        """Register a sync handler for an event; usable directly or as a decorator."""

        def register(handler: Callable[..., Any]) -> Callable[..., Any]:
            if asyncio.iscoroutinefunction(handler):
                raise ValueError(
                    "Async handlers are not supported. Use a sync wrapper."
                )
            bucket = self._handlers.setdefault(event, [])
            if handler not in bucket:
                bucket.append(handler)
            return handler

        if callback is None:
            return register
        return register(callback)

    def off(self, event: T, callback: Callable[..., Any]) -> None:
        """Unregister a handler; silently ignores handlers that were never added."""
        handlers = self._handlers.get(event)
        if handlers is None:
            return
        try:
            handlers.remove(callback)
        except ValueError:
            pass
        if not handlers:
            del self._handlers[event]

    def emit(self, event: T, *args: Any) -> None:
        """Fire an event, calling every registered handler with the given args."""
        registered = self._handlers.get(event)
        if not registered:
            return
        # An argument-less emit still hands handlers a single empty dict.
        payload = args or ({},)
        for handler in list(registered):
            try:
                self._invoke(handler, payload)
            except Exception as ex:
                logger.error(f"Handler raised exception on event '{event}': {ex}")

    def _invoke(self, func: Callable[..., Any], args: tuple[Any, ...]) -> None:
        """Call a handler, trimming the argument list to its declared arity."""
        code = func.__code__
        argcount = code.co_argcount
        has_varargs = (code.co_flags & 0x04) != 0
        if argcount == 1 and hasattr(func, "__self__"):
            # Bound method taking only self: call with nothing.
            func()
        elif has_varargs:
            func(*args)
        else:
            func(*args[:argcount])
Ancestors
- typing.Generic
Subclasses
- Agent
- AgentSession
- AvatarInput
- ContentGeneration
- Denoise
- EOU
- EventBus
- LLM
- Pipeline
- PipelineOrchestrator
- RealtimeBaseModel
- RealtimeLLMAdapter
- SpeechGeneration
- SpeechUnderstanding
- STT
- TTS
- VAD
Methods
def emit(self, event: -T, *args: Any) ‑> None-
Expand source code
def emit(self, event: T, *args: Any) -> None:
    """Fire an event, calling every registered handler with the given args."""
    registered = self._handlers.get(event)
    if not registered:
        return
    # An argument-less emit still hands handlers a single empty dict.
    payload = args or ({},)
    for handler in list(registered):
        try:
            self._invoke(handler, payload)
        except Exception as ex:
            logger.error(f"Handler raised exception on event '{event}': {ex}")
def off(self, event: -T, callback: Callable[..., Any]) ‑> None-
Expand source code
def off(self, event: T, callback: Callable[..., Any]) -> None:
    """Unregister a handler; silently ignores handlers that were never added."""
    handlers = self._handlers.get(event)
    if handlers is None:
        return
    try:
        handlers.remove(callback)
    except ValueError:
        pass
    if not handlers:
        del self._handlers[event]
def on(self, event: -T, callback: Callable[..., Any] | None = None) ‑> Callable[..., Any]-
Expand source code
def on(
    self, event: T, callback: Callable[..., Any] | None = None
) -> Callable[..., Any]:
    """Register a sync handler for an event; usable directly or as a decorator."""

    def register(handler: Callable[..., Any]) -> Callable[..., Any]:
        # Coroutine functions would never be awaited by emit(); reject them.
        if asyncio.iscoroutinefunction(handler):
            raise ValueError(
                "Async handlers are not supported. Use a sync wrapper."
            )
        bucket = self._handlers.setdefault(event, [])
        if handler not in bucket:
            bucket.append(handler)
        return handler

    if callback is None:
        return register
    return register(callback)
class ExecutorType (*args, **kwds)-
Expand source code
class ExecutorType(Enum):
    """Kind of executor used for task processing."""

    # Run tasks in a separate OS process.
    PROCESS = "process"
    # Run tasks on a thread within the current process.
    THREAD = "thread"
Ancestors
- enum.Enum
Class variables
var PROCESSvar THREAD
class FallbackLLM (providers: List[LLM],
temporary_disable_sec: float = 60.0,
permanent_disable_after_attempts: int = 3)-
Expand source code
class FallbackLLM(LLM, FallbackBase): """LLM wrapper that automatically fails over to backup providers on errors and attempts recovery of higher-priority ones.""" def __init__(self, providers: List[LLM], temporary_disable_sec: float = 60.0, permanent_disable_after_attempts: int = 3): LLM.__init__(self) FallbackBase.__init__(self, providers, "LLM", temporary_disable_sec=temporary_disable_sec, permanent_disable_after_attempts=permanent_disable_after_attempts) self._setup_event_listeners() def _setup_event_listeners(self): self.active_provider.on("error", self._on_provider_error) def _on_provider_error(self, error_msg): failed_p = self.active_provider asyncio.create_task(self._handle_async_error(str(error_msg), failed_p)) async def _handle_async_error(self, error_msg: str, failed_provider: Any): switched = await self._switch_provider(f"Async Error: {error_msg}", failed_provider=failed_provider) self.emit("error", error_msg) async def _switch_provider(self, reason: str, failed_provider: Any = None): provider_to_cleanup = failed_provider if failed_provider else self.active_provider try: provider_to_cleanup.off("error", self._on_provider_error) except: pass active_before = self.active_provider switched = await super()._switch_provider(reason, failed_provider) active_after = self.active_provider if switched: if active_before != active_after: self.active_provider.on("error", self._on_provider_error) return True return False async def chat(self, messages: ChatContext, **kwargs) -> AsyncIterator[LLMResponse]: """ Attempts to chat with current provider. Loops until one succeeds or all fail. Checks for recovery of primary providers before starting. 
""" self.check_recovery() while True: current_provider = self.active_provider try: async for chunk in current_provider.chat(messages, **kwargs): yield chunk return except Exception as e: switched = await self._switch_provider(str(e), failed_provider=current_provider) self.emit("error", str(e)) if not switched: raise e async def cancel_current_generation(self) -> None: await self.active_provider.cancel_current_generation() async def aclose(self) -> None: for p in self.providers: await p.aclose() await super().aclose()LLM wrapper that automatically fails over to backup providers on errors and attempts recovery of higher-priority ones.
Initialize the LLM base class.
Ancestors
- LLM
- EventEmitter
- typing.Generic
- FallbackBase
Methods
async def chat(self,
messages: ChatContext,
**kwargs) ‑> AsyncIterator[LLMResponse]-
Expand source code
async def chat(self, messages: ChatContext, **kwargs) -> AsyncIterator[LLMResponse]: """ Attempts to chat with current provider. Loops until one succeeds or all fail. Checks for recovery of primary providers before starting. """ self.check_recovery() while True: current_provider = self.active_provider try: async for chunk in current_provider.chat(messages, **kwargs): yield chunk return except Exception as e: switched = await self._switch_provider(str(e), failed_provider=current_provider) self.emit("error", str(e)) if not switched: raise eAttempts to chat with current provider. Loops until one succeeds or all fail. Checks for recovery of primary providers before starting.
Inherited members
class FallbackSTT (providers: List[STT],
temporary_disable_sec: float = 60.0,
permanent_disable_after_attempts: int = 3)-
Expand source code
class FallbackSTT(STT, FallbackBase): """STT wrapper that automatically fails over to backup providers on errors and attempts recovery of higher-priority ones.""" def __init__(self, providers: List[STT], temporary_disable_sec: float = 60.0, permanent_disable_after_attempts: int = 3): STT.__init__(self) FallbackBase.__init__(self, providers, "STT", temporary_disable_sec=temporary_disable_sec, permanent_disable_after_attempts=permanent_disable_after_attempts) self._transcript_callback = None self._setup_event_listeners() def _setup_event_listeners(self): """Attach error listener to the currently active provider.""" self.active_provider.on("error", self._on_provider_error) def _on_provider_error(self, error_msg): """Handle async errors (e.g. WebSocket disconnects)""" failed_p = self.active_provider asyncio.create_task(self._handle_async_error(str(error_msg), failed_p)) async def _handle_async_error(self, error_msg: str, failed_provider: Any): """Async wrapper to handle switching logic""" switched = await self._switch_provider(f"Async Error: {error_msg}", failed_provider=failed_provider) self.emit("error", error_msg) async def _switch_provider(self, reason: str, failed_provider: Any = None): """Override switch to handle STT specific setup""" provider_to_cleanup = failed_provider if failed_provider else self.active_provider try: provider_to_cleanup.off("error", self._on_provider_error) except: pass active_before = self.active_provider switched = await super()._switch_provider(reason, failed_provider) active_after = self.active_provider if switched: if active_before != active_after: if self._transcript_callback: self.active_provider.on_stt_transcript(self._transcript_callback) self.active_provider.on("error", self._on_provider_error) return True return False def on_stt_transcript(self, callback) -> None: """Capture the callback so we can re-apply it after switching.""" self._transcript_callback = callback self.active_provider.on_stt_transcript(callback) async def 
process_audio(self, audio_frames: bytes, **kwargs) -> None: """ Main entry point. If this fails, it's usually a connection error. We catch, switch, and retry immediately. """ if self.check_recovery(): if self._transcript_callback: self.active_provider.on_stt_transcript(self._transcript_callback) self.active_provider.on("error", self._on_provider_error) current_provider = self.active_provider try: await current_provider.process_audio(audio_frames, **kwargs) except Exception as e: switched = await self._switch_provider(str(e), failed_provider=current_provider) self.emit("error", str(e)) if switched: await self.active_provider.process_audio(audio_frames, **kwargs) else: raise e async def aclose(self) -> None: """Close all providers.""" for p in self.providers: await p.aclose() await super().aclose()STT wrapper that automatically fails over to backup providers on errors and attempts recovery of higher-priority ones.
Ancestors
- STT
- EventEmitter
- typing.Generic
- FallbackBase
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None: """Close all providers.""" for p in self.providers: await p.aclose() await super().aclose()Close all providers.
def on_stt_transcript(self, callback) ‑> None-
Expand source code
def on_stt_transcript(self, callback) -> None: """Capture the callback so we can re-apply it after switching.""" self._transcript_callback = callback self.active_provider.on_stt_transcript(callback)Capture the callback so we can re-apply it after switching.
async def process_audio(self, audio_frames: bytes, **kwargs) ‑> None-
Expand source code
async def process_audio(self, audio_frames: bytes, **kwargs) -> None: """ Main entry point. If this fails, it's usually a connection error. We catch, switch, and retry immediately. """ if self.check_recovery(): if self._transcript_callback: self.active_provider.on_stt_transcript(self._transcript_callback) self.active_provider.on("error", self._on_provider_error) current_provider = self.active_provider try: await current_provider.process_audio(audio_frames, **kwargs) except Exception as e: switched = await self._switch_provider(str(e), failed_provider=current_provider) self.emit("error", str(e)) if switched: await self.active_provider.process_audio(audio_frames, **kwargs) else: raise eMain entry point. If this fails, it's usually a connection error. We catch, switch, and retry immediately.
Inherited members
class FallbackTTS (providers: List[TTS],
temporary_disable_sec: float = 60.0,
permanent_disable_after_attempts: int = 3)-
Expand source code
class FallbackTTS(TTS, FallbackBase): """TTS wrapper that automatically fails over to backup providers on errors and attempts recovery of higher-priority ones.""" def __init__(self, providers: List[TTS], temporary_disable_sec: float = 60.0, permanent_disable_after_attempts: int = 3): TTS.__init__( self, sample_rate=providers[0].sample_rate, num_channels=providers[0].num_channels ) FallbackBase.__init__(self, providers, "TTS", temporary_disable_sec=temporary_disable_sec, permanent_disable_after_attempts=permanent_disable_after_attempts) self._initializing = False self._setup_event_listeners() def _setup_event_listeners(self): self.active_provider.on("error", self._on_provider_error) def _on_provider_error(self, error_msg): failed_p = self.active_provider asyncio.create_task(self._handle_async_error(str(error_msg), failed_p)) async def _handle_async_error(self, error_msg: str, failed_provider: Any): switched = await self._switch_provider(f"Async Error: {error_msg}", failed_provider=failed_provider) self.emit("error", error_msg) async def _switch_provider(self, reason: str, failed_provider: Any = None): provider_to_cleanup = failed_provider if failed_provider else self.active_provider try: provider_to_cleanup.off("error", self._on_provider_error) except: pass active_before = self.active_provider switched = await super()._switch_provider(reason, failed_provider) active_after = self.active_provider if switched: if active_before != active_after: self.active_provider.on("error", self._on_provider_error) if hasattr(self, "loop") and self.loop and hasattr(self, "audio_track") and self.audio_track: self._propagate_settings(self.active_provider) return True return False def _propagate_settings(self, provider): """Helper to set loop/audio_track on a provider.""" try: name = "loop" value = self.loop if hasattr(provider, f"_set_{name}"): getattr(provider, f"_set_{name}")(value) else: setattr(provider, name, value) name = "audio_track" value = self.audio_track if 
hasattr(provider, f"_set_{name}"): getattr(provider, f"_set_{name}")(value) else: setattr(provider, name, value) except Exception as e: logger.warning(f"[TTS] Failed to propagate settings to {provider.label}: {e}") def __setattr__(self, name: str, value: Any) -> None: """ Intercept attribute assignments to propagate loop and audio_track to all providers. This allows FallbackTTS to work without CascadingPipeline needing to know about it. """ object.__setattr__(self, name, value) if name in ("loop", "audio_track") and hasattr(self, "providers") and not getattr(self, "_initializing", False): logger.info(f"[TTS] FallbackTTS: {name} was set to {value}, propagating to all providers") for provider in self.providers: try: if hasattr(provider, f"_set_{name}"): getattr(provider, f"_set_{name}")(value) else: setattr(provider, name, value) logger.info(f"[TTS] Set {name} on provider {provider.label}") except Exception as e: logger.warning(f"[TTS] Failed to set {name} on provider {provider.label}: {e}") def _set_loop_and_audio_track(self, loop, audio_track): """ Optional method for explicit setup (for compatibility). Setting these attributes will trigger __setattr__ which propagates to all providers. """ logger.info(f"[TTS] _set_loop_and_audio_track called on FallbackTTS. loop={loop}, audio_track={audio_track}") self.loop = loop self.audio_track = audio_track async def synthesize(self, text, **kwargs) -> None: """ Try active provider. If exception, switch and retry with same text. Checks for recovery of primary providers before starting. 
""" if self.check_recovery(): if hasattr(self, "loop") and self.loop and hasattr(self, "audio_track") and self.audio_track: self._propagate_settings(self.active_provider) while True: current_provider = self.active_provider try: logger.info(f"[TTS] Attempting synthesis with {current_provider.label}") await current_provider.synthesize(text, **kwargs) logger.info(f"[TTS] Synthesis successful with {current_provider.label}") return except Exception as e: logger.error(f"[TTS] Synthesis failed with {current_provider.label}: {e}") self.emit("error", str(e)) switched = await self._switch_provider(str(e), failed_provider=current_provider) if not switched: logger.error(f"[TTS] All providers exhausted. Raising error.") raise e logger.info(f"[TTS] Retrying with new provider: {self.active_provider.label}") async def interrupt(self): if self.active_provider: await self.active_provider.interrupt() async def aclose(self): for p in self.providers: await p.aclose() await super().aclose()TTS wrapper that automatically fails over to backup providers on errors and attempts recovery of higher-priority ones.
Ancestors
- TTS
- EventEmitter
- typing.Generic
- FallbackBase
Methods
async def synthesize(self, text, **kwargs) ‑> None-
Expand source code
async def synthesize(self, text, **kwargs) -> None: """ Try active provider. If exception, switch and retry with same text. Checks for recovery of primary providers before starting. """ if self.check_recovery(): if hasattr(self, "loop") and self.loop and hasattr(self, "audio_track") and self.audio_track: self._propagate_settings(self.active_provider) while True: current_provider = self.active_provider try: logger.info(f"[TTS] Attempting synthesis with {current_provider.label}") await current_provider.synthesize(text, **kwargs) logger.info(f"[TTS] Synthesis successful with {current_provider.label}") return except Exception as e: logger.error(f"[TTS] Synthesis failed with {current_provider.label}: {e}") self.emit("error", str(e)) switched = await self._switch_provider(str(e), failed_provider=current_provider) if not switched: logger.error(f"[TTS] All providers exhausted. Raising error.") raise e logger.info(f"[TTS] Retrying with new provider: {self.active_provider.label}")Try active provider. If exception, switch and retry with same text. Checks for recovery of primary providers before starting.
Inherited members
class FunctionCall (**data: Any)-
Expand source code
class FunctionCall(BaseModel): """ Represents a function call in the chat context. Attributes: id (str): Unique identifier for the function call. Auto-generated if not provided. type (Literal["function_call"]): Type identifier, always "function_call". name (str): Name of the function to be called. arguments (str): JSON string containing the function arguments. call_id (str): Unique identifier linking this call to its output. metadata (Optional[dict]): Provider-specific metadata, e.g. Gemini thought_signature bytes stored as base64 string, or Anthropic cache control markers. """ id: str = Field(default_factory=lambda: f"call_{int(time.time())}") type: Literal["function_call"] = "function_call" name: str arguments: str call_id: str metadata: Optional[dict] = NoneRepresents a function call in the chat context.
Attributes
id:str- Unique identifier for the function call. Auto-generated if not provided.
- type (Literal["function_call"]): Type identifier, always "function_call".
name:str- Name of the function to be called.
arguments:str- JSON string containing the function arguments.
call_id:str- Unique identifier linking this call to its output.
metadata:Optional[dict]- Provider-specific metadata, e.g. Gemini thought_signature bytes stored as base64 string, or Anthropic cache control markers.
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError (pydantic_core.ValidationError) if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var arguments : strvar call_id : strvar id : strvar metadata : dict | Nonevar model_configvar name : strvar type : Literal['function_call']
class FunctionCallOutput (**data: Any)-
Expand source code
class FunctionCallOutput(BaseModel): """ Represents the output of a function call. Attributes: id (str): Unique identifier for the function output. Auto-generated if not provided. type (Literal["function_call_output"]): Type identifier, always "function_call_output". name (str): Name of the function that was called. call_id (str): Identifier linking this output to the original function call. output (str): The result or output from the function execution. is_error (bool): Flag indicating if the function execution failed. """ id: str = Field(default_factory=lambda: f"output_{int(time.time())}") type: Literal["function_call_output"] = "function_call_output" name: str call_id: str output: str is_error: bool = FalseRepresents the output of a function call.
Attributes
id:str- Unique identifier for the function output. Auto-generated if not provided.
- type (Literal["function_call_output"]): Type identifier, always "function_call_output".
name:str- Name of the function that was called.
call_id:str- Identifier linking this output to the original function call.
output:str- The result or output from the function execution.
is_error:bool- Flag indicating if the function execution failed.
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError (pydantic_core.ValidationError) if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var call_id : strvar id : strvar is_error : boolvar model_configvar name : strvar output : strvar type : Literal['function_call_output']
class FunctionTool (*args, **kwargs)-
Expand source code
@runtime_checkable class FunctionTool(Protocol): @property @abstractmethod def _tool_info(self) -> "FunctionToolInfo": ... def __call__(self, *args: Any, **kwargs: Any) -> Any: ...Base class for protocol classes.
Protocol classes are defined as::
class Proto(Protocol): def meth(self) -> int: ...Such classes are primarily used with static type checkers that recognize structural subtyping (static duck-typing).
For example::
class C: def meth(self) -> int: return 0 def func(x: Proto) -> int: return x.meth() func(C()) # Passes static type checkSee PEP 544 for details. Protocol classes decorated with @typing.runtime_checkable act as simple-minded runtime protocols that check only the presence of given attributes, ignoring their type signatures. Protocol classes can be generic, they are defined as::
class GenProto[T](Protocol): def meth(self) -> T: ...Ancestors
- typing.Protocol
- typing.Generic
class FunctionToolInfo (name: str,
description: str | None = None,
parameters_schema: Optional[dict] = None)-
Expand source code
@dataclass class FunctionToolInfo: """Holds metadata about a function tool, including its name, description, and parameter schema.""" name: str description: str | None = None parameters_schema: Optional[dict] = NoneHolds metadata about a function tool, including its name, description, and parameter schema.
Instance variables
var description : str | Nonevar name : strvar parameters_schema : dict | None
class HealthMetrics (resource_id: str,
timestamp: float,
memory_usage_mb: float,
cpu_usage_percent: float,
active_tasks: int,
response_time_ms: float,
error_count: int = 0,
success_count: int = 0)-
Expand source code
@dataclass class HealthMetrics: """Health metrics for resource monitoring.""" resource_id: str timestamp: float memory_usage_mb: float cpu_usage_percent: float active_tasks: int response_time_ms: float error_count: int = 0 success_count: int = 0Health metrics for resource monitoring.
Instance variables
var active_tasks : intvar cpu_usage_percent : floatvar error_count : intvar memory_usage_mb : floatvar resource_id : strvar response_time_ms : floatvar success_count : intvar timestamp : float
class ImageContent (**data: Any)-
Expand source code
class ImageContent(BaseModel): """ Represents image content in chat messages. Attributes: id (str): Unique identifier for the image. Auto-generated if not provided. type (Literal["image"]): Type identifier, always "image". image (Union[av.VideoFrame, str]): The image data as VideoFrame or URL string. inference_detail (Literal["auto", "high", "low"]): Detail level for LLM image analysis. encode_options (EncodeOptions): Configuration for image encoding and resizing. """ id: str = Field(default_factory=lambda: f"img_{int(time.time())}") type: Literal["image"] = "image" image: Union[av.VideoFrame, str] inference_detail: Literal["auto", "high", "low"] = "auto" encode_options: EncodeOptions = Field( default_factory=lambda: EncodeOptions( format="JPEG", quality=90, resize_options=ResizeOptions( width=320, height=240 ), ) ) class Config: arbitrary_types_allowed = True def to_data_url(self) -> str: """ Convert the image to a data URL format. Returns: str: A data URL string representing the image. """ if isinstance(self.image, str): return self.image encoded_image = images.encode(self.image, self.encode_options) b64_image = base64.b64encode(encoded_image).decode("utf-8") return f"data:image/{self.encode_options.format.lower()};base64,{b64_image}"Represents image content in chat messages.
Attributes
id:str- Unique identifier for the image. Auto-generated if not provided.
- type (Literal["image"]): Type identifier, always "image".
image:Union[av.VideoFrame, str]- The image data as VideoFrame or URL string.
- inference_detail (Literal["auto", "high", "low"]): Detail level for LLM image analysis.
encode_options:EncodeOptions- Configuration for image encoding and resizing.
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError (pydantic_core.ValidationError) if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var Configvar encode_options : EncodeOptionsvar id : strvar image : av.video.frame.VideoFrame | strvar inference_detail : Literal['auto', 'high', 'low']var model_configvar type : Literal['image']
Methods
def to_data_url(self) ‑> str-
Expand source code
def to_data_url(self) -> str: """ Convert the image to a data URL format. Returns: str: A data URL string representing the image. """ if isinstance(self.image, str): return self.image encoded_image = images.encode(self.image, self.encode_options) b64_image = base64.b64encode(encoded_image).decode("utf-8") return f"data:image/{self.encode_options.format.lower()};base64,{b64_image}"Convert the image to a data URL format.
Returns
str- A data URL string representing the image.
class InterruptConfig (mode: Literal['VAD_ONLY', 'STT_ONLY', 'HYBRID'] = 'HYBRID',
interrupt_min_duration: float = 0.5,
interrupt_min_words: int = 2,
false_interrupt_pause_duration: float = 2.0,
resume_on_false_interrupt: bool = False)-
Expand source code
@dataclass class InterruptConfig: """Configuration for interruption handling, including mode, duration thresholds, and false interrupt behavior.""" mode: Literal["VAD_ONLY", "STT_ONLY", "HYBRID"] = "HYBRID" interrupt_min_duration: float = 0.5 interrupt_min_words: int = 2 false_interrupt_pause_duration: float = 2.0 resume_on_false_interrupt: bool = False def __post_init__(self): if self.interrupt_min_duration <= 0: raise ValueError("interrupt_min_duration must be greater than 0") if self.interrupt_min_words <= 0: raise ValueError("interrupt_min_words must be greater than 0") if self.false_interrupt_pause_duration <= 0: raise ValueError("false_interrupt_pause_duration must be greater than 0")Configuration for interruption handling, including mode, duration thresholds, and false interrupt behavior.
Instance variables
var false_interrupt_pause_duration : floatvar interrupt_min_duration : floatvar interrupt_min_words : intvar mode : Literal['VAD_ONLY', 'STT_ONLY', 'HYBRID']var resume_on_false_interrupt : bool
class JobContext (*,
room_options: RoomOptions,
metadata: dict | None = None,
loop: asyncio.events.AbstractEventLoop | None = None)-
Expand source code
class JobContext: """Holds the runtime state for a single job, including room connection, pipeline, and shutdown lifecycle management.""" def __init__( self, *, room_options: RoomOptions, metadata: Optional[dict] = None, loop: Optional[asyncio.AbstractEventLoop] = None, ) -> None: self.room_options = room_options self.metadata = metadata or {} self._loop = loop or asyncio.get_event_loop() self._pipeline: Optional["Pipeline"] = None self.videosdk_auth = self.room_options.auth_token or os.getenv( "VIDEOSDK_AUTH_TOKEN" ) self.room: Optional["BaseTransportHandler"] = None self._shutdown_callbacks: list[Callable[[], Coroutine[None, None, None]]] = [] self._is_shutting_down: bool = False self._meeting_joined_event: asyncio.Event = asyncio.Event() self._wait_for_meeting_join: bool = False self.want_console = len(sys.argv) > 1 and sys.argv[1].lower() == "console" self.playground_manager: Optional["PlaygroundManager"] = None from .metrics import metrics_collector self.metrics_collector = metrics_collector self._log_manager = None self._job_logger = None def _set_pipeline_internal(self, pipeline: Any) -> None: """Internal method called by pipeline constructors""" self._pipeline = pipeline if self.room: self.room.pipeline = pipeline if hasattr(pipeline, "_set_loop_and_audio_track"): pipeline._set_loop_and_audio_track(self._loop, self.room.audio_track) # Ensure our lambda function fix is preserved after pipeline setup # This prevents the pipeline from overriding our event handlers if hasattr(self.room, "meeting") and self.room.meeting: # Re-apply our lambda function fix to ensure it's not overridden self.room.meeting.add_event_listener( self.room._create_meeting_handler() ) async def connect(self) -> None: """Connect to the room""" if self.room_options: custom_camera_video_track = None custom_microphone_audio_track = None sinks = [] avatar = self.room_options.avatar if not avatar and self._pipeline and hasattr(self._pipeline, "avatar"): avatar = self._pipeline.avatar if 
avatar: if not self.room_options.room_id: self.room_options.room_id = self.get_room_id() room_id = self.room_options.room_id from .avatar import AvatarAudioOut, generate_avatar_credentials if isinstance(avatar, AvatarAudioOut): avatar.set_room_id(room_id) await avatar.connect() audio_out = avatar else: _api_key = os.getenv("VIDEOSDK_API_KEY") _secret_key = os.getenv("VIDEOSDK_SECRET_KEY") credentials = generate_avatar_credentials( _api_key, _secret_key, participant_id=avatar.participant_id ) await avatar.connect(room_id, credentials.token) audio_out = AvatarAudioOut(credentials=credentials, room_id=room_id) await audio_out.connect() # no-op (no dispatcher_url) custom_camera_video_track = getattr(avatar, 'video_track', None) custom_microphone_audio_track = getattr(avatar, 'audio_track', None) sinks.append(audio_out) self._cloud_avatar = avatar if not isinstance(avatar, AvatarAudioOut) else None self._avatar_audio_out = audio_out if self._pipeline: self._pipeline.avatar = audio_out if self.want_console: from .console_mode import setup_console_voice_for_ctx if not self._pipeline: raise RuntimeError( "Pipeline must be constructed before ctx.connect() in console mode" ) cleanup_callback = await setup_console_voice_for_ctx(self) self.add_shutdown_callback(cleanup_callback) else: self.metrics_collector.transport_mode = self.room_options.transport_mode self.metrics_collector.analytics_client.configure(self.room_options.metrics) if self.room_options.transport_mode == TransportMode.VIDEOSDK: from .room.room import VideoSDKHandler if not self.room_options.room_id: self.room_options.room_id = self.get_room_id() if self.room_options.send_logs_to_dashboard or (self.room_options.logs and self.room_options.logs.enabled): from .metrics.logger_handler import LogManager, JobLogger self._log_manager = LogManager() self._log_manager.start(auth_token=self.videosdk_auth or "") self._job_logger = JobLogger( queue=self._log_manager.get_queue(), room_id=self.room_options.room_id or "", 
peer_id=self.room_options.agent_participant_id or "agent", auth_token=self.videosdk_auth or "", dashboard_log_level=self.room_options.dashboard_log_level if not (self.room_options.logs and self.room_options.logs.level) else self.room_options.logs.level, send_logs_to_dashboard=True, ) if self.room_options.join_meeting: validate_room_options_recording(self.room_options) record_audio_resolved, record_screen_share = resolve_video_sdk_recording( self.room_options ) agent_id = self._pipeline.agent.id if self._pipeline and hasattr(self._pipeline, 'agent') else None self.room = VideoSDKHandler( meeting_id=self.room_options.room_id, auth_token=self.videosdk_auth, name=self.room_options.name, agent_participant_id=self.room_options.agent_participant_id, agent_id=agent_id, pipeline=self._pipeline, loop=self._loop, vision=self.room_options.vision, recording=self.room_options.recording, record_audio=record_audio_resolved, record_screen_share=record_screen_share, custom_camera_video_track=custom_camera_video_track, custom_microphone_audio_track=custom_microphone_audio_track, audio_sinks=sinks, background_audio=self.room_options.background_audio, on_room_error=self.room_options.on_room_error, auto_end_session=self.room_options.auto_end_session, session_timeout_seconds=self.room_options.session_timeout_seconds, no_participant_timeout_seconds=self.room_options.no_participant_timeout_seconds, signaling_base_url=self.room_options.signaling_base_url, job_logger=self._job_logger, traces_options=self.room_options.traces, metrics_options=self.room_options.metrics, logs_options=self.room_options.logs, avatar_participant_id=avatar.participant_id if avatar and hasattr(avatar, 'participant_id') else None, ) if self._pipeline and hasattr( self._pipeline, "_set_loop_and_audio_track" ): self._pipeline._set_loop_and_audio_track( self._loop, self.room.audio_track ) elif self.room_options.transport_mode == TransportMode.WEBSOCKET: if not self.room_options.websocket: raise ValueError("WebSocket 
configuration (websocket) is required when mode is WEBSOCKET") if self.room_options.webrtc and (self.room_options.webrtc.signaling_url or self.room_options.webrtc.ice_servers != [{"urls": "stun:stun.l.google.com:19302"}]): logger.warning("WebRTC configuration provided but transport mode is set to WEBSOCKET. WebRTC config will be ignored.") from .transports.websocket_handler import WebSocketTransportHandler self.room = WebSocketTransportHandler( loop=self._loop, pipeline=self._pipeline, port=self.room_options.websocket.port, path=self.room_options.websocket.path ) elif self.room_options.transport_mode == TransportMode.WEBRTC: if not self.room_options.webrtc: raise ValueError("WebRTC configuration (webrtc) is required when mode is WEBRTC") if not self.room_options.webrtc.signaling_url: raise ValueError("WebRTC signaling_url is required when mode is WEBRTC") if self.room_options.websocket and (self.room_options.websocket.port != 8080 or self.room_options.websocket.path != "/ws"): logger.warning("WebSocket configuration provided but connection mode is set to WEBRTC. WebSocket config will be ignored.") from .transports.webrtc_handler import WebRTCTransportHandler self.room = WebRTCTransportHandler( loop=self._loop, pipeline=self._pipeline, signaling_url=self.room_options.webrtc.signaling_url, ice_servers=self.room_options.webrtc.ice_servers ) elif self.room_options.transport_mode == TransportMode.VIDEOSDK: if self.room_options.websocket and (self.room_options.websocket.port != 8080 or self.room_options.websocket.path != "/ws"): logger.warning("WebSocket configuration provided but transport mode is VIDEOSDK. WebSocket config will be ignored.") if self.room_options.webrtc and (self.room_options.webrtc.signaling_url or self.room_options.webrtc.ice_servers != [{"urls": "stun:stun.l.google.com:19302"}]): logger.warning("WebRTC configuration provided but transport mode is VIDEOSDK. 
WebRTC config will be ignored.") if self.room: await self.room.connect() # For Non-VideoSDK modes, we still need to ensure audio track is linked if not done inside constructor if ( self.room_options.transport_mode != TransportMode.VIDEOSDK and self._pipeline and hasattr(self._pipeline, "_set_loop_and_audio_track") ): # BaseTransportHandler subclasses now initialize self.audio_track if self.room.audio_track: self._pipeline._set_loop_and_audio_track(self._loop, self.room.audio_track) if ( self.room_options.playground and self.room_options.join_meeting and not self.want_console and self.room_options.transport_mode == TransportMode.VIDEOSDK ): if self.videosdk_auth: playground_url = f"https://playground.videosdk.live?token={self.videosdk_auth}&meetingId={self.room_options.room_id}" print(f"\033[1;36m" + "Agent started in playground mode" + "\033[0m") print("\033[1;75m" + "Interact with agent here at:" + "\033[0m") print("\033[1;4;94m" + playground_url + "\033[0m") else: raise ValueError("VIDEOSDK_AUTH_TOKEN environment variable not found") async def shutdown(self) -> None: """Called by Worker during graceful shutdown""" if self._is_shutting_down: logger.info("JobContext already shutting down") return self._is_shutting_down = True logger.info("JobContext shutting down") for callback in self._shutdown_callbacks: try: await callback() except Exception as e: logger.error(f"Error in shutdown callback: {e}") if self._pipeline: try: await self._pipeline.cleanup() except Exception as e: logger.error(f"Error during pipeline cleanup: {e}") self._pipeline = None cloud_avatar = getattr(self, '_cloud_avatar', None) if cloud_avatar and hasattr(cloud_avatar, 'aclose'): try: await cloud_avatar.aclose() except Exception as e: logger.error(f"Error during cloud avatar aclose: {e}") audio_out = getattr(self, '_avatar_audio_out', None) if audio_out: try: await audio_out.aclose() except Exception as e: logger.error(f"Error during avatar audio_out aclose: {e}") if self._job_logger: try: 
self._job_logger.cleanup() except Exception as e: logger.error(f"Error during job logger cleanup: {e}") self._job_logger = None if self._log_manager: try: self._log_manager.stop() except Exception as e: logger.error(f"Error during log manager stop: {e}") self._log_manager = None if self.room: try: if not getattr(self.room, "_left", False): await self.room.leave() else: logger.info("Room already left, skipping room.leave()") except Exception as e: logger.error(f"Error during room leave: {e}") try: if hasattr(self.room, "cleanup"): await self.room.cleanup() except Exception as e: logger.error(f"Error during room cleanup: {e}") self.room = None self.room_options = None self._loop = None self.videosdk_auth = None self._shutdown_callbacks.clear() logger.info("JobContext cleaned up") def add_shutdown_callback( self, callback: Callable[[], Coroutine[None, None, None]] ) -> None: """Add a callback to be called during shutdown""" self._shutdown_callbacks.append(callback) def notify_meeting_joined(self) -> None: """Called when the agent successfully joins the meeting.""" self._meeting_joined_event.set() audio_out = getattr(self, '_avatar_audio_out', None) if audio_out and self.room and self.room.meeting: audio_out._set_meeting(self.room.meeting) async def wait_for_meeting_joined(self, timeout: float = 30.0) -> bool: """Wait until the meeting is joined or timeout. Returns True if joined.""" try: await asyncio.wait_for(self._meeting_joined_event.wait(), timeout=timeout) return True except asyncio.TimeoutError: logger.warning(f"Timeout waiting for meeting join after {timeout}s") return False async def wait_for_participant(self, participant_id: str | None = None) -> str: if self.room: return await self.room.wait_for_participant(participant_id) else: raise ValueError("Room not initialized") async def run_until_shutdown( self, session: Any = None, wait_for_participant: bool = False, ) -> None: """ Simplified helper that handles all cleanup boilerplate. This method: 1. 
Connects to the room 2. Sets up session end callbacks 3. Waits for participant (optional) 4. Starts the session 5. Waits for shutdown signal 6. Cleans up gracefully Args: session: AgentSession to manage (will call session.start() and session.close()) wait_for_participant: Whether to wait for a participant before starting Example: ```python async def entrypoint(ctx: JobContext): session = AgentSession(agent=agent, pipeline=pipeline) await ctx.run_until_shutdown(session=session, wait_for_participant=True) ``` """ shutdown_event = asyncio.Event() if session: async def cleanup_session(): logger.info("Cleaning up session...") try: await session.close() except Exception as e: logger.error(f"Error closing session in cleanup: {e}") shutdown_event.set() self.add_shutdown_callback(cleanup_session) else: async def cleanup_no_session(): logger.info("Shutdown called, no session to clean up") shutdown_event.set() self.add_shutdown_callback(cleanup_no_session) def on_session_end(reason: str): logger.info(f"Session ended: {reason}") asyncio.create_task(self.shutdown()) try: try: await self.connect() except Exception as e: logger.error(f"Error connecting to room: {e}") raise if self.room: try: self.room.setup_session_end_callback(on_session_end) logger.info("Session end callback configured") except Exception as e: logger.warning(f"Error setting up session end callback: {e}") else: logger.warning( "Room not available, session end callback not configured" ) if wait_for_participant and self.room: try: logger.info("Waiting for participant...") participant_id = await self.room.wait_for_participant() if participant_id is None: logger.info("Session ended before any participant joined, shutting down") return logger.info("Participant joined") except Exception as e: logger.error(f"Error waiting for participant: {e}") raise if session: try: await session.start() logger.info("Agent session started") except Exception as e: logger.error(f"Error starting session: {e}") raise logger.info( "Agent 
is running... (will exit when session ends or on interrupt)" ) await shutdown_event.wait() logger.info("Shutdown event received, exiting gracefully...") except KeyboardInterrupt: logger.info("Keyboard interrupt received, shutting down...") except Exception as e: logger.error(f"Unexpected error in run_until_shutdown: {e}") raise finally: if session: try: await session.close() except Exception as e: logger.error(f"Error closing session in finally: {e}") try: await self.shutdown() except Exception as e: logger.error(f"Error in ctx.shutdown: {e}") def get_room_id(self) -> str: """ Creates a new room using the VideoSDK API and returns the room ID. Raises: ValueError: If the VIDEOSDK_AUTH_TOKEN is missing. RuntimeError: If the API request fails or the response is invalid. """ if self.want_console: return None if self.videosdk_auth: base_url = self.room_options.signaling_base_url url = f"https://{base_url}/v2/rooms" headers = {"Authorization": self.videosdk_auth} try: response = requests.post(url, headers=headers) response.raise_for_status() except requests.RequestException as e: raise RuntimeError(f"Failed to create room: {e}") from e data = response.json() room_id = data.get("roomId") if not room_id: raise RuntimeError(f"Unexpected API response, missing roomId: {data}") return room_id else: raise ValueError( "VIDEOSDK_AUTH_TOKEN not found. " "Set it as an environment variable or provide it in room options via auth_token." )Holds the runtime state for a single job, including room connection, pipeline, and shutdown lifecycle management.
Methods
def add_shutdown_callback(self, callback: Callable[[], Coroutine[None, None, None]]) ‑> None-
Expand source code
def add_shutdown_callback(
    self, callback: Callable[[], Coroutine[None, None, None]]
) -> None:
    """Register an async callback to be invoked during shutdown.

    Args:
        callback: Zero-argument coroutine function awaited by ``shutdown()``.
    """
    registered = self._shutdown_callbacks
    registered.append(callback)
async def connect(self) ‑> None-
Expand source code
async def connect(self) -> None:
    """Connect to the room.

    Resolves the avatar (explicit option or pipeline-provided), then either
    wires up console mode or creates the transport handler matching the
    configured ``transport_mode`` (VIDEOSDK / WEBSOCKET / WEBRTC), connects
    it, and finally prints the playground URL when applicable.

    Raises:
        RuntimeError: Console mode requested before the pipeline was built.
        ValueError: Required transport configuration is missing, or the
            playground is enabled without an auth token.
    """
    if self.room_options:
        custom_camera_video_track = None
        custom_microphone_audio_track = None
        sinks = []
        avatar = self.room_options.avatar
        # Fall back to an avatar attached directly to the pipeline.
        if not avatar and self._pipeline and hasattr(self._pipeline, "avatar"):
            avatar = self._pipeline.avatar
        if avatar:
            if not self.room_options.room_id:
                self.room_options.room_id = self.get_room_id()
            room_id = self.room_options.room_id
            from .avatar import AvatarAudioOut, generate_avatar_credentials
            if isinstance(avatar, AvatarAudioOut):
                avatar.set_room_id(room_id)
                await avatar.connect()
                audio_out = avatar
            else:
                # Cloud avatar: mint credentials and join it to the room,
                # then bridge its audio through a dedicated AvatarAudioOut.
                _api_key = os.getenv("VIDEOSDK_API_KEY")
                _secret_key = os.getenv("VIDEOSDK_SECRET_KEY")
                credentials = generate_avatar_credentials(
                    _api_key, _secret_key, participant_id=avatar.participant_id
                )
                await avatar.connect(room_id, credentials.token)
                audio_out = AvatarAudioOut(credentials=credentials, room_id=room_id)
                await audio_out.connect()  # no-op (no dispatcher_url)
            custom_camera_video_track = getattr(avatar, 'video_track', None)
            custom_microphone_audio_track = getattr(avatar, 'audio_track', None)
            sinks.append(audio_out)
            # Track the cloud avatar separately so shutdown() can aclose it.
            self._cloud_avatar = avatar if not isinstance(avatar, AvatarAudioOut) else None
            self._avatar_audio_out = audio_out
            if self._pipeline:
                self._pipeline.avatar = audio_out
        if self.want_console:
            from .console_mode import setup_console_voice_for_ctx
            if not self._pipeline:
                raise RuntimeError(
                    "Pipeline must be constructed before ctx.connect() in console mode"
                )
            cleanup_callback = await setup_console_voice_for_ctx(self)
            self.add_shutdown_callback(cleanup_callback)
        else:
            self.metrics_collector.transport_mode = self.room_options.transport_mode
            self.metrics_collector.analytics_client.configure(self.room_options.metrics)
            if self.room_options.transport_mode == TransportMode.VIDEOSDK:
                # BUGFIX: these "config will be ignored" warnings previously sat in a
                # duplicate `elif transport_mode == TransportMode.VIDEOSDK` branch at
                # the END of this chain, which was unreachable because this branch
                # matches first. They are now emitted where VIDEOSDK mode is handled.
                if self.room_options.websocket and (self.room_options.websocket.port != 8080 or self.room_options.websocket.path != "/ws"):
                    logger.warning("WebSocket configuration provided but transport mode is VIDEOSDK. WebSocket config will be ignored.")
                if self.room_options.webrtc and (self.room_options.webrtc.signaling_url or self.room_options.webrtc.ice_servers != [{"urls": "stun:stun.l.google.com:19302"}]):
                    logger.warning("WebRTC configuration provided but transport mode is VIDEOSDK. WebRTC config will be ignored.")
                from .room.room import VideoSDKHandler
                if not self.room_options.room_id:
                    self.room_options.room_id = self.get_room_id()
                if self.room_options.send_logs_to_dashboard or (self.room_options.logs and self.room_options.logs.enabled):
                    from .metrics.logger_handler import LogManager, JobLogger
                    self._log_manager = LogManager()
                    self._log_manager.start(auth_token=self.videosdk_auth or "")
                    self._job_logger = JobLogger(
                        queue=self._log_manager.get_queue(),
                        room_id=self.room_options.room_id or "",
                        peer_id=self.room_options.agent_participant_id or "agent",
                        auth_token=self.videosdk_auth or "",
                        # Explicit per-logs level wins over the dashboard default.
                        dashboard_log_level=self.room_options.dashboard_log_level if not (self.room_options.logs and self.room_options.logs.level) else self.room_options.logs.level,
                        send_logs_to_dashboard=True,
                    )
                if self.room_options.join_meeting:
                    validate_room_options_recording(self.room_options)
                    record_audio_resolved, record_screen_share = resolve_video_sdk_recording(
                        self.room_options
                    )
                    agent_id = self._pipeline.agent.id if self._pipeline and hasattr(self._pipeline, 'agent') else None
                    self.room = VideoSDKHandler(
                        meeting_id=self.room_options.room_id,
                        auth_token=self.videosdk_auth,
                        name=self.room_options.name,
                        agent_participant_id=self.room_options.agent_participant_id,
                        agent_id=agent_id,
                        pipeline=self._pipeline,
                        loop=self._loop,
                        vision=self.room_options.vision,
                        recording=self.room_options.recording,
                        record_audio=record_audio_resolved,
                        record_screen_share=record_screen_share,
                        custom_camera_video_track=custom_camera_video_track,
                        custom_microphone_audio_track=custom_microphone_audio_track,
                        audio_sinks=sinks,
                        background_audio=self.room_options.background_audio,
                        on_room_error=self.room_options.on_room_error,
                        auto_end_session=self.room_options.auto_end_session,
                        session_timeout_seconds=self.room_options.session_timeout_seconds,
                        no_participant_timeout_seconds=self.room_options.no_participant_timeout_seconds,
                        signaling_base_url=self.room_options.signaling_base_url,
                        job_logger=self._job_logger,
                        traces_options=self.room_options.traces,
                        metrics_options=self.room_options.metrics,
                        logs_options=self.room_options.logs,
                        avatar_participant_id=avatar.participant_id if avatar and hasattr(avatar, 'participant_id') else None,
                    )
                # ROBUSTNESS: when join_meeting is False no VideoSDKHandler is
                # created, so guard against self.room being unset before
                # dereferencing its audio_track.
                if self.room and self._pipeline and hasattr(
                    self._pipeline, "_set_loop_and_audio_track"
                ):
                    self._pipeline._set_loop_and_audio_track(
                        self._loop, self.room.audio_track
                    )
            elif self.room_options.transport_mode == TransportMode.WEBSOCKET:
                if not self.room_options.websocket:
                    raise ValueError("WebSocket configuration (websocket) is required when mode is WEBSOCKET")
                if self.room_options.webrtc and (self.room_options.webrtc.signaling_url or self.room_options.webrtc.ice_servers != [{"urls": "stun:stun.l.google.com:19302"}]):
                    logger.warning("WebRTC configuration provided but transport mode is set to WEBSOCKET. WebRTC config will be ignored.")
                from .transports.websocket_handler import WebSocketTransportHandler
                self.room = WebSocketTransportHandler(
                    loop=self._loop,
                    pipeline=self._pipeline,
                    port=self.room_options.websocket.port,
                    path=self.room_options.websocket.path
                )
            elif self.room_options.transport_mode == TransportMode.WEBRTC:
                if not self.room_options.webrtc:
                    raise ValueError("WebRTC configuration (webrtc) is required when mode is WEBRTC")
                if not self.room_options.webrtc.signaling_url:
                    raise ValueError("WebRTC signaling_url is required when mode is WEBRTC")
                if self.room_options.websocket and (self.room_options.websocket.port != 8080 or self.room_options.websocket.path != "/ws"):
                    logger.warning("WebSocket configuration provided but connection mode is set to WEBRTC. WebSocket config will be ignored.")
                from .transports.webrtc_handler import WebRTCTransportHandler
                self.room = WebRTCTransportHandler(
                    loop=self._loop,
                    pipeline=self._pipeline,
                    signaling_url=self.room_options.webrtc.signaling_url,
                    ice_servers=self.room_options.webrtc.ice_servers
                )
        if self.room:
            await self.room.connect()
            # For Non-VideoSDK modes, we still need to ensure audio track is linked if not done inside constructor
            if (
                self.room_options.transport_mode != TransportMode.VIDEOSDK
                and self._pipeline
                and hasattr(self._pipeline, "_set_loop_and_audio_track")
            ):
                # BaseTransportHandler subclasses now initialize self.audio_track
                if self.room.audio_track:
                    self._pipeline._set_loop_and_audio_track(self._loop, self.room.audio_track)
        if (
            self.room_options.playground
            and self.room_options.join_meeting
            and not self.want_console
            and self.room_options.transport_mode == TransportMode.VIDEOSDK
        ):
            if self.videosdk_auth:
                playground_url = f"https://playground.videosdk.live?token={self.videosdk_auth}&meetingId={self.room_options.room_id}"
                print(f"\033[1;36m" + "Agent started in playground mode" + "\033[0m")
                print("\033[1;75m" + "Interact with agent here at:" + "\033[0m")
                print("\033[1;4;94m" + playground_url + "\033[0m")
            else:
                raise ValueError("VIDEOSDK_AUTH_TOKEN environment variable not found")
def get_room_id(self) ‑> str-
Expand source code
def get_room_id(self) -> str:
    """
    Creates a new room using the VideoSDK API and returns the room ID.

    Returns:
        str | None: The new room ID, or None in console mode (no real
            room is needed there).

    Raises:
        ValueError: If the VIDEOSDK_AUTH_TOKEN is missing.
        RuntimeError: If the API request fails or the response is invalid.
    """
    if self.want_console:
        # Console mode never joins a real meeting; nothing to create.
        return None
    if not self.videosdk_auth:
        raise ValueError(
            "VIDEOSDK_AUTH_TOKEN not found. "
            "Set it as an environment variable or provide it in room options via auth_token."
        )
    base_url = self.room_options.signaling_base_url
    url = f"https://{base_url}/v2/rooms"
    headers = {"Authorization": self.videosdk_auth}
    try:
        # FIX: bounded timeout — the previous call had none, so a stalled
        # API endpoint would hang the job forever.
        response = requests.post(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Failed to create room: {e}") from e
    data = response.json()
    room_id = data.get("roomId")
    if not room_id:
        raise RuntimeError(f"Unexpected API response, missing roomId: {data}")
    return room_id
Raises
ValueError- If the VIDEOSDK_AUTH_TOKEN is missing.
RuntimeError- If the API request fails or the response is invalid.
def notify_meeting_joined(self) ‑> None-
Expand source code
def notify_meeting_joined(self) -> None:
    """Called when the agent successfully joins the meeting."""
    # Unblock anything awaiting wait_for_meeting_joined().
    self._meeting_joined_event.set()
    sink = getattr(self, '_avatar_audio_out', None)
    if not sink:
        return
    room = self.room
    if room and room.meeting:
        # Late-bind the meeting onto the avatar audio sink.
        sink._set_meeting(room.meeting)
async def run_until_shutdown(self, session: Any = None, wait_for_participant: bool = False) ‑> None-
Expand source code
async def run_until_shutdown(
    self,
    session: Any = None,
    wait_for_participant: bool = False,
) -> None:
    """
    Simplified helper that handles all cleanup boilerplate.

    This method:
    1. Connects to the room
    2. Sets up session end callbacks
    3. Waits for participant (optional)
    4. Starts the session
    5. Waits for shutdown signal
    6. Cleans up gracefully

    Args:
        session: AgentSession to manage (will call session.start() and session.close())
        wait_for_participant: Whether to wait for a participant before starting

    Example:
        ```python
        async def entrypoint(ctx: JobContext):
            session = AgentSession(agent=agent, pipeline=pipeline)
            await ctx.run_until_shutdown(session=session, wait_for_participant=True)
        ```
    """
    # The event that keeps this coroutine alive; set by the shutdown callback.
    shutdown_event = asyncio.Event()
    if session:
        # Registered with the context so shutdown() closes the session and
        # then releases the wait below.
        async def cleanup_session():
            logger.info("Cleaning up session...")
            try:
                await session.close()
            except Exception as e:
                logger.error(f"Error closing session in cleanup: {e}")
            shutdown_event.set()
        self.add_shutdown_callback(cleanup_session)
    else:
        async def cleanup_no_session():
            logger.info("Shutdown called, no session to clean up")
            shutdown_event.set()
        self.add_shutdown_callback(cleanup_no_session)

    def on_session_end(reason: str):
        # Room-driven session end: schedule a full context shutdown, which in
        # turn runs the cleanup callback and sets shutdown_event.
        logger.info(f"Session ended: {reason}")
        asyncio.create_task(self.shutdown())
    try:
        try:
            await self.connect()
        except Exception as e:
            logger.error(f"Error connecting to room: {e}")
            raise
        if self.room:
            try:
                self.room.setup_session_end_callback(on_session_end)
                logger.info("Session end callback configured")
            except Exception as e:
                # Non-fatal: the agent can still run without this hook.
                logger.warning(f"Error setting up session end callback: {e}")
        else:
            logger.warning(
                "Room not available, session end callback not configured"
            )
        if wait_for_participant and self.room:
            try:
                logger.info("Waiting for participant...")
                participant_id = await self.room.wait_for_participant()
                # None signals the session ended before anyone joined; the
                # finally block below still performs the full cleanup.
                if participant_id is None:
                    logger.info("Session ended before any participant joined, shutting down")
                    return
                logger.info("Participant joined")
            except Exception as e:
                logger.error(f"Error waiting for participant: {e}")
                raise
        if session:
            try:
                await session.start()
                logger.info("Agent session started")
            except Exception as e:
                logger.error(f"Error starting session: {e}")
                raise
        logger.info(
            "Agent is running... (will exit when session ends or on interrupt)"
        )
        # Park here until a shutdown callback fires.
        await shutdown_event.wait()
        logger.info("Shutdown event received, exiting gracefully...")
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt received, shutting down...")
    except Exception as e:
        logger.error(f"Unexpected error in run_until_shutdown: {e}")
        raise
    finally:
        # Always close the session and the context, even on error paths;
        # session.close() and shutdown() are safe to call more than once.
        if session:
            try:
                await session.close()
            except Exception as e:
                logger.error(f"Error closing session in finally: {e}")
        try:
            await self.shutdown()
        except Exception as e:
            logger.error(f"Error in ctx.shutdown: {e}")
This method: 1. Connects to the room 2. Sets up session end callbacks 3. Waits for participant (optional) 4. Starts the session 5. Waits for shutdown signal 6. Cleans up gracefully
Args
session- AgentSession to manage (will call session.start() and session.close())
wait_for_participant- Whether to wait for a participant before starting
Example
async def entrypoint(ctx: JobContext): session = AgentSession(agent=agent, pipeline=pipeline) await ctx.run_until_shutdown(session=session, wait_for_participant=True) async def shutdown(self) ‑> None-
Expand source code
async def shutdown(self) -> None:
    """Called by Worker during graceful shutdown.

    Tears down in a fixed order: user callbacks, pipeline, avatar resources,
    loggers, then the room; every step is individually guarded so one failure
    cannot stop the rest of the cleanup. Idempotent via _is_shutting_down.
    """
    # Re-entrancy guard: shutdown() may be triggered from several places
    # (worker, session-end callback, run_until_shutdown's finally block).
    if self._is_shutting_down:
        logger.info("JobContext already shutting down")
        return
    self._is_shutting_down = True
    logger.info("JobContext shutting down")
    # User-registered callbacks run first, while the pipeline/room still exist.
    for callback in self._shutdown_callbacks:
        try:
            await callback()
        except Exception as e:
            logger.error(f"Error in shutdown callback: {e}")
    if self._pipeline:
        try:
            await self._pipeline.cleanup()
        except Exception as e:
            logger.error(f"Error during pipeline cleanup: {e}")
        self._pipeline = None
    # _cloud_avatar is only set when a cloud avatar was attached in connect().
    cloud_avatar = getattr(self, '_cloud_avatar', None)
    if cloud_avatar and hasattr(cloud_avatar, 'aclose'):
        try:
            await cloud_avatar.aclose()
        except Exception as e:
            logger.error(f"Error during cloud avatar aclose: {e}")
    audio_out = getattr(self, '_avatar_audio_out', None)
    if audio_out:
        try:
            await audio_out.aclose()
        except Exception as e:
            logger.error(f"Error during avatar audio_out aclose: {e}")
    if self._job_logger:
        try:
            self._job_logger.cleanup()
        except Exception as e:
            logger.error(f"Error during job logger cleanup: {e}")
        self._job_logger = None
    if self._log_manager:
        try:
            self._log_manager.stop()
        except Exception as e:
            logger.error(f"Error during log manager stop: {e}")
        self._log_manager = None
    if self.room:
        try:
            # Leave only if the room hasn't already left on its own.
            if not getattr(self.room, "_left", False):
                await self.room.leave()
            else:
                logger.info("Room already left, skipping room.leave()")
        except Exception as e:
            logger.error(f"Error during room leave: {e}")
        try:
            # cleanup() is optional on transport handlers.
            if hasattr(self.room, "cleanup"):
                await self.room.cleanup()
        except Exception as e:
            logger.error(f"Error during room cleanup: {e}")
    # Drop remaining references so the job can be garbage-collected.
    self.room = None
    self.room_options = None
    self._loop = None
    self.videosdk_auth = None
    self._shutdown_callbacks.clear()
    logger.info("JobContext cleaned up")
async def wait_for_meeting_joined(self, timeout: float = 30.0) ‑> bool-
Expand source code
async def wait_for_meeting_joined(self, timeout: float = 30.0) -> bool:
    """Block until the agent has joined the meeting, or time out.

    Args:
        timeout: Maximum seconds to wait.

    Returns:
        bool: True when joined, False when the timeout elapsed.
    """
    waiter = self._meeting_joined_event.wait()
    try:
        await asyncio.wait_for(waiter, timeout=timeout)
    except asyncio.TimeoutError:
        logger.warning(f"Timeout waiting for meeting join after {timeout}s")
        return False
    return True
async def wait_for_participant(self, participant_id: str | None = None) ‑> str-
Expand source code
async def wait_for_participant(self, participant_id: str | None = None) -> str: if self.room: return await self.room.wait_for_participant(participant_id) else: raise ValueError("Room not initialized")
class KnowledgeBase (config: KnowledgeBaseConfig)-
Expand source code
class KnowledgeBase(ABC):
    """
    Base class for handling knowledge-base retrieval operations.

    Provides hooks developers can override:
    - allow_retrieval: Decide if the knowledge base should be used.
    - pre_process_query: Preprocess the query before searching.
    - format_context: Format retrieved documents for the final prompt.
    """

    def __init__(self, config: KnowledgeBaseConfig):
        """
        Initialize the knowledge base handler.

        Args:
            config (KnowledgeBaseConfig): Configuration for retrieval settings.
        """
        self.config = config

    def allow_retrieval(self, transcript: str) -> bool:
        """
        Decide whether the knowledge base should be used for this message.

        Args:
            transcript (str): User message.

        Returns:
            bool: True to perform retrieval, False otherwise.
        """
        return True

    def pre_process_query(self, transcript: str) -> str:
        """
        Preprocess the user message before searching the knowledge base.

        Args:
            transcript (str): Original user message.

        Returns:
            str: Processed query string.
        """
        return transcript

    def format_context(self, documents: List[str]) -> str:
        """
        Format retrieved documents into a context string.

        Args:
            documents (List[str]): Retrieved document texts.

        Returns:
            str: Formatted context for the model.
        """
        if not documents:
            return ""
        doc_str = "\n".join([f"- {doc}" for doc in documents])
        return f"Use the following context to answer the user:\n{doc_str}\n"

    async def retrieve_documents(self, query: str) -> List[str]:
        """
        Fetch documents from the configured knowledge base.

        Args:
            query (str): Search query.

        Returns:
            List[str]: Retrieved document texts; empty on any error or when
                no auth token is configured.
        """
        api_base_url = "https://api.videosdk.live/ai/v1"
        auth_token = os.getenv("VIDEOSDK_AUTH_TOKEN")
        if not auth_token:
            logger.warning("VIDEOSDK_AUTH_TOKEN not set, skipping KB retrieval")
            return []
        try:
            url = f"{api_base_url}/knowledge-bases/{self.config.id}/search"
            headers = {
                "Authorization": auth_token,
                "Content-Type": "application/json"
            }
            # FIX: renamed from `payload` — the old name was shadowed by the
            # per-result `payload` variable in the loop below.
            request_payload = {
                "queryText": query,
                "topK": self.config.top_k
            }
            metrics_collector.on_knowledge_base_start(kb_id=self.config.id)
            async with aiohttp.ClientSession() as session:
                async with session.post(url, json=request_payload, headers=headers) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        logger.error(
                            f"KB API error {response.status}: {error_text}"
                        )
                        return []
                    data = await response.json()
                    results = data.get("results", [])
                    # Extract text from each result's payload
                    documents = []
                    scores = []
                    for result in results:
                        if not isinstance(result, dict):
                            continue
                        result_payload = result.get("payload", {})
                        if not isinstance(result_payload, dict):
                            continue
                        text = result_payload.get("text", "")
                        if text and text.strip():  # Only add non-empty text
                            documents.append(text.strip())
                            scores.append(result.get("score", 0))
                    logger.debug(f"Retrieved {len(documents)} documents from knowledge base")
                    metrics_collector.on_knowledge_base_complete(documents=documents, scores=scores)
                    return documents
        except Exception as e:
            # Best-effort: retrieval failure degrades to "no context".
            logger.error(f"Error retrieving KB documents: {e}")
            return []

    async def process_query(self, transcript: str) -> Optional[str]:
        """
        Run the full knowledge-base retrieval flow for a user message.

        Args:
            transcript (str): User message.

        Returns:
            Optional[str]: Formatted context or None if retrieval is skipped.
        """
        # Check if KB should be triggered
        if not self.allow_retrieval(transcript):
            return None
        # Transform the query
        query = self.pre_process_query(transcript)
        # Retrieve documents
        documents = await self.retrieve_documents(query)
        if not documents:
            return None
        # Format the prompt
        return self.format_context(documents)
Provides hooks developers can override: - allow_retrieval: Decide if the knowledge base should be used. - pre_process_query: Preprocess the query before searching. - format_context: Format retrieved documents for the final prompt.
Initialize the knowledge base handler.
Args
config:KnowledgeBaseConfig- Configuration for retrieval settings.
Ancestors
- abc.ABC
Methods
def allow_retrieval(self, transcript: str) ‑> bool-
Expand source code
def allow_retrieval(self, transcript: str) -> bool: """ Decide whether the knowledge base should be used for this message. Args: transcript (str): User message. Returns: bool: True to perform retrieval, False otherwise. """ return TrueDecide whether the knowledge base should be used for this message.
Args
transcript:str- User message.
Returns
bool- True to perform retrieval, False otherwise.
def format_context(self, documents: List[str]) ‑> str-
Expand source code
def format_context(self, documents: List[str]) -> str: """ Format retrieved documents into a context string. Args: documents (List[str]): Retrieved document texts. Returns: str: Formatted context for the model. """ if not documents: return "" doc_str = "\n".join([f"- {doc}" for doc in documents]) return f"Use the following context to answer the user:\n{doc_str}\n"Format retrieved documents into a context string.
Args
documents:List[str]- Retrieved document texts.
Returns
str- Formatted context for the model.
def pre_process_query(self, transcript: str) ‑> str-
Expand source code
def pre_process_query(self, transcript: str) -> str: """ Preprocess the user message before searching the knowledge base. Args: transcript (str): Original user message. Returns: str: Processed query string. """ return transcriptPreprocess the user message before searching the knowledge base.
Args
transcript:str- Original user message.
Returns
str- Processed query string.
async def process_query(self, transcript: str) ‑> str | None-
Expand source code
async def process_query(self, transcript: str) -> Optional[str]: """ Run the full knowledge-base retrieval flow for a user message. Args: transcript (str): User message. Returns: Optional[str]: Formatted context or None if retrieval is skipped. """ # Check if KB should be triggered if not self.allow_retrieval(transcript): return None # Transform the query query = self.pre_process_query(transcript) # Retrieve documents documents = await self.retrieve_documents(query) if not documents: return None # Format the prompt formatted_context = self.format_context(documents) return formatted_contextRun the full knowledge-base retrieval flow for a user message.
Args
transcript:str- User message.
Returns
Optional[str]- Formatted context or None if retrieval is skipped.
async def retrieve_documents(self, query: str) ‑> List[str]-
Expand source code
async def retrieve_documents(self, query: str) -> List[str]: """ Fetch documents from the configured knowledge base. Args: query (str): Search query. Returns: List[str]: Retrieved document texts. """ api_base_url = "https://api.videosdk.live/ai/v1" auth_token = os.getenv("VIDEOSDK_AUTH_TOKEN") if not auth_token: logger.warning("VIDEOSDK_AUTH_TOKEN not set, skipping KB retrieval") return [] try: url = f"{api_base_url}/knowledge-bases/{self.config.id}/search" headers = { "Authorization": auth_token, "Content-Type": "application/json" } payload = { "queryText": query, "topK": self.config.top_k } metrics_collector.on_knowledge_base_start(kb_id=self.config.id) async with aiohttp.ClientSession() as session: async with session.post(url, json=payload, headers=headers) as response: if response.status == 200: data = await response.json() results = data.get("results", []) # Extract text from each result's payload documents = [] scores = [] for result in results: if isinstance(result, dict): payload = result.get("payload", {}) if isinstance(payload, dict): text = payload.get("text", "") if text and text.strip(): # Only add non-empty text documents.append(text.strip()) scores.append(result.get("score", 0)) logger.debug(f"Retrieved {len(documents)} documents from knowledge base") metrics_collector.on_knowledge_base_complete(documents=documents, scores=scores) return documents else: error_text = await response.text() logger.error( f"KB API error {response.status}: {error_text}" ) return [] except Exception as e: logger.error(f"Error retrieving KB documents: {e}") return []Fetch documents from the configured knowledge base.
Args
query:str- Search query.
Returns
List[str]- Retrieved document texts.
class KnowledgeBaseConfig (id: str, top_k: int = 3)-
Expand source code
@dataclass
class KnowledgeBaseConfig:
    """
    Configuration for managed RAG (Retrieval-Augmented Generation).

    Attributes:
        id: Knowledge-base ID provided by your app dashboard.
        top_k: Number of documents to retrieve per query (default: 3).
    """

    id: str
    top_k: int = 3

    def __post_init__(self):
        # Fail fast at construction time rather than on first retrieval.
        if not self.id:
            raise ValueError("id cannot be empty")
        if self.top_k < 1:
            raise ValueError("top_k must be at least 1")
Attributes
id- The ID of the knowledge base provided by your app dashboard
top_k- Optional number of documents to retrieve (default: 3)
Instance variables
var id : str
var top_k : int
class LLM-
Expand source code
class LLM(EventEmitter[Literal["error"]]):
    """
    Base class for LLM implementations.

    Subclasses implement ``chat`` and ``cancel_current_generation``; the base
    provides labeling, cleanup, and async context-manager support.
    """

    def __init__(self) -> None:
        """Initialize the LLM base class."""
        super().__init__()
        cls = type(self)
        self._label = f"{cls.__module__}.{cls.__name__}"

    @property
    def label(self) -> str:
        """
        Get the LLM provider label.

        Returns:
            str: A string identifier for the LLM provider
                (e.g., "videosdk.plugins.openai.llm.OpenAILLM").
        """
        return self._label

    @abstractmethod
    async def chat(
        self,
        messages: ChatContext,
        tools: list[FunctionTool] | None = None,
        **kwargs: Any
    ) -> AsyncIterator[LLMResponse]:
        """
        Main method to interact with the LLM.

        Args:
            messages (ChatContext): The conversation context containing message history.
            tools (list[FunctionTool] | None, optional): Function tools the LLM may call.
            **kwargs (Any): Provider-specific options.

        Returns:
            AsyncIterator[LLMResponse]: Yields LLMResponse objects as they're generated.

        Raises:
            NotImplementedError: Must be implemented by subclasses.
        """
        raise NotImplementedError

    @abstractmethod
    async def cancel_current_generation(self) -> None:
        """
        Cancel the current LLM generation if active.

        Raises:
            NotImplementedError: Must be implemented by subclasses.
        """
        # override in subclasses
        pass

    async def aclose(self) -> None:
        """Cleanup resources."""
        name = self.label
        logger.info(f"Cleaning up LLM: {name}")
        await self.cancel_current_generation()
        try:
            import gc
            gc.collect()
            logger.info(f"LLM garbage collection completed: {name}")
        except Exception as e:
            logger.error(f"Error during LLM garbage collection: {e}")
        logger.info(f"LLM cleanup completed: {name}")

    async def __aenter__(self) -> LLM:
        """Async context manager entry point."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit point."""
        await self.aclose()
Initialize the LLM base class.
Ancestors
- EventEmitter
- typing.Generic
Subclasses
Instance variables
prop label : str-
Expand source code
@property def label(self) -> str: """ Get the LLM provider label. Returns: str: A string identifier for the LLM provider (e.g., "videosdk.plugins.openai.llm.OpenAILLM"). """ return self._labelGet the LLM provider label.
Returns
str- A string identifier for the LLM provider (e.g., "videosdk.plugins.openai.llm.OpenAILLM").
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None: """ Cleanup resources. """ logger.info(f"Cleaning up LLM: {self.label}") await self.cancel_current_generation() try: import gc gc.collect() logger.info(f"LLM garbage collection completed: {self.label}") except Exception as e: logger.error(f"Error during LLM garbage collection: {e}") logger.info(f"LLM cleanup completed: {self.label}")Cleanup resources.
async def cancel_current_generation(self) ‑> None-
Expand source code
@abstractmethod async def cancel_current_generation(self) -> None: """ Cancel the current LLM generation if active. Raises: NotImplementedError: This method must be implemented by subclasses. """ # override in subclasses passCancel the current LLM generation if active.
Raises
NotImplementedError- This method must be implemented by subclasses.
async def chat(self,
messages: ChatContext,
tools: list[FunctionTool] | None = None,
**kwargs: Any) ‑> AsyncIterator[LLMResponse]-
Expand source code
@abstractmethod async def chat( self, messages: ChatContext, tools: list[FunctionTool] | None = None, **kwargs: Any ) -> AsyncIterator[LLMResponse]: """ Main method to interact with the LLM. Args: messages (ChatContext): The conversation context containing message history. tools (list[FunctionTool] | None, optional): List of available function tools for the LLM to use. **kwargs (Any): Additional arguments specific to the LLM provider implementation. Returns: AsyncIterator[LLMResponse]: An async iterator yielding LLMResponse objects as they're generated. Raises: NotImplementedError: This method must be implemented by subclasses. """ raise NotImplementedErrorMain method to interact with the LLM.
Args
messages:ChatContext- The conversation context containing message history.
tools:list[FunctionTool] | None, optional- List of available function tools for the LLM to use.
**kwargs:Any- Additional arguments specific to the LLM provider implementation.
Returns
AsyncIterator[LLMResponse]- An async iterator yielding LLMResponse objects as they're generated.
Raises
NotImplementedError- This method must be implemented by subclasses.
Inherited members
class LLMResponse (**data: Any)-
Expand source code
class LLMResponse(BaseModel):
    """Container for a single chunk of LLM output.

    Attributes:
        content (str): The text content generated by the LLM.
        role (ChatRole): The role of the response (typically ASSISTANT).
        metadata (Optional[dict[str, Any]]): Additional response metadata
            from the LLM provider.
    """

    content: str
    role: ChatRole
    metadata: Optional[dict[str, Any]] = None
Attributes
content:str- The text content generated by the LLM.
role:ChatRole- The role of the response (typically ASSISTANT).
metadata:Optional[dict[str, Any]]- Additional response metadata from the LLM provider.
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError (pydantic_core.ValidationError) if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.Ancestors
- pydantic.main.BaseModel
Class variables
var content : strvar metadata : dict[str, typing.Any] | Nonevar model_configvar role : ChatRole
class LoggingOptions (enabled: bool = False,
level: str = 'INFO',
export_url: str | None = None,
export_headers: Dict[str, str] | None = None)-
Expand source code
@dataclass
class LoggingOptions:
    """Settings controlling log capture, level filtering, and export."""

    # Whether log collection is active at all.
    enabled: bool = False
    # Minimum severity to record (e.g. "DEBUG", "INFO").
    level: str = "INFO"
    # Optional HTTP endpoint logs are shipped to.
    export_url: Optional[str] = None
    # Optional headers attached to export requests.
    export_headers: Optional[Dict[str, str]] = None
Instance variables
var enabled : boolvar export_headers : Dict[str, str] | Nonevar export_url : str | Nonevar level : str
class MCPServerHTTP (endpoint_url: str,
request_headers: Dict[str, Any] | None = None,
connection_timeout: float = 10.0,
stream_read_timeout: float = 300.0,
session_timeout: float = 5.0)-
Expand source code
class MCPServerHTTP(MCPServiceProvider):
    """
    HTTP/Web-based MCP service provider with automatic transport detection.
    """

    def __init__(
        self,
        endpoint_url: str,
        request_headers: Optional[Dict[str, Any]] = None,
        connection_timeout: float = 10.0,
        stream_read_timeout: float = 300.0,
        session_timeout: float = 5.0,
    ):
        """
        Initialize the HTTP MCP server provider.

        Args:
            endpoint_url (str): The HTTP endpoint URL for the MCP server.
            request_headers (Optional[Dict[str, Any]], optional): Optional HTTP request headers.
            connection_timeout (float, optional): Connection timeout in seconds. Defaults to 10.0.
            stream_read_timeout (float, optional): Stream read timeout in seconds. Defaults to 300.0.
            session_timeout (float, optional): Session timeout in seconds. Defaults to 5.0.
        """
        super().__init__(session_timeout)
        self.endpoint_url = endpoint_url
        self.request_headers = request_headers or {}
        self.connection_timeout = connection_timeout
        self.stream_read_timeout = stream_read_timeout
        self.transport_mode = HTTPTransportDetector.detect_transport_mode(
            endpoint_url)

    def get_stream_provider(self):
        """
        Get appropriate stream provider based on detected transport.

        Fix: ``stream_read_timeout`` was previously stored but never
        forwarded, so both transports silently fell back to the MCP client
        defaults for their read timeout. It is now passed through as
        ``sse_read_timeout`` on both transports.
        """
        timeout_delta = timedelta(seconds=self.connection_timeout)
        if self.transport_mode == 'streamable_http':
            return streamable_http_client(
                url=self.endpoint_url,
                headers=self.request_headers,
                timeout=timeout_delta,
                # Streamable HTTP client expects a timedelta here.
                sse_read_timeout=timedelta(seconds=self.stream_read_timeout),
            )
        else:
            return sse_client(
                url=self.endpoint_url,
                headers=self.request_headers,
                timeout=self.connection_timeout,
                # SSE client takes the read timeout in plain seconds.
                sse_read_timeout=self.stream_read_timeout,
            )

    def __repr__(self) -> str:
        """
        String representation of the HTTP MCP server provider.
        """
        return f"MCPServerHTTP(url={self.endpoint_url}, transport={self.transport_mode})"
Initialize the HTTP MCP server provider.
Args
endpoint_url:str- The HTTP endpoint URL for the MCP server.
request_headers:Optional[Dict[str, Any]], optional- Optional HTTP request headers.
connection_timeout:float, optional- Connection timeout in seconds. Defaults to 10.0.
stream_read_timeout:float, optional- Stream read timeout in seconds. Defaults to 300.0.
session_timeout:float, optional- Session timeout in seconds. Defaults to 5.0.
Ancestors
- MCPServiceProvider
- abc.ABC
Methods
def get_stream_provider(self)-
Expand source code
def get_stream_provider(self): """ Get appropriate stream provider based on detected transport. """ timeout_delta = timedelta(seconds=self.connection_timeout) if self.transport_mode == 'streamable_http': return streamable_http_client( url=self.endpoint_url, headers=self.request_headers, timeout=timeout_delta, ) else: return sse_client( url=self.endpoint_url, headers=self.request_headers, timeout=self.connection_timeout, )Get appropriate stream provider based on detected transport.
Inherited members
class MCPServerStdio (executable_path: str,
process_arguments: List[str],
environment_vars: Dict[str, str] | None = None,
working_directory: str | pathlib.Path | None = None,
session_timeout: float = 5.0)-
Expand source code
class MCPServerStdio(MCPServiceProvider):
    """Process-based MCP service provider for local applications."""

    def __init__(
        self,
        executable_path: str,
        process_arguments: List[str],
        environment_vars: Optional[Dict[str, str]] = None,
        working_directory: Optional[str | Path] = None,
        session_timeout: float = 5.0,
    ):
        """Initialize the stdio MCP server provider.

        Args:
            executable_path: Path to the executable MCP server.
            process_arguments: Command line arguments for the executable.
            environment_vars: Optional environment variables.
            working_directory: Working directory for the process.
            session_timeout: Session timeout in seconds. Defaults to 5.0.
        """
        super().__init__(session_timeout)
        self.executable_path = executable_path
        self.process_arguments = process_arguments
        self.environment_vars = environment_vars
        # Normalize a truthy string path to Path; leave None/Path untouched.
        if working_directory and not isinstance(working_directory, Path):
            self.working_directory = Path(working_directory)
        else:
            self.working_directory = working_directory

    def get_stream_provider(self):
        """Get stdio stream provider for process communication."""
        params = StdioServerParameters(
            command=self.executable_path,
            args=self.process_arguments,
            env=self.environment_vars,
            cwd=self.working_directory,
        )
        return stdio_client(params)

    def __repr__(self) -> str:
        """String representation of the stdio MCP server provider."""
        return (f"MCPServerStdio(executable={self.executable_path}, "
                f"args={self.process_arguments}, cwd={self.working_directory})")
Initialize the stdio MCP server provider.
Args
executable_path:str- Path to the executable MCP server.
process_arguments:List[str]- Command line arguments to pass to the executable.
environment_vars:Optional[Dict[str, str]], optional- Optional environment variables.
working_directory:Optional[str | Path], optional- Working directory for the process.
session_timeout:float, optional- Session timeout in seconds. Defaults to 5.0.
Ancestors
- MCPServiceProvider
- abc.ABC
Methods
def get_stream_provider(self)-
Expand source code
def get_stream_provider(self): """ Get stdio stream provider for process communication. """ server_params = StdioServerParameters( command=self.executable_path, args=self.process_arguments, env=self.environment_vars, cwd=self.working_directory ) return stdio_client(server_params)Get stdio stream provider for process communication.
Inherited members
class MetricsOptions (enabled: bool = True,
export_url: str | None = None,
export_headers: Dict[str, str] | None = None)-
Expand source code
@dataclass
class MetricsOptions:
    """Settings for metrics collection and export."""

    # Whether metrics are gathered at all.
    enabled: bool = True
    # Optional HTTP endpoint metrics are exported to.
    export_url: Optional[str] = None
    # Optional headers sent with each export request.
    export_headers: Optional[Dict[str, str]] = None
Instance variables
var enabled : boolvar export_headers : Dict[str, str] | Nonevar export_url : str | None
class Options (executor_type: Any = None,
num_idle_processes: int = 1,
initialize_timeout: float = 10.0,
close_timeout: float = 60.0,
memory_warn_mb: float = 500.0,
memory_limit_mb: float = 0.0,
ping_interval: float = 30.0,
max_processes: int = 1,
agent_id: str = 'VideoSDKAgent',
auth_token: str | None = None,
permissions: Any = None,
max_retry: int = 16,
load_threshold: float = 0.75,
register: bool = False,
signaling_base_url: str = 'api.videosdk.live',
host: str = '0.0.0.0',
port: int = 8081,
log_level: str = 'INFO')-
Expand source code
@dataclass
class Options:
    """Configuration options for WorkerJob execution."""

    executor_type: Any = None  # Resolved in __post_init__
    """Executor used to run jobs; auto-selected per platform when None."""

    num_idle_processes: int = 1
    """How many idle processes/threads to keep warm."""

    initialize_timeout: float = 10.0
    """Max seconds to wait for a process/thread to initialize/prewarm."""

    close_timeout: float = 60.0
    """Max seconds to wait for a job to shut down gracefully."""

    memory_warn_mb: float = 500.0
    """Memory warning threshold in MB."""

    memory_limit_mb: float = 0.0
    """Max memory usage for a job in MB; 0 disables the limit."""

    ping_interval: float = 30.0
    """Seconds between health check pings."""

    max_processes: int = 1
    """Upper bound on concurrent processes/threads."""

    agent_id: str = "VideoSDKAgent"
    """Identifier of the agent."""

    auth_token: Optional[str] = None
    """VideoSDK auth token; falls back to the VIDEOSDK_AUTH_TOKEN env var."""

    permissions: Any = None  # Resolved in __post_init__
    """Permissions granted to the agent participant."""

    max_retry: int = 16
    """Max attempts to retry connecting to VideoSDK."""

    load_threshold: float = 0.75
    """Load above which the worker is marked unavailable."""

    register: bool = False
    """Whether to register with the backend; False for local development."""

    signaling_base_url: str = "api.videosdk.live"
    """Signaling base URL for VideoSDK services."""

    host: str = "0.0.0.0"
    """Host for the debug HTTP server."""

    port: int = 8081
    """Port for the debug HTTP server."""

    log_level: str = "INFO"
    """SDK log level: DEBUG, INFO, WARNING, or ERROR. Defaults to INFO."""

    def __post_init__(self):
        """Fill in platform-dependent defaults after dataclass init."""
        # Imported lazily to avoid a circular dependency with .worker.
        from .worker import ExecutorType, WorkerPermissions, _default_executor_type

        if self.executor_type is None:
            self.executor_type = _default_executor_type
        if self.permissions is None:
            self.permissions = WorkerPermissions()
        if not self.auth_token:
            self.auth_token = os.getenv("VIDEOSDK_AUTH_TOKEN")
Instance variables
var agent_id : str-
ID of the agent.
var auth_token : str | None-
VideoSDK authentication token. Uses VIDEOSDK_AUTH_TOKEN env var if not provided.
var close_timeout : float-
Maximum amount of time to wait for a job to shut down gracefully
var executor_type : Any-
Which executor to use to run jobs. Automatically selected based on platform.
var host : str-
Host for the debug HTTP server.
var initialize_timeout : float-
Maximum amount of time to wait for a process/thread to initialize/prewarm
var load_threshold : float-
Load threshold above which worker is marked as unavailable.
var log_level : str-
Log level for SDK logging. Options: DEBUG, INFO, WARNING, ERROR. Defaults to INFO.
var max_processes : int-
Maximum number of processes/threads.
var max_retry : int-
Maximum number of times to retry connecting to VideoSDK.
var memory_limit_mb : float-
Maximum memory usage for a job in MB. Defaults to 0 (disabled).
var memory_warn_mb : float-
Memory warning threshold in MB.
var num_idle_processes : int-
Number of idle processes/threads to keep warm.
var permissions : Any-
Permissions for the agent participant.
var ping_interval : float-
Interval between health check pings.
var port : int-
Port for the debug HTTP server.
var register : bool-
Whether to register with the backend. Defaults to False for local development.
var signaling_base_url : str-
Signaling base URL for VideoSDK services. Defaults to api.videosdk.live.
class Pipeline (stt: STT | None = None,
llm: LLM | RealtimeBaseModel | None = None,
tts: TTS | None = None,
vad: VAD | None = None,
turn_detector: EOU | None = None,
avatar: typing.Any | None = None,
denoise: Denoise | None = None,
eou_config: EOUConfig | None = None,
interrupt_config: InterruptConfig | None = None,
conversational_graph: typing.Any | None = None,
max_context_items: int | None = None,
voice_mail_detector: VoiceMailDetector | None = None,
realtime_config: RealtimeConfig | None = None)-
Expand source code
class Pipeline(EventEmitter[Literal["start", "error", "transcript_ready", "content_generated", "synthesis_complete"]]): """ Unified Pipeline class supporting multiple component configurations. Supports: - Full cascading: VAD → STT → TurnD → LLM → TTS - Partial cascading: Any subset of components - Realtime: Speech-to-speech models (OpenAI Realtime, Gemini Live) - Hybrid: Components + user event callbacks Args: stt: Speech-to-Text processor (optional) llm: Language Model or RealtimeBaseModel (optional) tts: Text-to-Speech processor (optional) vad: Voice Activity Detector (optional) turn_detector: End-of-Utterance detector (optional) avatar: Avatar for visual output (optional) denoise: Audio denoiser (optional) eou_config: End of utterance configuration interrupt_config: Interruption configuration conversational_graph: Conversational graph for structured dialogs (optional) max_context_items: Maximum chat context items (auto-truncates when exceeded) voice_mail_detector: Voicemail detection (optional) """ def __init__( self, stt: STT | None = None, llm: LLM | RealtimeBaseModel | None = None, tts: TTS | None = None, vad: VAD | None = None, turn_detector: EOU | None = None, avatar: Any | None = None, denoise: Denoise | None = None, eou_config: EOUConfig | None = None, interrupt_config: InterruptConfig | None = None, conversational_graph: Any | None = None, max_context_items: int | None = None, voice_mail_detector: VoiceMailDetector | None = None, realtime_config: RealtimeConfig | None = None, ) -> None: super().__init__() # Store raw components self.stt = stt self.tts = tts self.vad = vad self.turn_detector = turn_detector self.avatar = avatar self.denoise = denoise self.conversational_graph = conversational_graph self.max_context_items = max_context_items self.voice_mail_detector = voice_mail_detector # Pipeline hooks for middleware/interception self.hooks = PipelineHooks() # Realtime configuration self.agent : Agent | None = None self.realtime_config = realtime_config 
# Detect and handle realtime models self.llm: LLM | RealtimeLLMAdapter | None = None self._realtime_model: RealtimeBaseModel | None = None if isinstance(llm, RealtimeBaseModel): self._realtime_model = llm self.llm = RealtimeLLMAdapter(llm) if self.agent: self.llm.set_agent(self.agent) else: self.llm = llm self.config: PipelineConfig = build_pipeline_config( stt=self.stt, llm=self.llm, tts=self.tts, vad=self.vad, turn_detector=self.turn_detector, avatar=self.avatar, denoise=self.denoise, realtime_model=self._realtime_model, realtime_config_mode=( self.realtime_config.mode if self.realtime_config and self.realtime_config.mode else None ), ) # Configuration self.eou_config = eou_config or EOUConfig() self.interrupt_config = interrupt_config or InterruptConfig() # Pipeline state self.orchestrator: PipelineOrchestrator | None = None self.speech_generation: SpeechGeneration | None = None self.vision = False self.loop: asyncio.AbstractEventLoop | None = None self.audio_track: CustomAudioStreamTrack | None = None self._wake_up_callback: Optional[Callable[[], None]] = None self._recent_frames: list[av.VideoFrame] = [] self._max_frames_buffer = 5 self._vision_lock = asyncio.Lock() self._current_utterance_handle: UtteranceHandle | None = None self._setup_error_handlers() self._auto_register() def _auto_register(self) -> None: """Automatically register this pipeline with the current job context""" try: job_context = get_current_job_context() if job_context: job_context._set_pipeline_internal(self) except Exception: pass @property def realtime_mode(self) -> str | None: """Backwards-compatible alias. 
Returns the string value or None.""" return self.config.realtime_mode.value if self.config.realtime_mode else None @property def _is_realtime_mode(self) -> bool: """Backwards-compatible alias.""" return self.config.is_realtime def _configure_text_only_mode(self) -> None: """Configure realtime model for text-only output (provider-specific)""" if not self._realtime_model or not hasattr(self._realtime_model, 'config'): return config = self._realtime_model.config if hasattr(config, 'response_modalities'): config.response_modalities = ["TEXT"] logger.info("Configured Gemini for TEXT-only mode") elif hasattr(config, 'modalities'): config.modalities = ["text"] logger.info("Configured OpenAI for text-only mode") else: logger.warning(f"Unknown realtime provider config, could not set text-only mode") def _wrap_async(self, async_func): """Wrap an async function to be compatible with EventEmitter's sync-only handlers""" def sync_wrapper(*args, **kwargs): asyncio.create_task(async_func(*args, **kwargs)) return sync_wrapper async def _on_transcript_ready_hybrid_stt(self, data: dict) -> None: """Handle transcript in hybrid STT mode (external STT + KB + realtime LLM+TTS)""" transcript = data["text"] if not self.agent: logger.warning("No agent available for transcript processing") return logger.info(f"Processing transcript in hybrid_stt mode: {transcript}") enriched_text = transcript if self.agent.knowledge_base: try: logger.info(f"Querying knowledge base for: {transcript[:100]}...") kb_context = await self.agent.knowledge_base.process_query(transcript) if kb_context: enriched_text = f"{kb_context}\n\nUser: {transcript}" logger.info(f"Enriched transcript with KB context: {kb_context[:100]}...") else: logger.info("No KB context returned") except Exception as e: logger.error(f"Error processing KB query: {e}", exc_info=True) if isinstance(self.llm, RealtimeLLMAdapter): try: await self.llm.send_text_message(enriched_text) logger.info("Sent enriched text to realtime model") except 
Exception as e: logger.error(f"Error sending text to realtime model: {e}") async def _on_realtime_transcription_hybrid_tts(self, data: dict) -> None: """Handle transcription from realtime model in hybrid TTS mode""" role = data.get("role") text = data.get("text") is_final = data.get("is_final", False) if role not in ["agent", "assistant", "model"] or not is_final or not text: return logger.info(f"Intercepted final text from realtime model (hybrid_tts): {text[:100]}...") if self.speech_generation: try: await self.speech_generation.synthesize(text) logger.info("Sent transcribed text to external TTS") except Exception as e: logger.error(f"Error synthesizing with external TTS: {e}") def on( self, event: Literal["speech_in", "speech_out", "stt", "llm", "tts", "vision_frame", "user_turn_start", "user_turn_end", "agent_turn_start", "agent_turn_end"] | str, callback: Callable | None = None ) -> Callable: """ Register a listener for pipeline events or a hook for processing stages. Can be used as a decorator or with a callback. Supported hooks (decorator only): - stt: STT processing (async iterator: audio -> events) - tts: TTS processing (async iterator: text -> audio) - llm: Called when LLM generates content. Return/yield str to modify, return None to observe. 
- vision_frame: Process video frames when vision is enabled (async iterator) - user_turn_start: Called when user turn starts - user_turn_end: Called when user turn ends - agent_turn_start: Called when agent processing starts - agent_turn_end: Called when agent finishes speaking Supported events (listener): - transcript_ready - synthesis_complete - error Examples: @pipeline.on("llm") async def on_llm(data): print(f"LLM generated: {data['text']}") @pipeline.on("llm") async def modify_response(data): text = data.get("text", "") yield text.replace("SSN", "[REDACTED]") """ if event in ["stt", "tts", "llm", "vision_frame", "user_turn_start", "user_turn_end", "agent_turn_start", "agent_turn_end"]: return self.hooks.on(event)(callback) if callback else self.hooks.on(event) return super().on(event, callback) def _setup_error_handlers(self) -> None: """Setup error handlers for all components""" if self.stt: self.stt.on("error", lambda *args: self.on_component_error("STT", args[0] if args else "Unknown error")) if self.llm and not self.config.is_realtime: self.llm.on("error", lambda *args: self.on_component_error("LLM", args[0] if args else "Unknown error")) if self.tts: self.tts.on("error", lambda *args: self.on_component_error("TTS", args[0] if args else "Unknown error")) if self.vad: self.vad.on("error", lambda *args: self.on_component_error("VAD", args[0] if args else "Unknown error")) if self.turn_detector: self.turn_detector.on("error", lambda *args: self.on_component_error("TURN-D", args[0] if args else "Unknown error")) def on_component_error(self, source: str, error_data: Any) -> None: """Handle error events from components""" logger.error(f"[{source}] Component error: {error_data}") metrics_collector.add_error(source, error_data) self.emit("error", {"source": source, "error": str(error_data)}) def get_session_metrics_snapshot(self) -> dict: """Return dict suitable for populating SessionMetrics fields.""" return { "pipeline_type": self.config.pipeline_mode.value, 
"components": self.config.component_names, } def set_agent(self, agent: Any) -> None: """Associate an agent with this pipeline and configure the orchestrator based on the pipeline mode.""" self.agent = agent # Configure metrics with pipeline info metrics_collector.configure_pipeline( pipeline_mode=self.config.pipeline_mode, realtime_mode=self.config.realtime_mode, active_components=self.config.active_components, ) metrics_collector.set_eou_config(self.eou_config) metrics_collector.set_interrupt_config(self.interrupt_config) if self.config.realtime_mode in (RealtimeMode.HYBRID_STT, RealtimeMode.LLM_ONLY): logger.info(f"Creating orchestrator for {self.config.realtime_mode.value} mode") self.orchestrator = PipelineOrchestrator( agent=agent, stt=self.stt, llm=None, tts=None, vad=self.vad, turn_detector=self.turn_detector, denoise=self.denoise, avatar=None, mode=self.eou_config.mode, min_speech_wait_timeout=self.eou_config.min_max_speech_wait_timeout, interrupt_mode=self.interrupt_config.mode, interrupt_min_duration=self.interrupt_config.interrupt_min_duration, interrupt_min_words=self.interrupt_config.interrupt_min_words, false_interrupt_pause_duration=self.interrupt_config.false_interrupt_pause_duration, resume_on_false_interrupt=self.interrupt_config.resume_on_false_interrupt, conversational_graph=None, max_context_items=self.max_context_items, voice_mail_detector=self.voice_mail_detector, hooks=self.hooks, ) self.orchestrator.on("transcript_ready", self._wrap_async(self._on_transcript_ready_hybrid_stt)) logger.info("Registered hybrid_stt event listener on orchestrator") if isinstance(self.llm, RealtimeLLMAdapter): self.llm.set_agent(agent) elif self.config.realtime_mode == RealtimeMode.HYBRID_TTS: logger.info("Setting up hybrid_tts mode: realtime STT+LLM + external TTS") if hasattr(self._realtime_model, 'audio_track'): self._realtime_model.audio_track = None logger.info("Disconnected realtime model audio track (external TTS will be used)") if self.tts: 
self.speech_generation = SpeechGeneration( agent=agent, tts=self.tts, avatar=self.avatar, hooks=self.hooks, ) if self._realtime_model and not hasattr(self, '_hybrid_tts_listeners_registered'): self._hybrid_tts_listeners_registered = True self._realtime_model.on("realtime_model_transcription", self._wrap_async(self._on_realtime_transcription_hybrid_tts)) logger.info("Registered hybrid_tts event listener for realtime_model_transcription") if isinstance(self.llm, RealtimeLLMAdapter): self.llm.set_agent(agent) elif self.config.realtime_mode == RealtimeMode.FULL_S2S: if isinstance(self.llm, RealtimeLLMAdapter): self.llm.set_agent(agent) elif not self.config.is_realtime: if self.conversational_graph: self.conversational_graph.compile() self.orchestrator = PipelineOrchestrator( agent=agent, stt=self.stt, llm=self.llm, tts=self.tts, vad=self.vad, turn_detector=self.turn_detector, denoise=self.denoise, avatar=self.avatar, mode=self.eou_config.mode, min_speech_wait_timeout=self.eou_config.min_max_speech_wait_timeout, interrupt_mode=self.interrupt_config.mode, interrupt_min_duration=self.interrupt_config.interrupt_min_duration, interrupt_min_words=self.interrupt_config.interrupt_min_words, false_interrupt_pause_duration=self.interrupt_config.false_interrupt_pause_duration, resume_on_false_interrupt=self.interrupt_config.resume_on_false_interrupt, conversational_graph=self.conversational_graph, max_context_items=self.max_context_items, voice_mail_detector=self.voice_mail_detector, hooks=self.hooks, ) self.orchestrator.on("transcript_ready", lambda data: self.emit("transcript_ready", data)) self.orchestrator.on("content_generated", lambda data: self.emit("content_generated", data)) self.orchestrator.on("synthesis_complete", lambda data: self.emit("synthesis_complete", data)) self.orchestrator.on("voicemail_result", lambda data: self.emit("voicemail_result", data)) def _set_loop_and_audio_track(self, loop: asyncio.AbstractEventLoop, audio_track: CustomAudioStreamTrack) -> None: 
"""Set the event loop and audio output track, then configure all pipeline components.""" self.loop = loop self.audio_track = audio_track self._configure_components() async def change_pipeline( self, stt: STT | None = None, llm: LLM | RealtimeBaseModel | None = None, tts: TTS | None = None, vad: VAD | None = None, turn_detector: EOU | None = None, avatar: Any | None = None, denoise: Denoise | None = None, eou_config: EOUConfig | None = None, interrupt_config: InterruptConfig | None = None, conversational_graph: Any | None = None, max_context_items: int | None = None, voice_mail_detector: VoiceMailDetector | None = None, realtime_config: RealtimeConfig | None = None ) -> None: """ Dynamically change pipeline configuration and components. This method allows switching between different modes (Realtime, Cascading, Hybrid) and updating individual components. """ logger.info("Changing pipeline configuration...") if self.orchestrator: await self.orchestrator.interrupt() get_provider_info = self.agent.session._get_provider_info start_time = time.perf_counter() original_pipeline_config = {} if not self.config.is_realtime: if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') original_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.llm: p_class, p_model = get_provider_info(self.llm, 'llm') original_pipeline_config["llm"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') original_pipeline_config["tts"] = {"class": p_class, "model": p_model} if hasattr(self, 'vad') and self.vad: p_class, p_model = get_provider_info(self.vad, 'vad') original_pipeline_config["vad"] = {"class": p_class, "model": p_model} if hasattr(self, 'turn_detector') and self.turn_detector: p_class, p_model = get_provider_info(self.turn_detector, 'eou') original_pipeline_config["eou"] = {"class": p_class, "model": p_model} else: if self._realtime_model: original_pipeline_config["realtime"] = {"class": 
self._realtime_model.__class__.__name__, "model": getattr(self._realtime_model, 'model', '')} if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') original_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') original_pipeline_config["tts"] = {"class": p_class, "model": p_model} original_pipeline_config["pipeline_mode"] = self.config.pipeline_mode.value original_pipeline_config["denoise"] = self.denoise.__class__.__name__ original_pipeline_config["eou_config"] = asdict(self.eou_config) original_pipeline_config["interrupt_config"] = asdict(self.interrupt_config) original_pipeline_config["max_context_items"] = self.max_context_items if self._realtime_model and hasattr(self._realtime_model, 'audio_track'): self._realtime_model.audio_track = None await cleanup_pipeline(self, llm_changing=True) # 2.Update components await swap_component_in_orchestrator( self, 'stt', stt, 'speech_understanding', 'stt_lock', register_stt_transcript_listener ) await swap_tts(self, tts) await swap_component_in_orchestrator(self, 'vad', vad, 'speech_understanding') await swap_component_in_orchestrator(self, 'turn_detector', turn_detector, 'speech_understanding', 'turn_detector_lock') await swap_component_in_orchestrator(self, 'denoise', denoise, 'speech_understanding', 'denoise_lock') if self.avatar and self.avatar != avatar: await self.avatar.aclose() self.avatar = avatar # Update configs if eou_config is not None: self.eou_config = eou_config if interrupt_config is not None: self.interrupt_config = interrupt_config if max_context_items is not None: self.max_context_items = max_context_items if voice_mail_detector is not None: self.voice_mail_detector = voice_mail_detector if realtime_config is not None: self.realtime_config = realtime_config if conversational_graph is not None: self.conversational_graph = conversational_graph if self.conversational_graph and hasattr(self.conversational_graph, 
'compile'): self.conversational_graph.compile() # Update LLM / Realtime Model await swap_llm(self, llm) # 3. REBOOT: Detect mode and restart self.config = build_pipeline_config( stt=self.stt, llm=self.llm, tts=self.tts, vad=self.vad, turn_detector=self.turn_detector, avatar=self.avatar, denoise=self.denoise, realtime_model=self._realtime_model, realtime_config_mode=( self.realtime_config.mode if self.realtime_config and self.realtime_config.mode else None ), ) new_mode = self.config.pipeline_mode.value logger.info(f"New pipeline mode: {new_mode}") if self.agent: logger.info("Restarting pipeline with updated components") self.set_agent(self.agent) self._configure_components() end_time = time.perf_counter() time_data = { "start_time": start_time, "end_time": end_time } new_pipeline_config = {} if not self.config.is_realtime: if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') new_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.llm: p_class, p_model = get_provider_info(self.llm, 'llm') new_pipeline_config["llm"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') new_pipeline_config["tts"] = {"class": p_class, "model": p_model} if hasattr(self, 'vad') and self.vad: p_class, p_model = get_provider_info(self.vad, 'vad') new_pipeline_config["vad"] = {"class": p_class, "model": p_model} if hasattr(self, 'turn_detector') and self.turn_detector: p_class, p_model = get_provider_info(self.turn_detector, 'eou') new_pipeline_config["eou"] = {"class": p_class, "model": p_model} else: if self._realtime_model: new_pipeline_config["realtime"] = {"class": self._realtime_model.__class__.__name__, "model": getattr(self._realtime_model, 'model', '')} if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') new_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') new_pipeline_config["tts"] = {"class": p_class, 
"model": p_model} new_pipeline_config["pipeline_mode"] = self.config.pipeline_mode.value new_pipeline_config["eou_config"] = asdict(self.eou_config) new_pipeline_config["interrupt_config"] = asdict(self.interrupt_config) new_pipeline_config["max_context_items"] = self.max_context_items metrics_collector.traces_flow_manager.create_pipeline_change_trace(time_data, original_pipeline_config, new_pipeline_config) self._setup_error_handlers() await self.start() async def change_component( self, stt: STT | None = NO_CHANGE, llm: LLM | RealtimeBaseModel | None = NO_CHANGE, tts: TTS | None = NO_CHANGE, vad: VAD | None = NO_CHANGE, turn_detector: EOU | None = NO_CHANGE, denoise: Denoise | None = NO_CHANGE, ) -> None: """Dynamically change components. This will close the old components and set the new ones. """ logger.info("Changing pipeline component(s)...") start_time = time.perf_counter() components_change_data = { "new_stt": stt.__class__.__name__ if stt is not NO_CHANGE else None, "new_tts": tts.__class__.__name__ if tts is not NO_CHANGE else None, "new_llm": llm.__class__.__name__ if llm is not NO_CHANGE else None, "new_vad": vad.__class__.__name__ if vad is not NO_CHANGE else None, "new_turn_detector": turn_detector.__class__.__name__ if turn_detector is not NO_CHANGE else None, "new_denoise": denoise.__class__.__name__ if denoise is not NO_CHANGE else None } # 0 Change components only if present earlier validation_map = { 'STT': (stt, self.stt), 'TTS': (tts, self.tts), 'LLM': (llm, self.llm), 'VAD': (vad, self.vad), 'Turn Detector': (turn_detector, self.turn_detector), 'Denoise': (denoise, self.denoise) } for name, (new_val, current_val) in validation_map.items(): if new_val is not NO_CHANGE and current_val is None: raise ValueError( f"Cannot change component '{name}' because it is not present in the current pipeline. " "Use change_pipeline() for full reconfiguration." 
) logger.info(f"Performing swap in {self.config.pipeline_mode.value} mode") # Detect pipeline mode shift mode_shift = check_mode_shift(self, llm, stt, tts) if mode_shift: logger.info("Component change triggers mode shift. Delegating to change_pipeline for full reconfiguration.") # Resolve sentinels to current values for resettlement target_stt = self.stt if stt is NO_CHANGE else stt target_tts = self.tts if tts is NO_CHANGE else tts target_vad = self.vad if vad is NO_CHANGE else vad target_turn_detector = self.turn_detector if turn_detector is NO_CHANGE else turn_detector target_denoise = self.denoise if denoise is NO_CHANGE else denoise if llm is NO_CHANGE: target_llm = self._realtime_model if self._realtime_model else self.llm else: target_llm = llm await self.change_pipeline( stt=target_stt, llm=target_llm, tts=target_tts, vad=target_vad, turn_detector=target_turn_detector, denoise=target_denoise, avatar=self.avatar, eou_config=self.eou_config, interrupt_config=self.interrupt_config, conversational_graph=self.conversational_graph, max_context_items=self.max_context_items, voice_mail_detector=self.voice_mail_detector, realtime_config=self.realtime_config ) return components_change_status = {} if stt is not NO_CHANGE and self.stt != stt: await swap_component_in_orchestrator( self, 'stt', stt, 'speech_understanding', 'stt_lock', register_stt_transcript_listener ) components_change_status["new_stt"] = "success" if llm is not NO_CHANGE and self.llm != llm: await swap_llm(self, llm) components_change_status["new_llm"] = "success" if tts is not NO_CHANGE and self.tts != tts: await swap_tts(self, tts) components_change_status["new_tts"] = "success" if vad is not NO_CHANGE and self.vad != vad: await swap_component_in_orchestrator(self, 'vad', vad, 'speech_understanding') components_change_status["new_vad"] = "success" if turn_detector is not NO_CHANGE and self.turn_detector != turn_detector: await swap_component_in_orchestrator(self, 'turn_detector', turn_detector, 
'speech_understanding', 'turn_detector_lock') components_change_status["new_turn_detector"] = "success" if denoise is not NO_CHANGE and self.denoise != denoise: await swap_component_in_orchestrator(self, 'denoise', denoise, 'speech_understanding', 'denoise_lock') components_change_status["new_denoise"] = "success" # 3. REBOOT: Rebuild config with updated components self.config = build_pipeline_config( stt=self.stt, llm=self.llm, tts=self.tts, vad=self.vad, turn_detector=self.turn_detector, avatar=self.avatar, denoise=self.denoise, realtime_model=self._realtime_model, realtime_config_mode=( self.realtime_config.mode if self.realtime_config and self.realtime_config.mode else None ), ) end_time = time.perf_counter() time_data = { "start_time": start_time, "end_time": end_time } if self._is_realtime_mode: self._configure_components() self._setup_error_handlers() await self.start() metrics_collector.traces_flow_manager.create_components_change_trace(components_change_status, components_change_data, time_data) new_mode = self.config.pipeline_mode.value logger.info(f"New pipeline mode: {new_mode}") return def _configure_components(self) -> None: """Configure pipeline components with the event loop, audio track, and vision settings based on pipeline mode.""" if not self.loop: return job_context = get_current_job_context() if job_context and job_context.room: requested_vision = getattr(job_context.room, 'vision', False) self.vision = requested_vision if requested_vision and self.config.is_realtime: model_name = self._realtime_model.__class__.__name__ if self._realtime_model else "Unknown" if model_name not in ["GeminiRealtime", "OpenAIRealtime"]: logger.warning(f"Vision requested but {model_name} doesn't support video input. 
Disabling vision.") self.vision = False if not self.config.is_realtime and self.tts: self.tts.loop = self.loop if self.avatar and job_context and job_context.room: self.tts.audio_track = getattr(job_context.room, "agent_audio_track", None) or job_context.room.audio_track elif self.audio_track: self.tts.audio_track = self.audio_track if self.tts.audio_track: logger.info(f"TTS audio track configured: {type(self.tts.audio_track).__name__}") # Set hooks on audio track for speech_out processing if hasattr(self.tts.audio_track, 'set_pipeline_hooks'): self.tts.audio_track.set_pipeline_hooks(self.hooks) if self.orchestrator: self.orchestrator.set_audio_track(self.tts.audio_track) if self.config.is_realtime and self._realtime_model: self._realtime_model.loop = self.loop audio_track = None if self.avatar and job_context and job_context.room: audio_track = getattr(job_context.room, 'agent_audio_track', None) or job_context.room.audio_track elif self.audio_track: audio_track = self.audio_track if self.config.realtime_mode == RealtimeMode.HYBRID_TTS and self.tts: self._realtime_model.audio_track = None self.tts.audio_track = audio_track self.tts.loop = self.loop logger.info("hybrid_tts: Audio track connected to external TTS, disconnected from realtime model") if self.tts.audio_track and hasattr(self.tts.audio_track, 'set_pipeline_hooks'): self.tts.audio_track.set_pipeline_hooks(self.hooks) else: self._realtime_model.audio_track = audio_track if self._realtime_model.audio_track and hasattr(self._realtime_model.audio_track, 'set_pipeline_hooks'): self._realtime_model.audio_track.set_pipeline_hooks(self.hooks) async def _audio_track_callback(): self._realtime_model.emit("agent_speech_ended", {}) self._on_agent_speech_ended_realtime({}) self._realtime_model.audio_track.on_last_audio_byte(_audio_track_callback) def set_wake_up_callback(self, callback: Callable[[], None]) -> None: """Set a callback to be invoked when user speech is first detected.""" self._wake_up_callback = callback 
def _notify_speech_started(self) -> None: """Notify that user speech started (triggers wake-up)""" if self._wake_up_callback: self._wake_up_callback() async def start(self, **kwargs: Any) -> None: """ Start the pipeline processing. Args: **kwargs: Additional arguments for pipeline configuration """ logger.info( f"Starting pipeline | mode={self.config.pipeline_mode.value} " f"| realtime={self.config.realtime_mode.value if self.config.realtime_mode else 'none'} " f"| components={self.config.component_names}" ) if self.config.is_realtime: if self._realtime_model: await self._realtime_model.connect() if isinstance(self.llm, RealtimeLLMAdapter): self.llm.on_user_speech_started(lambda data: self._on_user_speech_started_realtime(data)) self.llm.on_user_speech_ended(lambda data: asyncio.create_task(self._on_user_speech_ended_realtime(data))) self.llm.on_agent_speech_started(lambda data: asyncio.create_task(self._on_agent_speech_started_realtime(data))) # self.llm.on_agent_speech_ended(lambda data: self._on_agent_speech_ended_realtime(data)) self.llm.on_transcription(self._on_realtime_transcription) if self.config.realtime_mode == RealtimeMode.HYBRID_STT and self.orchestrator: await self.orchestrator.start() logger.info("Started orchestrator for hybrid_stt mode") else: if self.orchestrator: await self.orchestrator.start() async def send_message(self, message: str, handle: UtteranceHandle) -> None: """ Send a message to the pipeline. 
Args: message: Message text to send handle: Utterance handle to track """ self._current_utterance_handle = handle if self.config.is_realtime: if isinstance(self.llm, RealtimeLLMAdapter): self.llm.current_utterance = handle try: await self.llm.send_message(message) except Exception as e: logger.error(f"Error sending message: {e}") handle._mark_done() else: if self.orchestrator: await self.orchestrator.say(message, handle) else: logger.warning("No orchestrator available") handle._mark_done() async def send_text_message(self, message: str) -> None: """ Send a text message (for A2A or text-only scenarios). Args: message: Text message to send """ if self.config.is_realtime: if isinstance(self.llm, RealtimeLLMAdapter): await self.llm.send_text_message(message) else: if self.orchestrator: await self.orchestrator.process_text(message) async def on_audio_delta(self, audio_data: bytes) -> None: """ Handle incoming audio data from the user. Args: audio_data: Raw audio bytes """ if self.config.realtime_mode == RealtimeMode.HYBRID_STT and self.orchestrator: await self.orchestrator.process_audio(audio_data) elif self.config.is_realtime: if isinstance(self.llm, RealtimeLLMAdapter): await self.llm.handle_audio_input(audio_data) else: if self.orchestrator: await self.orchestrator.process_audio(audio_data) if not hasattr(self, '_first_audio_logged'): self._first_audio_logged = True if self.config.realtime_mode == RealtimeMode.HYBRID_STT: logger.info("Audio routing: hybrid_stt → orchestrator (external STT)") elif self.config.is_realtime: logger.info("Audio routing: realtime mode → realtime model") else: logger.info("Audio routing: traditional mode → orchestrator") async def on_video_delta(self, video_data: av.VideoFrame) -> None: """ Handle incoming video data from the user. 
Args: video_data: Video frame """ if not self.vision: return if self._vision_lock.locked(): return # Process through vision_frame hook if available if self.hooks and self.hooks.has_vision_frame_hooks(): async def frame_stream(): yield video_data processed_stream = self.hooks.process_vision_frame(frame_stream()) async for processed_frame in processed_stream: video_data = processed_frame self._recent_frames.append(video_data) if len(self._recent_frames) > self._max_frames_buffer: self._recent_frames.pop(0) if self.config.is_realtime: if isinstance(self.llm, RealtimeLLMAdapter): await self.llm.handle_video_input(video_data) def get_latest_frames(self, num_frames: int = 1) -> list[av.VideoFrame]: """ Get the latest video frames from the pipeline. Args: num_frames: Number of frames to retrieve (default: 1, max: 5) Returns: List of VideoFrame objects """ if not self.vision: logger.warning("Vision not enabled") return [] num_frames = max(1, min(num_frames, self._max_frames_buffer)) if not self._recent_frames: return [] return self._recent_frames[-num_frames:] def interrupt(self) -> None: """Interrupt the current agent speech, cancelling ongoing generation and playback if interruptible.""" if self.config.is_realtime: if self._realtime_model: if self._realtime_model.current_utterance and not self._realtime_model.current_utterance.is_interruptible: logger.info("Interruption disabled for current utterance") return asyncio.create_task(self._realtime_model.interrupt()) if self.config.realtime_mode == RealtimeMode.HYBRID_TTS and self.speech_generation: asyncio.create_task(self.speech_generation.interrupt()) if self.avatar and hasattr(self.avatar, 'interrupt'): asyncio.create_task(self.avatar.interrupt()) if self._current_utterance_handle and not self._current_utterance_handle.done(): if self._current_utterance_handle.is_interruptible: self._current_utterance_handle.interrupt() else: if self.orchestrator: asyncio.create_task(self.orchestrator.interrupt()) if self.avatar and 
hasattr(self.avatar, 'interrupt'): asyncio.create_task(self.avatar.interrupt()) async def reply_with_context( self, instructions: str, wait_for_playback: bool, handle: UtteranceHandle, frames: list[av.VideoFrame] | None = None ) -> None: """ Generate a reply using instructions and current chat context. Args: instructions: Instructions to add to context wait_for_playback: If True, wait for playback to complete handle: Utterance handle frames: Optional video frames for vision """ self._current_utterance_handle = handle if self.config.is_realtime: if isinstance(self.llm, RealtimeLLMAdapter): self.llm.current_utterance = handle if frames and hasattr(self.llm, 'send_message_with_frames'): async with self._vision_lock: await self.llm.send_message_with_frames(instructions, frames) else: await self.llm.send_text_message(instructions) else: if self.orchestrator: await self.orchestrator.reply_with_context(instructions, wait_for_playback, handle, frames) else: logger.warning("No orchestrator available") handle._mark_done() def _on_user_speech_started_realtime(self, data: dict) -> None: """Handle user speech started in realtime mode""" self._notify_speech_started() metrics_collector.on_user_speech_start() if self.config.realtime_mode == RealtimeMode.HYBRID_TTS and self.speech_generation: asyncio.create_task(self.speech_generation.interrupt()) if self.agent and self.agent.session: self.agent.session._emit_user_state(UserState.SPEAKING) self.agent.session._emit_agent_state(AgentState.LISTENING) async def _on_user_speech_ended_realtime(self, data: dict) -> None: """Handle user speech ended in realtime mode""" metrics_collector.on_user_speech_end() if self.agent and self.agent.session: self.agent.session._emit_user_state(UserState.IDLE) self.agent.session._emit_agent_state(AgentState.THINKING) if self.agent.session.is_background_audio_enabled: await self.agent.session.start_thinking_audio() async def _on_agent_speech_started_realtime(self, data: dict) -> None: """Handle agent 
speech started in realtime mode""" metrics_collector.on_agent_speech_start() if self.agent and self.agent.session: self.agent.session._emit_agent_state(AgentState.SPEAKING) self.agent.session._emit_user_state(UserState.LISTENING) if self.agent.session.is_background_audio_enabled: await self.agent.session.stop_thinking_audio() def _on_agent_speech_ended_realtime(self, data: dict) -> None: """Handle agent speech ended in realtime mode""" metrics_collector.on_agent_speech_end() metrics_collector.schedule_turn_complete(timeout=1.0) if self.agent: self.agent.session._emit_user_state(UserState.IDLE) self.agent.session._emit_agent_state(AgentState.IDLE) if self._current_utterance_handle and not self._current_utterance_handle.done(): self._current_utterance_handle._mark_done() if self._realtime_model: self._realtime_model.current_utterance = None if self.avatar and hasattr(self.avatar, 'send_segment_end'): asyncio.create_task(self.avatar.send_segment_end()) if self.agent and hasattr(self.agent, 'on_agent_speech_ended'): self.agent.on_agent_speech_ended(data) def _on_realtime_transcription(self, data: dict) -> None: """Handle realtime model transcription""" self.emit("realtime_model_transcription", data) if self.voice_mail_detector: pass def set_voice_mail_detector(self, detector: VoiceMailDetector | None) -> None: """Set or replace the voicemail detector on the pipeline and its orchestrator.""" self.voice_mail_detector = detector if self.orchestrator: self.orchestrator.set_voice_mail_detector(detector) async def process_text(self, text: str) -> None: """ Process text input directly (bypasses STT). 
Args: text: User text input """ if self.config.is_realtime: if isinstance(self.llm, RealtimeLLMAdapter): await self.llm.send_text_message(text) else: if self.orchestrator: await self.orchestrator.process_text(text) else: logger.warning("No orchestrator available for text processing") def get_component_configs(self) -> Dict[str, Dict[str, Any]]: """Return a dictionary of public configuration attributes for each active pipeline component.""" configs: Dict[str, Dict[str, Any]] = {} for comp_name, comp in [ ("stt", self.stt), ("llm", self.llm if not self.config.is_realtime else self._realtime_model), ("tts", self.tts), ("vad", self.vad), ("eou", self.turn_detector), ]: if comp: try: configs[comp_name] = { k: v for k, v in comp.__dict__.items() if not k.startswith("_") and not callable(v) } except Exception: configs[comp_name] = {} return configs async def cleanup(self) -> None: """Release all pipeline resources, close components, and reset internal state.""" logger.info("Cleaning up pipeline") if self.config.is_realtime: if self._realtime_model: await self._realtime_model.aclose() self._realtime_model = None if self.avatar: if hasattr(self.avatar, 'cleanup'): await self.avatar.cleanup() elif hasattr(self.avatar, 'aclose'): await self.avatar.aclose() self.avatar = None if self.denoise: await self.denoise.aclose() self.denoise = None else: if self.stt: await self.stt.aclose() self.stt = None if self.llm and not isinstance(self.llm, RealtimeLLMAdapter): await self.llm.aclose() self.llm = None if self.tts: await self.tts.aclose() self.tts = None if self.vad: await self.vad.aclose() self.vad = None if self.turn_detector: await self.turn_detector.aclose() self.turn_detector = None if self.denoise: await self.denoise.aclose() self.denoise = None if self.avatar: if hasattr(self.avatar, 'cleanup'): await self.avatar.cleanup() elif hasattr(self.avatar, 'aclose'): await self.avatar.aclose() self.avatar = None if self.orchestrator: await self.orchestrator.cleanup() 
self.orchestrator = None self.agent = None self.vision = False self.loop = None self.audio_track = None self._wake_up_callback = None self._recent_frames = [] self._current_utterance_handle = None logger.info("Pipeline cleaned up") async def leave(self) -> None: """Leave the pipeline by performing a full cleanup of all resources.""" await self.cleanup()Unified Pipeline class supporting multiple component configurations.
Supports:
- Full cascading: VAD → STT → Turn Detector → LLM → TTS
- Partial cascading: Any subset of components
- Realtime: Speech-to-speech models (OpenAI Realtime, Gemini Live)
- Hybrid: Components + user event callbacks
Args
stt: Speech-to-Text processor (optional)
llm: Language Model or RealtimeBaseModel (optional)
tts: Text-to-Speech processor (optional)
vad: Voice Activity Detector (optional)
turn_detector: End-of-Utterance detector (optional)
avatar: Avatar for visual output (optional)
denoise: Audio denoiser (optional)
eou_config: End-of-utterance configuration
interrupt_config: Interruption configuration
conversational_graph: Conversational graph for structured dialogs (optional)
max_context_items: Maximum number of chat context items (auto-truncates when exceeded)
voice_mail_detector: Voicemail detector (optional)
Ancestors
- EventEmitter
- typing.Generic
Instance variables
prop realtime_mode : str | None
Expand source code
@property
def realtime_mode(self) -> str | None:
    """Backwards-compatible alias. Returns the string value or None."""
    return self.config.realtime_mode.value if self.config.realtime_mode else None

Backwards-compatible alias. Returns the string value or None.
Methods
async def change_component(self,
stt: STT | None = <object object>,
llm: LLM | RealtimeBaseModel | None = <object object>,
tts: TTS | None = <object object>,
vad: VAD | None = <object object>,
turn_detector: EOU | None = <object object>,
denoise: Denoise | None = <object object>) ‑> None
Expand source code
async def change_component( self, stt: STT | None = NO_CHANGE, llm: LLM | RealtimeBaseModel | None = NO_CHANGE, tts: TTS | None = NO_CHANGE, vad: VAD | None = NO_CHANGE, turn_detector: EOU | None = NO_CHANGE, denoise: Denoise | None = NO_CHANGE, ) -> None: """Dynamically change components. This will close the old components and set the new ones. """ logger.info("Changing pipeline component(s)...") start_time = time.perf_counter() components_change_data = { "new_stt": stt.__class__.__name__ if stt is not NO_CHANGE else None, "new_tts": tts.__class__.__name__ if tts is not NO_CHANGE else None, "new_llm": llm.__class__.__name__ if llm is not NO_CHANGE else None, "new_vad": vad.__class__.__name__ if vad is not NO_CHANGE else None, "new_turn_detector": turn_detector.__class__.__name__ if turn_detector is not NO_CHANGE else None, "new_denoise": denoise.__class__.__name__ if denoise is not NO_CHANGE else None } # 0 Change components only if present earlier validation_map = { 'STT': (stt, self.stt), 'TTS': (tts, self.tts), 'LLM': (llm, self.llm), 'VAD': (vad, self.vad), 'Turn Detector': (turn_detector, self.turn_detector), 'Denoise': (denoise, self.denoise) } for name, (new_val, current_val) in validation_map.items(): if new_val is not NO_CHANGE and current_val is None: raise ValueError( f"Cannot change component '{name}' because it is not present in the current pipeline. " "Use change_pipeline() for full reconfiguration." ) logger.info(f"Performing swap in {self.config.pipeline_mode.value} mode") # Detect pipeline mode shift mode_shift = check_mode_shift(self, llm, stt, tts) if mode_shift: logger.info("Component change triggers mode shift. 
Delegating to change_pipeline for full reconfiguration.") # Resolve sentinels to current values for resettlement target_stt = self.stt if stt is NO_CHANGE else stt target_tts = self.tts if tts is NO_CHANGE else tts target_vad = self.vad if vad is NO_CHANGE else vad target_turn_detector = self.turn_detector if turn_detector is NO_CHANGE else turn_detector target_denoise = self.denoise if denoise is NO_CHANGE else denoise if llm is NO_CHANGE: target_llm = self._realtime_model if self._realtime_model else self.llm else: target_llm = llm await self.change_pipeline( stt=target_stt, llm=target_llm, tts=target_tts, vad=target_vad, turn_detector=target_turn_detector, denoise=target_denoise, avatar=self.avatar, eou_config=self.eou_config, interrupt_config=self.interrupt_config, conversational_graph=self.conversational_graph, max_context_items=self.max_context_items, voice_mail_detector=self.voice_mail_detector, realtime_config=self.realtime_config ) return components_change_status = {} if stt is not NO_CHANGE and self.stt != stt: await swap_component_in_orchestrator( self, 'stt', stt, 'speech_understanding', 'stt_lock', register_stt_transcript_listener ) components_change_status["new_stt"] = "success" if llm is not NO_CHANGE and self.llm != llm: await swap_llm(self, llm) components_change_status["new_llm"] = "success" if tts is not NO_CHANGE and self.tts != tts: await swap_tts(self, tts) components_change_status["new_tts"] = "success" if vad is not NO_CHANGE and self.vad != vad: await swap_component_in_orchestrator(self, 'vad', vad, 'speech_understanding') components_change_status["new_vad"] = "success" if turn_detector is not NO_CHANGE and self.turn_detector != turn_detector: await swap_component_in_orchestrator(self, 'turn_detector', turn_detector, 'speech_understanding', 'turn_detector_lock') components_change_status["new_turn_detector"] = "success" if denoise is not NO_CHANGE and self.denoise != denoise: await swap_component_in_orchestrator(self, 'denoise', denoise, 
'speech_understanding', 'denoise_lock') components_change_status["new_denoise"] = "success" # 3. REBOOT: Rebuild config with updated components self.config = build_pipeline_config( stt=self.stt, llm=self.llm, tts=self.tts, vad=self.vad, turn_detector=self.turn_detector, avatar=self.avatar, denoise=self.denoise, realtime_model=self._realtime_model, realtime_config_mode=( self.realtime_config.mode if self.realtime_config and self.realtime_config.mode else None ), ) end_time = time.perf_counter() time_data = { "start_time": start_time, "end_time": end_time } if self._is_realtime_mode: self._configure_components() self._setup_error_handlers() await self.start() metrics_collector.traces_flow_manager.create_components_change_trace(components_change_status, components_change_data, time_data) new_mode = self.config.pipeline_mode.value logger.info(f"New pipeline mode: {new_mode}") returnDynamically change components. This will close the old components and set the new ones.
async def change_pipeline(self,
stt: STT | None = None,
llm: LLM | RealtimeBaseModel | None = None,
tts: TTS | None = None,
vad: VAD | None = None,
turn_detector: EOU | None = None,
avatar: typing.Any | None = None,
denoise: Denoise | None = None,
eou_config: EOUConfig | None = None,
interrupt_config: InterruptConfig | None = None,
conversational_graph: typing.Any | None = None,
max_context_items: int | None = None,
voice_mail_detector: VoiceMailDetector | None = None,
realtime_config: RealtimeConfig | None = None) ‑> None
Expand source code
async def change_pipeline( self, stt: STT | None = None, llm: LLM | RealtimeBaseModel | None = None, tts: TTS | None = None, vad: VAD | None = None, turn_detector: EOU | None = None, avatar: Any | None = None, denoise: Denoise | None = None, eou_config: EOUConfig | None = None, interrupt_config: InterruptConfig | None = None, conversational_graph: Any | None = None, max_context_items: int | None = None, voice_mail_detector: VoiceMailDetector | None = None, realtime_config: RealtimeConfig | None = None ) -> None: """ Dynamically change pipeline configuration and components. This method allows switching between different modes (Realtime, Cascading, Hybrid) and updating individual components. """ logger.info("Changing pipeline configuration...") if self.orchestrator: await self.orchestrator.interrupt() get_provider_info = self.agent.session._get_provider_info start_time = time.perf_counter() original_pipeline_config = {} if not self.config.is_realtime: if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') original_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.llm: p_class, p_model = get_provider_info(self.llm, 'llm') original_pipeline_config["llm"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') original_pipeline_config["tts"] = {"class": p_class, "model": p_model} if hasattr(self, 'vad') and self.vad: p_class, p_model = get_provider_info(self.vad, 'vad') original_pipeline_config["vad"] = {"class": p_class, "model": p_model} if hasattr(self, 'turn_detector') and self.turn_detector: p_class, p_model = get_provider_info(self.turn_detector, 'eou') original_pipeline_config["eou"] = {"class": p_class, "model": p_model} else: if self._realtime_model: original_pipeline_config["realtime"] = {"class": self._realtime_model.__class__.__name__, "model": getattr(self._realtime_model, 'model', '')} if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') 
original_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') original_pipeline_config["tts"] = {"class": p_class, "model": p_model} original_pipeline_config["pipeline_mode"] = self.config.pipeline_mode.value original_pipeline_config["denoise"] = self.denoise.__class__.__name__ original_pipeline_config["eou_config"] = asdict(self.eou_config) original_pipeline_config["interrupt_config"] = asdict(self.interrupt_config) original_pipeline_config["max_context_items"] = self.max_context_items if self._realtime_model and hasattr(self._realtime_model, 'audio_track'): self._realtime_model.audio_track = None await cleanup_pipeline(self, llm_changing=True) # 2.Update components await swap_component_in_orchestrator( self, 'stt', stt, 'speech_understanding', 'stt_lock', register_stt_transcript_listener ) await swap_tts(self, tts) await swap_component_in_orchestrator(self, 'vad', vad, 'speech_understanding') await swap_component_in_orchestrator(self, 'turn_detector', turn_detector, 'speech_understanding', 'turn_detector_lock') await swap_component_in_orchestrator(self, 'denoise', denoise, 'speech_understanding', 'denoise_lock') if self.avatar and self.avatar != avatar: await self.avatar.aclose() self.avatar = avatar # Update configs if eou_config is not None: self.eou_config = eou_config if interrupt_config is not None: self.interrupt_config = interrupt_config if max_context_items is not None: self.max_context_items = max_context_items if voice_mail_detector is not None: self.voice_mail_detector = voice_mail_detector if realtime_config is not None: self.realtime_config = realtime_config if conversational_graph is not None: self.conversational_graph = conversational_graph if self.conversational_graph and hasattr(self.conversational_graph, 'compile'): self.conversational_graph.compile() # Update LLM / Realtime Model await swap_llm(self, llm) # 3. 
REBOOT: Detect mode and restart self.config = build_pipeline_config( stt=self.stt, llm=self.llm, tts=self.tts, vad=self.vad, turn_detector=self.turn_detector, avatar=self.avatar, denoise=self.denoise, realtime_model=self._realtime_model, realtime_config_mode=( self.realtime_config.mode if self.realtime_config and self.realtime_config.mode else None ), ) new_mode = self.config.pipeline_mode.value logger.info(f"New pipeline mode: {new_mode}") if self.agent: logger.info("Restarting pipeline with updated components") self.set_agent(self.agent) self._configure_components() end_time = time.perf_counter() time_data = { "start_time": start_time, "end_time": end_time } new_pipeline_config = {} if not self.config.is_realtime: if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') new_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.llm: p_class, p_model = get_provider_info(self.llm, 'llm') new_pipeline_config["llm"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') new_pipeline_config["tts"] = {"class": p_class, "model": p_model} if hasattr(self, 'vad') and self.vad: p_class, p_model = get_provider_info(self.vad, 'vad') new_pipeline_config["vad"] = {"class": p_class, "model": p_model} if hasattr(self, 'turn_detector') and self.turn_detector: p_class, p_model = get_provider_info(self.turn_detector, 'eou') new_pipeline_config["eou"] = {"class": p_class, "model": p_model} else: if self._realtime_model: new_pipeline_config["realtime"] = {"class": self._realtime_model.__class__.__name__, "model": getattr(self._realtime_model, 'model', '')} if self.stt: p_class, p_model = get_provider_info(self.stt, 'stt') new_pipeline_config["stt"] = {"class": p_class, "model": p_model} if self.tts: p_class, p_model = get_provider_info(self.tts, 'tts') new_pipeline_config["tts"] = {"class": p_class, "model": p_model} new_pipeline_config["pipeline_mode"] = self.config.pipeline_mode.value 
new_pipeline_config["eou_config"] = asdict(self.eou_config) new_pipeline_config["interrupt_config"] = asdict(self.interrupt_config) new_pipeline_config["max_context_items"] = self.max_context_items metrics_collector.traces_flow_manager.create_pipeline_change_trace(time_data, original_pipeline_config, new_pipeline_config) self._setup_error_handlers() await self.start()Dynamically change pipeline configuration and components.
This method allows switching between different modes (Realtime, Cascading, Hybrid) and updating individual components.
async def cleanup(self) ‑> None-
Expand source code
async def cleanup(self) -> None:
    """Release all pipeline resources, close components, and reset internal state.

    Closes the mode-appropriate set of components (realtime model vs. the
    cascading STT/LLM/TTS/VAD stack), then the orchestrator, and finally
    nulls out all per-session state so the pipeline object can be dropped.
    """
    logger.info("Cleaning up pipeline")
    if self.config.is_realtime:
        # Realtime mode: the realtime model owns STT/LLM/TTS internally,
        # so only it, the avatar, and the denoiser need closing.
        if self._realtime_model:
            await self._realtime_model.aclose()
            self._realtime_model = None
        if self.avatar:
            # Avatars vary: prefer cleanup() if present, else fall back to aclose().
            if hasattr(self.avatar, 'cleanup'):
                await self.avatar.cleanup()
            elif hasattr(self.avatar, 'aclose'):
                await self.avatar.aclose()
            self.avatar = None
        if self.denoise:
            await self.denoise.aclose()
            self.denoise = None
    else:
        # Cascading mode: close each discrete component that is present.
        if self.stt:
            await self.stt.aclose()
            self.stt = None
        # A RealtimeLLMAdapter wraps the realtime model, which is closed above
        # in realtime mode — only close genuinely standalone LLMs here.
        if self.llm and not isinstance(self.llm, RealtimeLLMAdapter):
            await self.llm.aclose()
            self.llm = None
        if self.tts:
            await self.tts.aclose()
            self.tts = None
        if self.vad:
            await self.vad.aclose()
            self.vad = None
        if self.turn_detector:
            await self.turn_detector.aclose()
            self.turn_detector = None
        if self.denoise:
            await self.denoise.aclose()
            self.denoise = None
        if self.avatar:
            if hasattr(self.avatar, 'cleanup'):
                await self.avatar.cleanup()
            elif hasattr(self.avatar, 'aclose'):
                await self.avatar.aclose()
            self.avatar = None
    # Orchestrator last: it references the components closed above.
    if self.orchestrator:
        await self.orchestrator.cleanup()
        self.orchestrator = None
    # Reset remaining per-session state.
    self.agent = None
    self.vision = False
    self.loop = None
    self.audio_track = None
    self._wake_up_callback = None
    self._recent_frames = []
    self._current_utterance_handle = None
    logger.info("Pipeline cleaned up")
def get_component_configs(self) ‑> Dict[str, Dict[str, Any]]-
Expand source code
def get_component_configs(self) -> Dict[str, Dict[str, Any]]:
    """Return a dictionary of public configuration attributes for each active pipeline component."""
    # In realtime mode the "llm" slot is backed by the realtime model.
    llm_like = self._realtime_model if self.config.is_realtime else self.llm
    slots = (
        ("stt", self.stt),
        ("llm", llm_like),
        ("tts", self.tts),
        ("vad", self.vad),
        ("eou", self.turn_detector),
    )
    snapshot: Dict[str, Dict[str, Any]] = {}
    for name, component in slots:
        if not component:
            continue
        try:
            # Public, non-callable instance attributes only.
            public_attrs = {
                key: value
                for key, value in vars(component).items()
                if not key.startswith("_") and not callable(value)
            }
        except Exception:
            public_attrs = {}
        snapshot[name] = public_attrs
    return snapshot
def get_latest_frames(self, num_frames: int = 1) ‑> list[av.video.frame.VideoFrame]-
Expand source code
def get_latest_frames(self, num_frames: int = 1) -> list[av.VideoFrame]:
    """
    Get the latest video frames from the pipeline.

    Args:
        num_frames: Number of frames to retrieve (default: 1, max: 5)

    Returns:
        List of VideoFrame objects
    """
    if not self.vision:
        logger.warning("Vision not enabled")
        return []
    if not self._recent_frames:
        return []
    # Clamp the request to [1, buffer capacity] before slicing the tail.
    count = min(max(num_frames, 1), self._max_frames_buffer)
    return self._recent_frames[-count:]
Args
num_frames- Number of frames to retrieve (default: 1, max: 5)
Returns
List of VideoFrame objects
def get_session_metrics_snapshot(self) ‑> dict-
Expand source code
def get_session_metrics_snapshot(self) -> dict:
    """Return dict suitable for populating SessionMetrics fields."""
    cfg = self.config
    return dict(
        pipeline_type=cfg.pipeline_mode.value,
        components=cfg.component_names,
    )
def interrupt(self) ‑> None-
Expand source code
def interrupt(self) -> None:
    """Interrupt the current agent speech, cancelling ongoing generation and playback if interruptible.

    Fire-and-forget: the actual interruption runs as asyncio tasks; this
    method returns immediately.
    """
    if self.config.is_realtime:
        if self._realtime_model:
            # Honour a non-interruptible utterance flag set by the model.
            if self._realtime_model.current_utterance and not self._realtime_model.current_utterance.is_interruptible:
                logger.info("Interruption disabled for current utterance")
                return
            asyncio.create_task(self._realtime_model.interrupt())
            # hybrid_tts: external TTS playback is driven by speech_generation,
            # so it must be interrupted alongside the realtime model.
            if self.config.realtime_mode == RealtimeMode.HYBRID_TTS and self.speech_generation:
                asyncio.create_task(self.speech_generation.interrupt())
            if self.avatar and hasattr(self.avatar, 'interrupt'):
                asyncio.create_task(self.avatar.interrupt())
        # Also cancel any locally tracked utterance handle, if it allows it.
        # NOTE(review): reconstructed nesting — presumably a sibling of the
        # realtime-model branch above; confirm against the original source.
        if self._current_utterance_handle and not self._current_utterance_handle.done():
            if self._current_utterance_handle.is_interruptible:
                self._current_utterance_handle.interrupt()
    else:
        # Cascading mode: the orchestrator owns generation/playback.
        if self.orchestrator:
            asyncio.create_task(self.orchestrator.interrupt())
        if self.avatar and hasattr(self.avatar, 'interrupt'):
            asyncio.create_task(self.avatar.interrupt())
async def leave(self) ‑> None-
Expand source code
async def leave(self) -> None:
    """Leave the pipeline by performing a full cleanup of all resources."""
    # Leaving is simply a complete teardown of every owned component.
    await self.cleanup()
def on(self,
event: Literal['speech_in', 'speech_out', 'agents.stt', 'agents.llm', 'agents.tts', 'vision_frame', 'user_turn_start', 'user_turn_end', 'agent_turn_start', 'agent_turn_end'] | str,
callback: Callable | None = None) ‑> Callable-
Expand source code
def on(
    self,
    event: Literal["speech_in", "speech_out", "stt", "llm", "tts", "vision_frame", "user_turn_start", "user_turn_end", "agent_turn_start", "agent_turn_end"] | str,
    callback: Callable | None = None
) -> Callable:
    """
    Register a listener for pipeline events or a hook for processing stages.

    Can be used as a decorator or with a callback.

    Supported hooks (decorator only):
    - stt: STT processing (async iterator: audio -> events)
    - tts: TTS processing (async iterator: text -> audio)
    - llm: Called when LLM generates content. Return/yield str to modify, return None to observe.
    - vision_frame: Process video frames when vision is enabled (async iterator)
    - user_turn_start / user_turn_end: user turn boundaries
    - agent_turn_start / agent_turn_end: agent turn boundaries

    Supported events (listener):
    - transcript_ready
    - synthesis_complete
    - error

    Examples:
        @pipeline.on("llm")
        async def on_llm(data):
            print(f"LLM generated: {data['text']}")

        @pipeline.on("llm")
        async def modify_response(data):
            text = data.get("text", "")
            yield text.replace("SSN", "[REDACTED]")
    """
    hook_events = ("stt", "tts", "llm", "vision_frame", "user_turn_start", "user_turn_end", "agent_turn_start", "agent_turn_end")
    if event in hook_events:
        registrar = self.hooks.on(event)
        # With a callback: register immediately. Without: act as a decorator.
        return registrar(callback) if callback else registrar
    # Anything else is a plain event listener handled by the emitter base.
    return super().on(event, callback)
Can be used as a decorator or with a callback.
Supported hooks (decorator only): - stt: STT processing (async iterator: audio -> events) - tts: TTS processing (async iterator: text -> audio) - llm: Called when LLM generates content. Return/yield str to modify, return None to observe. - vision_frame: Process video frames when vision is enabled (async iterator) - user_turn_start: Called when user turn starts - user_turn_end: Called when user turn ends - agent_turn_start: Called when agent processing starts - agent_turn_end: Called when agent finishes speaking
Supported events (listener): - transcript_ready - synthesis_complete - error
Examples
@pipeline.on("llm") async def on_llm(data): print(f"LLM generated: {data['text']}")
@pipeline.on("llm") async def modify_response(data): text = data.get("text", "") yield text.replace("SSN", "[REDACTED]")
async def on_audio_delta(self, audio_data: bytes) ‑> None-
Expand source code
async def on_audio_delta(self, audio_data: bytes) -> None:
    """
    Handle incoming audio data from the user.

    Routes raw audio to either the orchestrator (cascading / hybrid_stt)
    or the realtime model (realtime modes), and logs the chosen route once.

    Args:
        audio_data: Raw audio bytes
    """
    # hybrid_stt must be checked first: it is a realtime mode but audio
    # goes to the external STT via the orchestrator, not the realtime model.
    if self.config.realtime_mode == RealtimeMode.HYBRID_STT and self.orchestrator:
        await self.orchestrator.process_audio(audio_data)
    elif self.config.is_realtime:
        if isinstance(self.llm, RealtimeLLMAdapter):
            await self.llm.handle_audio_input(audio_data)
    else:
        if self.orchestrator:
            await self.orchestrator.process_audio(audio_data)
    # One-time log of the routing decision on the first audio chunk.
    if not hasattr(self, '_first_audio_logged'):
        self._first_audio_logged = True
        if self.config.realtime_mode == RealtimeMode.HYBRID_STT:
            logger.info("Audio routing: hybrid_stt → orchestrator (external STT)")
        elif self.config.is_realtime:
            logger.info("Audio routing: realtime mode → realtime model")
        else:
            logger.info("Audio routing: traditional mode → orchestrator")
Args
audio_data- Raw audio bytes
def on_component_error(self, source: str, error_data: Any) ‑> None-
Expand source code
def on_component_error(self, source: str, error_data: Any) -> None:
    """Handle error events from components: log, record in metrics, re-emit."""
    logger.error(f"[{source}] Component error: {error_data}")
    metrics_collector.add_error(source, error_data)
    # Fan the error out to any "error" listeners registered on the pipeline.
    payload = {"source": source, "error": str(error_data)}
    self.emit("error", payload)
async def on_video_delta(self, video_data: av.video.frame.VideoFrame) ‑> None-
Expand source code
async def on_video_delta(self, video_data: av.VideoFrame) -> None:
    """
    Handle incoming video data from the user.

    Optionally transforms the frame via vision_frame hooks, buffers the
    most recent frames, and forwards to the realtime model when active.

    Args:
        video_data: Video frame
    """
    if not self.vision:
        return
    # Drop frames while a vision-consuming operation holds the lock.
    if self._vision_lock.locked():
        return
    # Process through vision_frame hook if available
    if self.hooks and self.hooks.has_vision_frame_hooks():
        async def frame_stream():
            yield video_data
        processed_stream = self.hooks.process_vision_frame(frame_stream())
        # The hook may yield a transformed frame; keep the last one yielded.
        async for processed_frame in processed_stream:
            video_data = processed_frame
    # Keep a bounded ring of recent frames for get_latest_frames().
    # NOTE(review): reconstructed placement — presumably buffering happens
    # whether or not hooks ran; confirm against the original source.
    self._recent_frames.append(video_data)
    if len(self._recent_frames) > self._max_frames_buffer:
        self._recent_frames.pop(0)
    if self.config.is_realtime:
        if isinstance(self.llm, RealtimeLLMAdapter):
            await self.llm.handle_video_input(video_data)
Args
video_data- Video frame
async def process_text(self, text: str) ‑> None-
Expand source code
async def process_text(self, text: str) -> None:
    """
    Process text input directly (bypasses STT).

    Args:
        text: User text input
    """
    if self.config.is_realtime:
        # Realtime path: only the adapter knows how to inject text.
        if isinstance(self.llm, RealtimeLLMAdapter):
            await self.llm.send_text_message(text)
        return
    if self.orchestrator:
        await self.orchestrator.process_text(text)
    else:
        logger.warning("No orchestrator available for text processing")
Args
text- User text input
async def reply_with_context(self,
instructions: str,
wait_for_playback: bool,
handle: UtteranceHandle,
frames: list[av.video.frame.VideoFrame] | None = None) ‑> None-
Expand source code
async def reply_with_context(
    self,
    instructions: str,
    wait_for_playback: bool,
    handle: UtteranceHandle,
    frames: list[av.VideoFrame] | None = None
) -> None:
    """
    Generate a reply using instructions and current chat context.

    Args:
        instructions: Instructions to add to context
        wait_for_playback: If True, wait for playback to complete
        handle: Utterance handle
        frames: Optional video frames for vision
    """
    # Track the handle so interrupt() can cancel this utterance.
    self._current_utterance_handle = handle
    if self.config.is_realtime:
        if isinstance(self.llm, RealtimeLLMAdapter):
            self.llm.current_utterance = handle
            # Vision path: only if frames were supplied and the model supports them.
            if frames and hasattr(self.llm, 'send_message_with_frames'):
                # Hold the vision lock so on_video_delta drops frames meanwhile.
                async with self._vision_lock:
                    await self.llm.send_message_with_frames(instructions, frames)
            else:
                await self.llm.send_text_message(instructions)
    else:
        if self.orchestrator:
            await self.orchestrator.reply_with_context(instructions, wait_for_playback, handle, frames)
        else:
            # No pipeline to run — resolve the handle so callers don't hang.
            logger.warning("No orchestrator available")
            handle._mark_done()
Args
instructions- Instructions to add to context
wait_for_playback- If True, wait for playback to complete
handle- Utterance handle
frames- Optional video frames for vision
async def send_message(self,
message: str,
handle: UtteranceHandle) ‑> None-
Expand source code
async def send_message(self, message: str, handle: UtteranceHandle) -> None:
    """
    Send a message to the pipeline.

    Args:
        message: Message text to send
        handle: Utterance handle to track
    """
    self._current_utterance_handle = handle
    if not self.config.is_realtime:
        # Cascading path: the orchestrator speaks the message.
        if self.orchestrator:
            await self.orchestrator.say(message, handle)
        else:
            logger.warning("No orchestrator available")
            handle._mark_done()
        return
    if isinstance(self.llm, RealtimeLLMAdapter):
        self.llm.current_utterance = handle
        try:
            await self.llm.send_message(message)
        except Exception as e:
            # Resolve the handle on failure so callers don't wait forever.
            logger.error(f"Error sending message: {e}")
            handle._mark_done()
Args
message- Message text to send
handle- Utterance handle to track
async def send_text_message(self, message: str) ‑> None-
Expand source code
async def send_text_message(self, message: str) -> None:
    """
    Send a text message (for A2A or text-only scenarios).

    Args:
        message: Text message to send
    """
    if not self.config.is_realtime:
        if self.orchestrator:
            await self.orchestrator.process_text(message)
        return
    # Realtime path: only the adapter accepts text injection.
    if isinstance(self.llm, RealtimeLLMAdapter):
        await self.llm.send_text_message(message)
Args
message- Text message to send
def set_agent(self, agent: Any) ‑> None-
Expand source code
def set_agent(self, agent: Any) -> None:
    """Associate an agent with this pipeline and configure the orchestrator based on the pipeline mode.

    Behaviour depends on realtime_mode:
    - HYBRID_STT / LLM_ONLY: build an orchestrator for external STT only.
    - HYBRID_TTS: wire external TTS (SpeechGeneration) to the realtime model.
    - FULL_S2S: just attach the agent to the realtime adapter.
    - non-realtime: build a full cascading orchestrator and re-emit its events.
    """
    self.agent = agent
    # Configure metrics with pipeline info
    metrics_collector.configure_pipeline(
        pipeline_mode=self.config.pipeline_mode,
        realtime_mode=self.config.realtime_mode,
        active_components=self.config.active_components,
    )
    metrics_collector.set_eou_config(self.eou_config)
    metrics_collector.set_interrupt_config(self.interrupt_config)
    if self.config.realtime_mode in (RealtimeMode.HYBRID_STT, RealtimeMode.LLM_ONLY):
        logger.info(f"Creating orchestrator for {self.config.realtime_mode.value} mode")
        # llm/tts/avatar stay None: the realtime model handles those stages.
        self.orchestrator = PipelineOrchestrator(
            agent=agent,
            stt=self.stt,
            llm=None,
            tts=None,
            vad=self.vad,
            turn_detector=self.turn_detector,
            denoise=self.denoise,
            avatar=None,
            mode=self.eou_config.mode,
            min_speech_wait_timeout=self.eou_config.min_max_speech_wait_timeout,
            interrupt_mode=self.interrupt_config.mode,
            interrupt_min_duration=self.interrupt_config.interrupt_min_duration,
            interrupt_min_words=self.interrupt_config.interrupt_min_words,
            false_interrupt_pause_duration=self.interrupt_config.false_interrupt_pause_duration,
            resume_on_false_interrupt=self.interrupt_config.resume_on_false_interrupt,
            conversational_graph=None,
            max_context_items=self.max_context_items,
            voice_mail_detector=self.voice_mail_detector,
            hooks=self.hooks,
        )
        self.orchestrator.on("transcript_ready", self._wrap_async(self._on_transcript_ready_hybrid_stt))
        logger.info("Registered hybrid_stt event listener on orchestrator")
        if isinstance(self.llm, RealtimeLLMAdapter):
            self.llm.set_agent(agent)
    elif self.config.realtime_mode == RealtimeMode.HYBRID_TTS:
        logger.info("Setting up hybrid_tts mode: realtime STT+LLM + external TTS")
        # Mute the realtime model's own audio output; external TTS speaks.
        if hasattr(self._realtime_model, 'audio_track'):
            self._realtime_model.audio_track = None
            logger.info("Disconnected realtime model audio track (external TTS will be used)")
        if self.tts:
            self.speech_generation = SpeechGeneration(
                agent=agent,
                tts=self.tts,
                avatar=self.avatar,
                hooks=self.hooks,
            )
        # Guard flag: set_agent may be called again (e.g. on pipeline change);
        # register the transcription listener only once.
        if self._realtime_model and not hasattr(self, '_hybrid_tts_listeners_registered'):
            self._hybrid_tts_listeners_registered = True
            self._realtime_model.on("realtime_model_transcription", self._wrap_async(self._on_realtime_transcription_hybrid_tts))
            logger.info("Registered hybrid_tts event listener for realtime_model_transcription")
        if isinstance(self.llm, RealtimeLLMAdapter):
            self.llm.set_agent(agent)
    elif self.config.realtime_mode == RealtimeMode.FULL_S2S:
        if isinstance(self.llm, RealtimeLLMAdapter):
            self.llm.set_agent(agent)
    elif not self.config.is_realtime:
        # Cascading pipeline: compile the optional conversation graph first.
        if self.conversational_graph:
            self.conversational_graph.compile()
        self.orchestrator = PipelineOrchestrator(
            agent=agent,
            stt=self.stt,
            llm=self.llm,
            tts=self.tts,
            vad=self.vad,
            turn_detector=self.turn_detector,
            denoise=self.denoise,
            avatar=self.avatar,
            mode=self.eou_config.mode,
            min_speech_wait_timeout=self.eou_config.min_max_speech_wait_timeout,
            interrupt_mode=self.interrupt_config.mode,
            interrupt_min_duration=self.interrupt_config.interrupt_min_duration,
            interrupt_min_words=self.interrupt_config.interrupt_min_words,
            false_interrupt_pause_duration=self.interrupt_config.false_interrupt_pause_duration,
            resume_on_false_interrupt=self.interrupt_config.resume_on_false_interrupt,
            conversational_graph=self.conversational_graph,
            max_context_items=self.max_context_items,
            voice_mail_detector=self.voice_mail_detector,
            hooks=self.hooks,
        )
        # Re-emit orchestrator events on the pipeline for external listeners.
        self.orchestrator.on("transcript_ready", lambda data: self.emit("transcript_ready", data))
        self.orchestrator.on("content_generated", lambda data: self.emit("content_generated", data))
        self.orchestrator.on("synthesis_complete", lambda data: self.emit("synthesis_complete", data))
        self.orchestrator.on("voicemail_result", lambda data: self.emit("voicemail_result", data))
def set_voice_mail_detector(self,
detector: VoiceMailDetector | None) ‑> None-
Expand source code
def set_voice_mail_detector(self, detector: VoiceMailDetector | None) -> None:
    """Set or replace the voicemail detector on the pipeline and its orchestrator."""
    self.voice_mail_detector = detector
    # Propagate so a running orchestrator picks up the change immediately.
    if self.orchestrator:
        self.orchestrator.set_voice_mail_detector(detector)
def set_wake_up_callback(self, callback: Callable[[], None]) ‑> None-
Expand source code
def set_wake_up_callback(self, callback: Callable[[], None]) -> None:
    """Set a callback to be invoked when user speech is first detected."""
    # Stored only; invocation happens from the speech-detection path.
    self._wake_up_callback = callback
async def start(self, **kwargs: Any) ‑> None-
Expand source code
async def start(self, **kwargs: Any) -> None:
    """
    Start the pipeline processing.

    Connects the realtime model and registers its speech/transcription
    listeners (realtime modes), or starts the orchestrator (cascading mode).

    Args:
        **kwargs: Additional arguments for pipeline configuration
    """
    logger.info(
        f"Starting pipeline | mode={self.config.pipeline_mode.value} "
        f"| realtime={self.config.realtime_mode.value if self.config.realtime_mode else 'none'} "
        f"| components={self.config.component_names}"
    )
    if self.config.is_realtime:
        if self._realtime_model:
            await self._realtime_model.connect()
        if isinstance(self.llm, RealtimeLLMAdapter):
            # Wire realtime speech events into the pipeline's handlers.
            # Sync handler for speech start; async handlers wrapped in tasks.
            self.llm.on_user_speech_started(lambda data: self._on_user_speech_started_realtime(data))
            self.llm.on_user_speech_ended(lambda data: asyncio.create_task(self._on_user_speech_ended_realtime(data)))
            self.llm.on_agent_speech_started(lambda data: asyncio.create_task(self._on_agent_speech_started_realtime(data)))
            # self.llm.on_agent_speech_ended(lambda data: self._on_agent_speech_ended_realtime(data))
            self.llm.on_transcription(self._on_realtime_transcription)
        # hybrid_stt additionally runs an orchestrator for the external STT.
        if self.config.realtime_mode == RealtimeMode.HYBRID_STT and self.orchestrator:
            await self.orchestrator.start()
            logger.info("Started orchestrator for hybrid_stt mode")
    else:
        if self.orchestrator:
            await self.orchestrator.start()
Args
**kwargs- Additional arguments for pipeline configuration
Inherited members
class PipelineComponent (*args, **kwds)-
Expand source code
@enum.unique
class PipelineComponent(enum.Enum):
    """Identifiers for each component slot in the pipeline."""

    # Cascading-pipeline stages.
    STT = "stt"
    LLM = "llm"
    TTS = "tts"
    VAD = "vad"
    TURN_DETECTOR = "turn_detector"
    # Optional extras usable in any mode.
    AVATAR = "avatar"
    DENOISE = "denoise"
    # Speech-to-speech model slot (realtime modes).
    REALTIME_MODEL = "realtime_model"
Ancestors
- enum.Enum
Class variables
var AVATARvar DENOISEvar LLMvar REALTIME_MODELvar STTvar TTSvar TURN_DETECTORvar VAD
class PipelineConfig (pipeline_mode: PipelineMode,
realtime_mode: RealtimeMode | None,
is_realtime: bool,
active_components: frozenset[PipelineComponent])-
Expand source code
@dataclass(frozen=True)
class PipelineConfig:
    """
    Immutable snapshot of the pipeline's detected configuration.

    Computed once during Pipeline.__init__ and accessible everywhere
    via pipeline.config.
    """

    pipeline_mode: PipelineMode
    realtime_mode: RealtimeMode | None
    is_realtime: bool
    active_components: frozenset[PipelineComponent]

    def has_component(self, component: PipelineComponent) -> bool:
        """Check whether a specific component is present."""
        return component in self.active_components

    @property
    def component_names(self) -> list[str]:
        """Return sorted list of active component value strings (for metrics/logging)."""
        names = [component.value for component in self.active_components]
        names.sort()
        return names
Instance variables
var active_components : frozenset[PipelineComponent]prop component_names : list[str]-
Expand source code
@property def component_names(self) -> list[str]: """Return sorted list of active component value strings (for metrics/logging).""" return sorted(c.value for c in self.active_components)Return sorted list of active component value strings (for metrics/logging).
var is_realtime : boolvar pipeline_mode : PipelineModevar realtime_mode : RealtimeMode | None
Methods
def has_component(self,
component: PipelineComponent) ‑> bool-
Expand source code
def has_component(self, component: PipelineComponent) -> bool: """Check whether a specific component is present.""" return component in self.active_componentsCheck whether a specific component is present.
class PipelineMode (*args, **kwds)-
Expand source code
@enum.unique
class PipelineMode(enum.Enum):
    """The overall pipeline architecture mode based on which components are present."""

    # Speech-to-speech model drives the whole conversation.
    REALTIME = "realtime"
    # Classic STT -> LLM -> TTS chain.
    FULL_CASCADING = "full_cascading"
    # Partial component sets.
    LLM_TTS_ONLY = "llm_tts_only"
    STT_LLM_ONLY = "stt_llm_only"
    LLM_ONLY = "llm_only"
    STT_ONLY = "stt_only"
    TTS_ONLY = "tts_only"
    STT_TTS_ONLY = "stt_tts_only"
    # Realtime model mixed with external components.
    HYBRID = "hybrid"
    PARTIAL_CASCADING = "partial_cascading"
Ancestors
- enum.Enum
Class variables
var FULL_CASCADINGvar HYBRIDvar LLM_ONLYvar LLM_TTS_ONLYvar PARTIAL_CASCADINGvar REALTIMEvar STT_LLM_ONLYvar STT_ONLYvar STT_TTS_ONLYvar TTS_ONLY
class PlaygroundManager (ctx)-
Expand source code
class PlaygroundManager:
    """Manages publishing agent metrics to the playground via the room's PubSub channel."""

    def __init__(self, ctx):
        # Keep a back-reference on the job context so other components can find us.
        self.job_context = ctx
        self.job_context.playground_manager = self

    def send_cascading_metrics(self, metrics: dict, full_turn_data: bool = False):
        """Sends cascading metrics to the playground.

        Args:
            metrics (dict): The metrics to send.
            full_turn_data (bool): Whether to send full turn data.
        """
        self._publish("cascading", metrics, full_turn_data)

    def send_realtime_metrics(self, metrics: dict, full_turn_data: bool = False):
        """Sends realtime metrics to the playground.

        Args:
            metrics (dict): The metrics to send.
            full_turn_data (bool): Whether to send full turn data.
        """
        self._publish("realtime", metrics, full_turn_data)

    def _publish(self, metrics_type: str, metrics, full_turn_data: bool) -> None:
        """Serialize metrics and publish them on the AGENT_METRICS PubSub topic.

        Shared implementation for the cascading and realtime entry points;
        previously duplicated, and the realtime path silently dropped metrics
        when no room was available while the cascading path logged — both now
        log consistently.
        """
        if full_turn_data:
            # Full-turn payloads arrive as dataclass instances; convert first.
            metrics = asdict(metrics)
        metrics = json.dumps(metrics)
        publish_config = PubSubPublishConfig(
            topic="AGENT_METRICS",
            message={"type": metrics_type, "metrics": metrics, "full_turn_data": full_turn_data},
        )
        if self.job_context.room:
            # Fire-and-forget: publishing must not block the metrics producer.
            asyncio.create_task(self.job_context.room.publish_to_pubsub(publish_config))
        else:
            logger.debug(f"Cannot send {metrics_type} metrics: room is not available")
Methods
def send_cascading_metrics(self, metrics: dict, full_turn_data: bool = False)-
Expand source code
def send_cascading_metrics(self, metrics: dict, full_turn_data: bool = False): """Sends cascading metrics to the playground. Args: metrics (dict): The metrics to send. full_turn_data (bool): Whether to send full turn data. """ if full_turn_data: metrics = asdict(metrics) metrics = json.dumps(metrics) publish_config = PubSubPublishConfig( topic="AGENT_METRICS", message={"type": "cascading", "metrics": metrics, "full_turn_data": full_turn_data} ) if self.job_context.room: asyncio.create_task(self.job_context.room.publish_to_pubsub(publish_config)) else: logger.debug("Cannot send cascading metrics: room is not available")Sends cascading metrics to the playground.
Args
metrics:dict- The metrics to send.
full_turn_data:bool- Whether to send full turn data.
def send_realtime_metrics(self, metrics: dict, full_turn_data: bool = False)-
Expand source code
def send_realtime_metrics(self, metrics: dict, full_turn_data: bool = False): """Sends realtime metrics to the playground. Args: metrics (dict): The metrics to send. full_turn_data (bool): Whether to send full turn data. """ if full_turn_data: metrics = asdict(metrics) metrics = json.dumps(metrics) publish_config = PubSubPublishConfig( topic="AGENT_METRICS", message={"type": "realtime", "metrics": metrics, "full_turn_data": full_turn_data} ) if self.job_context.room: asyncio.create_task(self.job_context.room.publish_to_pubsub(publish_config))Sends realtime metrics to the playground.
Args
metrics:dict- The metrics to send.
full_turn_data:bool- Whether to send full turn data.
class ProcessResource (resource_id: str, config: Dict[str, Any])-
Expand source code
class ProcessResource(BaseResource):
    """
    Process-based resource for task execution.

    Uses multiprocessing to create isolated processes for task execution.
    Communication with the child uses three queues: task_queue (parent ->
    child work items), result_queue (child -> parent results), and
    control_queue (ready/shutdown signalling in both directions).
    """

    def __init__(self, resource_id: str, config: Dict[str, Any]):
        super().__init__(resource_id, config)
        # Populated lazily by _initialize_impl.
        self.process: Optional[Process] = None
        self.task_queue: Optional[Queue] = None
        self.result_queue: Optional[Queue] = None
        self.control_queue: Optional[Queue] = None
        # Set once the child signals "ready" on the control queue.
        self._process_ready = False

    @property
    def resource_type(self) -> ResourceType:
        return ResourceType.PROCESS

    async def _initialize_impl(self) -> None:
        """Initialize the process resource: spawn the worker and await its ready signal."""
        # Create queues for communication
        self.task_queue = Queue()
        self.result_queue = Queue()
        self.control_queue = Queue()
        # Start the process
        self.process = Process(
            target=self._process_worker,
            args=(
                self.resource_id,
                self.task_queue,
                self.result_queue,
                self.control_queue,
                self.config,
            ),
            daemon=True,
        )
        self.process.start()
        child_pid = self.process.pid  # NOTE(review): unused local; kept as-is
        logger.info(
            f"New process started | resource_id={self.resource_id} | pid={self.process.pid}"
        )
        # Wait for process to be ready (polling loop; 100ms granularity)
        timeout = self.config.get("initialize_timeout", 10.0)
        start_time = time.time()
        while not self._process_ready and (time.time() - start_time) < timeout:
            try:
                # Check for ready signal
                if not self.control_queue.empty():
                    message = self.control_queue.get_nowait()
                    if message.get("type") == "ready":
                        self._process_ready = True
                        break
                await asyncio.sleep(0.1)
            except Exception as e:
                logger.warning(f"Error checking process readiness: {e}")
        if not self._process_ready:
            raise TimeoutError(
                f"Process {self.resource_id} failed to initialize within {timeout}s"
            )

    async def _execute_task_impl(
        self, task_id: str, config, entrypoint: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Execute task in the process.

        Enqueues the task and polls result_queue until a matching result
        arrives, the child exits, or config.timeout elapses.

        Raises:
            RuntimeError: if the process is not ready or the task errored.
            TimeoutError: if no result arrives within config.timeout seconds.
        """
        if not self._process_ready:
            raise RuntimeError(f"Process {self.resource_id} is not ready")
        # Send task to process
        # Note: entrypoint and args must be picklable
        task_data = {
            "task_id": task_id,
            "config": config,
            "entrypoint": entrypoint,
            "args": args,
            "kwargs": kwargs,
        }
        self.task_queue.put(task_data)
        # Wait for result
        timeout = config.timeout
        start_time = time.time()
        while (time.time() - start_time) < timeout:
            try:
                # Check if child process is still alive
                if self.process and not self.process.is_alive():
                    logger.info(
                        f"Process {self.resource_id} exited (pid={self.process.pid}), "
                        f"treating task {task_id} as completed"
                    )
                    # Drain any remaining results from the queue
                    try:
                        if not self.result_queue.empty():
                            result_data = self.result_queue.get_nowait()
                            if result_data.get("status") == "success":
                                return result_data.get("result")
                    except Exception:
                        pass
                    return None  # Process exited cleanly after cleanup
                if not self.result_queue.empty():
                    result_data = self.result_queue.get_nowait()
                    # NOTE(review): results for other task_ids are discarded here.
                    if result_data.get("task_id") == task_id:
                        if result_data.get("status") == "success":
                            return result_data.get("result")
                        else:
                            raise RuntimeError(
                                result_data.get("error", "Unknown error")
                            )
            except RuntimeError:
                # Task failure must propagate, not be swallowed below.
                raise
            except Exception as e:
                logger.warning(f"Error checking task result: {e}")
            await asyncio.sleep(0.1)
        raise TimeoutError(f"Task {task_id} timed out after {timeout}s")

    async def _shutdown_impl(self) -> None:
        """Shutdown the process resource: graceful signal, then terminate, then kill."""
        if self.process and self.process.is_alive():
            # Send shutdown signal
            self.control_queue.put({"type": "shutdown"})
            # Wait for graceful shutdown
            timeout = self.config.get("close_timeout", 60.0)
            start_time = time.time()
            while self.process.is_alive() and (time.time() - start_time) < timeout:
                await asyncio.sleep(0.1)
            # Force terminate if still alive
            if self.process.is_alive():
                logger.warning(f"Force terminating process {self.resource_id}")
                self.process.terminate()
                self.process.join(timeout=5.0)
                if self.process.is_alive():
                    self.process.kill()

    @staticmethod
    def _process_worker(
        resource_id: str,
        task_queue: Queue,
        result_queue: Queue,
        control_queue: Queue,
        config: Dict[str, Any],
    ):
        """Worker function that runs in the process.

        Signals readiness, then polls for shutdown messages and tasks,
        executing each task (sync or async) and posting its outcome to
        result_queue.
        """
        try:
            logger.info(f"Process worker {resource_id} started")
            # Signal ready
            control_queue.put({"type": "ready"})
            # Main task processing loop
            while True:
                try:
                    # Check for shutdown signal
                    if not control_queue.empty():
                        message = control_queue.get_nowait()
                        if message.get("type") == "shutdown":
                            break
                    # Check for tasks
                    if not task_queue.empty():
                        task_data = task_queue.get_nowait()
                        task_id = task_data["task_id"]
                        entrypoint = task_data["entrypoint"]
                        args = task_data.get("args", ())
                        kwargs = task_data.get("kwargs", {})
                        logger.info(
                            f"Executing task {task_id} on resource {resource_id}"
                        )
                        try:
                            # Execute the task
                            if asyncio.iscoroutinefunction(entrypoint):
                                # Use asyncio.run() which properly handles cleanup:
                                # 1. Creates a fresh event loop
                                # 2. Runs the coroutine to completion
                                # 3. Cancels ALL remaining tasks
                                # 4. Closes the loop
                                result = asyncio.run(entrypoint(*args, **kwargs))
                            else:
                                result = entrypoint(*args, **kwargs)
                            result_queue.put(
                                {
                                    "task_id": task_id,
                                    "status": "success",
                                    "result": result,
                                }
                            )
                        except Exception as e:
                            logger.error(f"Task execution failed: {e}")
                            result_queue.put(
                                {"task_id": task_id, "status": "error", "error": str(e)}
                            )
                    else:
                        # Idle: avoid busy-spinning on empty queues.
                        time.sleep(0.1)
                except Exception as e:
                    logger.error(f"Error in process worker {resource_id}: {e}")
                    time.sleep(1.0)
            logger.info(f"Process worker {resource_id} shutting down")
        except Exception as e:
            logger.error(f"Fatal error in process worker {resource_id}: {e}")
Uses multiprocessing to create isolated processes for task execution.
Ancestors
- BaseResource
- abc.ABC
Inherited members
class RealtimeBaseModel-
Expand source code
class RealtimeBaseModel(EventEmitter[Union[BaseEventTypes, TEvent]], Generic[TEvent], ABC):
    """
    Base class for realtime models with event emission capabilities.

    Allows for extension with additional event types through TEvent.
    """

    def __init__(self) -> None:
        """Initialize the realtime model"""
        super().__init__()
        # Per-session state shared by all realtime model implementations.
        self.current_utterance: UtteranceHandle | None = None
        self.audio_track = None
        self.loop = None

    async def __aenter__(self) -> RealtimeBaseModel:
        """Async context manager entry"""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit"""
        await self.aclose()

    @abstractmethod
    async def aclose(self) -> None:
        """Cleanup resources - must be implemented by subclasses."""
        # Base behaviour: drop references; subclasses extend via super().
        self.audio_track = None
        self.loop = None

    async def cleanup(self) -> None:
        """Cleanup resources - calls aclose for compatibility"""
        await self.aclose()
Initialize the realtime model
Ancestors
- EventEmitter
- typing.Generic
- abc.ABC
Methods
async def aclose(self) ‑> None-
Expand source code
@abstractmethod async def aclose(self) -> None: """Cleanup resources - must be implemented by subclasses.""" self.audio_track = None self.loop = NoneCleanup resources - must be implemented by subclasses.
async def cleanup(self) ‑> None-
Expand source code
async def cleanup(self) -> None: """Cleanup resources - calls aclose for compatibility""" await self.aclose()Cleanup resources - calls aclose for compatibility
Inherited members
class RealtimeLLMAdapter (realtime_model: RealtimeBaseModel,
agent: Agent | None = None)-
Expand source code
class RealtimeLLMAdapter(EventEmitter):
    """Adapter exposing a RealtimeBaseModel through an LLM-compatible interface.

    This lets realtime models (like OpenAI Realtime API, Gemini Live) stand in
    for standard LLM components in the pipeline architecture.

    Key differences from standard LLMs:
    - Realtime models handle their own audio I/O (STT + TTS built-in)
    - They maintain their own conversation state
    - Function calling may work differently

    The wrapper mostly delegates to the underlying realtime model and provides
    adapter methods so the pipeline can treat it like an LLM.
    """

    # Event names re-emitted verbatim from the wrapped model onto this adapter.
    _FORWARDED_EVENTS = (
        "error",
        "user_speech_started",
        "user_speech_ended",
        "agent_speech_started",
        "agent_speech_ended",
        "realtime_model_transcription",
        "llm_text_output",
    )

    def __init__(self, realtime_model: RealtimeBaseModel, agent: Agent | None = None):
        super().__init__()
        self.realtime_model = realtime_model
        self.agent = agent
        self._is_realtime = True
        self.audio_track = None
        self.loop = None
        # Bubble model events up so pipeline code can listen on the adapter.
        # The default argument pins each event name (late-binding closures!).
        for event_name in self._FORWARDED_EVENTS:
            self.realtime_model.on(
                event_name, lambda data, _name=event_name: self.emit(_name, data)
            )

    def set_agent(self, agent: Agent) -> None:
        """Set the agent for this wrapper (and the wrapped model, if supported)."""
        self.agent = agent
        if hasattr(self.realtime_model, 'set_agent'):
            self.realtime_model.set_agent(agent)

    async def connect(self) -> None:
        """Connect the realtime model"""
        await self.realtime_model.connect()

    async def chat(
        self,
        context: ChatContext,
        tools: list[Any] | None = None,
        conversational_graph: Any | None = None,
        **kwargs
    ) -> AsyncIterator[ResponseChunk]:
        """Adapter method for LLM compatibility.

        For realtime models the chat method is mostly irrelevant: audio I/O is
        handled directly by the model, so this exists for interface
        compatibility and yields a single empty chunk.

        Args:
            context: Chat context (may be ignored by realtime models)
            tools: Available function tools
            conversational_graph: Optional conversational graph
            **kwargs: Additional arguments

        Yields:
            ResponseChunk objects (mostly empty for realtime models)
        """
        logger.info("RealtimeLLMAdapter.chat() called - realtime models handle I/O directly")
        yield ResponseChunk(content="", metadata={"realtime_mode": True}, role=ChatRole.ASSISTANT)

    async def handle_audio_input(self, audio_data: bytes) -> None:
        """Forward raw audio bytes to the realtime model."""
        await self.realtime_model.handle_audio_input(audio_data)

    async def handle_video_input(self, video_frame: Any) -> None:
        """Forward a video frame to the realtime model, if it supports video."""
        if not hasattr(self.realtime_model, 'handle_video_input'):
            logger.warning(f"Realtime model {type(self.realtime_model).__name__} does not support video input")
            return
        await self.realtime_model.handle_video_input(video_frame)

    async def send_message(self, message: str) -> None:
        """Send a text message to the realtime model."""
        await self.realtime_model.send_message(message)

    async def send_text_message(self, message: str) -> None:
        """Send a text-only message, falling back to send_message if needed."""
        if hasattr(self.realtime_model, 'send_text_message'):
            await self.realtime_model.send_text_message(message)
        else:
            await self.realtime_model.send_message(message)

    async def send_message_with_frames(self, message: str, frames: list[Any]) -> None:
        """Send a message with video frames (for vision-enabled models).

        Falls back to a plain text message when the model has no frame support.
        """
        if hasattr(self.realtime_model, 'send_message_with_frames'):
            await self.realtime_model.send_message_with_frames(message, frames)
        else:
            logger.warning(f"Realtime model {type(self.realtime_model).__name__} does not support frames")
            await self.send_message(message)

    async def interrupt(self) -> None:
        """Interrupt the realtime model's current response"""
        if hasattr(self.realtime_model, 'interrupt'):
            await self.realtime_model.interrupt()

    async def cancel_current_generation(self) -> None:
        """Cancel the current generation (LLM compatibility method)"""
        await self.interrupt()

    def on_user_speech_started(self, callback) -> None:
        """Register callback for user speech started event"""
        self.realtime_model.on("user_speech_started", callback)

    def on_user_speech_ended(self, callback) -> None:
        """Register callback for user speech ended event"""
        self.realtime_model.on("user_speech_ended", callback)

    def on_agent_speech_started(self, callback) -> None:
        """Register callback for agent speech started event"""
        self.realtime_model.on("agent_speech_started", callback)

    def on_agent_speech_ended(self, callback) -> None:
        """Register callback for agent speech ended event"""
        self.realtime_model.on("agent_speech_ended", callback)

    def on_transcription(self, callback) -> None:
        """Register callback for transcription events"""
        self.realtime_model.on("realtime_model_transcription", callback)

    @property
    def current_utterance(self):
        """Current utterance handle of the wrapped model, if any."""
        return getattr(self.realtime_model, 'current_utterance', None)

    @current_utterance.setter
    def current_utterance(self, value):
        # Only forward the handle when the model actually tracks utterances.
        if hasattr(self.realtime_model, 'current_utterance'):
            self.realtime_model.current_utterance = value

    async def aclose(self) -> None:
        """Close and cleanup the realtime model"""
        await self.realtime_model.aclose()

    async def cleanup(self) -> None:
        """Cleanup resources"""
        await self.aclose()
LLM-compatible interface.
This allows realtime models (like OpenAI Realtime API, Gemini Live) to be used in place of standard LLM components in the pipeline architecture.
Key differences from standard LLMs: - Realtime models handle their own audio I/O (STT + TTS built-in) - They maintain their own conversation state - Function calling may work differently
This wrapper primarily delegates to the underlying realtime model and provides adapter methods to make it look like an LLM from the pipeline's perspective.
Ancestors
- EventEmitter
- typing.Generic
Instance variables
prop current_utterance-
Expand source code
@property def current_utterance(self): """Get current utterance handle""" return getattr(self.realtime_model, 'current_utterance', None)Get current utterance handle
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None: """Close and cleanup the realtime model""" await self.realtime_model.aclose()Close and cleanup the realtime model
async def cancel_current_generation(self) ‑> None-
Expand source code
async def cancel_current_generation(self) -> None: """Cancel the current generation (LLM compatibility method)""" await self.interrupt()Cancel the current generation (LLM compatibility method)
async def chat(self,
context: ChatContext,
tools: list[Any] | None = None,
conversational_graph: Any | None = None,
**kwargs) ‑> AsyncIterator[ResponseChunk]-
Expand source code
async def chat( self, context: ChatContext, tools: list[Any] | None = None, conversational_graph: Any | None = None, **kwargs ) -> AsyncIterator[ResponseChunk]: """ Adapter method for LLM compatibility. For realtime models, the chat method is less relevant since they handle audio I/O directly. This method exists for interface compatibility but yields minimal content since the actual response happens through audio. Args: context: Chat context (may be ignored by realtime models) tools: Available function tools conversational_graph: Optional conversational graph **kwargs: Additional arguments Yields: ResponseChunk objects (mostly empty for realtime models) """ logger.info("RealtimeLLMAdapter.chat() called - realtime models handle I/O directly") async def empty_gen(): yield ResponseChunk(content="", metadata={"realtime_mode": True}, role=ChatRole.ASSISTANT) async for chunk in empty_gen(): yield chunkAdapter method for LLM compatibility.
For realtime models, the chat method is less relevant since they handle audio I/O directly. This method exists for interface compatibility but yields minimal content since the actual response happens through audio.
Args
context- Chat context (may be ignored by realtime models)
tools- Available function tools
conversational_graph- Optional conversational graph
**kwargs- Additional arguments
Yields
ResponseChunk objects (mostly empty for realtime models)
async def cleanup(self) ‑> None-
Expand source code
async def cleanup(self) -> None: """Cleanup resources""" await self.aclose()Cleanup resources
async def connect(self) ‑> None-
Expand source code
async def connect(self) -> None: """Connect the realtime model""" await self.realtime_model.connect()Connect the realtime model
async def handle_audio_input(self, audio_data: bytes) ‑> None-
Expand source code
async def handle_audio_input(self, audio_data: bytes) -> None: """ Process incoming audio through the realtime model. Args: audio_data: Raw audio bytes """ await self.realtime_model.handle_audio_input(audio_data)Process incoming audio through the realtime model.
Args
audio_data- Raw audio bytes
async def handle_video_input(self, video_frame: Any) ‑> None-
Expand source code
async def handle_video_input(self, video_frame: Any) -> None: """ Process incoming video through the realtime model (if supported). Args: video_frame: Video frame data """ if hasattr(self.realtime_model, 'handle_video_input'): await self.realtime_model.handle_video_input(video_frame) else: logger.warning(f"Realtime model {type(self.realtime_model).__name__} does not support video input")Process incoming video through the realtime model (if supported).
Args
video_frame- Video frame data
async def interrupt(self) ‑> None-
Expand source code
async def interrupt(self) -> None: """Interrupt the realtime model's current response""" if hasattr(self.realtime_model, 'interrupt'): await self.realtime_model.interrupt()Interrupt the realtime model's current response
def on_agent_speech_ended(self, callback) ‑> None-
Expand source code
def on_agent_speech_ended(self, callback) -> None: """Register callback for agent speech ended event""" self.realtime_model.on("agent_speech_ended", callback)Register callback for agent speech ended event
def on_agent_speech_started(self, callback) ‑> None-
Expand source code
def on_agent_speech_started(self, callback) -> None: """Register callback for agent speech started event""" self.realtime_model.on("agent_speech_started", callback)Register callback for agent speech started event
def on_transcription(self, callback) ‑> None-
Expand source code
def on_transcription(self, callback) -> None: """Register callback for transcription events""" self.realtime_model.on("realtime_model_transcription", callback)Register callback for transcription events
def on_user_speech_ended(self, callback) ‑> None-
Expand source code
def on_user_speech_ended(self, callback) -> None: """Register callback for user speech ended event""" self.realtime_model.on("user_speech_ended", callback)Register callback for user speech ended event
def on_user_speech_started(self, callback) ‑> None-
Expand source code
def on_user_speech_started(self, callback) -> None: """Register callback for user speech started event""" self.realtime_model.on("user_speech_started", callback)Register callback for user speech started event
async def send_message(self, message: str) ‑> None-
Expand source code
async def send_message(self, message: str) -> None: """ Send a text message to the realtime model. Args: message: Text message to send """ await self.realtime_model.send_message(message)Send a text message to the realtime model.
Args
message- Text message to send
async def send_message_with_frames(self, message: str, frames: list[Any]) ‑> None-
Expand source code
async def send_message_with_frames(self, message: str, frames: list[Any]) -> None: """ Send a message with video frames (for vision-enabled models). Args: message: Text message frames: List of video frames """ if hasattr(self.realtime_model, 'send_message_with_frames'): await self.realtime_model.send_message_with_frames(message, frames) else: logger.warning(f"Realtime model {type(self.realtime_model).__name__} does not support frames") await self.send_message(message)Send a message with video frames (for vision-enabled models).
Args
message- Text message
frames- List of video frames
async def send_text_message(self, message: str) ‑> None-
Expand source code
async def send_text_message(self, message: str) -> None: """ Send a text-only message (for models supporting text modality). Args: message: Text message to send """ if hasattr(self.realtime_model, 'send_text_message'): await self.realtime_model.send_text_message(message) else: await self.realtime_model.send_message(message)Send a text-only message (for models supporting text modality).
Args
message- Text message to send
def set_agent(self, agent: Agent) ‑> None-
Expand source code
def set_agent(self, agent: Agent) -> None: """Set the agent for this wrapper""" self.agent = agent if hasattr(self.realtime_model, 'set_agent'): self.realtime_model.set_agent(agent)Set the agent for this wrapper
Inherited members
class RealtimeMode (*args, **kwds)-
Expand source code
@enum.unique
class RealtimeMode(enum.Enum):
    """The realtime sub-mode when a RealtimeBaseModel is used as the LLM."""

    # Full speech-to-speech: the model owns the entire audio path.
    FULL_S2S = "full_s2s"
    # Hybrid modes: only one half of the audio path goes through the model.
    HYBRID_STT = "hybrid_stt"
    HYBRID_TTS = "hybrid_tts"
    # Text in, text out only.
    LLM_ONLY = "llm_only"
Ancestors
- enum.Enum
Class variables
var FULL_S2Svar HYBRID_STTvar HYBRID_TTSvar LLM_ONLY
class RecordingOptions (video: bool = False, screen_share: bool = False)-
Expand source code
@dataclass
class RecordingOptions:
    """Extra recording when RoomOptions.recording is True.

    Audio is always recorded when recording=True (track API, kind=audio).
    Set video and/or screen_share here only when you need them.
    screen_share=True requires RoomOptions.vision=True.
    """

    # Also record video tracks.
    video: bool = False
    # Also record the shared screen (requires RoomOptions.vision=True).
    screen_share: bool = False
Audio is always recorded when recording=True (track API, kind=audio). Set video and/or screen_share here only when you need them. screen_share=True requires RoomOptions.vision=True.
Instance variables
var video : bool
class ResizeOptions (width: int, height: int)-
Expand source code
@dataclass
class ResizeOptions:
    """Configuration for resizing av.VideoFrame during the process of
    encoding to a standard image format."""

    # Target width in pixels for the resized image.
    width: int
    # Target height in pixels for the resized image.
    height: int
Instance variables
var height : int-
The target height for resizing the image.
var width : int-
The target width for resizing the image.
class ResourceConfig (resource_type: ResourceType = ResourceType.PROCESS,
num_idle_resources: int = 2,
max_resources: int = 10,
initialize_timeout: float = 10.0,
close_timeout: float = 60.0,
memory_warn_mb: float = 500.0,
memory_limit_mb: float = 0.0,
ping_interval: float = 30.0,
health_check_interval: float = 5.0,
load_threshold: float = 0.75,
max_concurrent_tasks: int = 1,
executor_type: ExecutorType = ExecutorType.PROCESS,
use_dedicated_inference_process: bool = True,
inference_process_timeout: float = 30.0,
inference_memory_warn_mb: float = 1000.0)-
Expand source code
@dataclass
class ResourceConfig:
    """Configuration for resource management."""

    # --- Resource type and count ---
    resource_type: ResourceType = ResourceType.PROCESS
    num_idle_resources: int = 2
    max_resources: int = 10

    # --- Timeouts (seconds) ---
    initialize_timeout: float = 10.0
    close_timeout: float = 60.0

    # --- Memory management (MB) ---
    memory_warn_mb: float = 500.0
    # NOTE(review): 0.0 presumably means "no hard limit" — confirm with pool code.
    memory_limit_mb: float = 0.0

    # --- Health monitoring (seconds) ---
    ping_interval: float = 30.0
    health_check_interval: float = 5.0

    # --- Load balancing ---
    load_threshold: float = 0.75
    max_concurrent_tasks: int = 1

    # --- Platform-specific ---
    executor_type: ExecutorType = _default_executor_type

    # --- Legacy IPC compatibility ---
    # Enable the dedicated inference process.
    use_dedicated_inference_process: bool = True
    # Longer timeout for AI model loading.
    inference_process_timeout: float = 30.0
    # Higher memory-warning threshold for AI models.
    inference_memory_warn_mb: float = 1000.0
Instance variables
var close_timeout : floatvar executor_type : ExecutorTypevar health_check_interval : floatvar inference_memory_warn_mb : floatvar inference_process_timeout : floatvar initialize_timeout : floatvar load_threshold : floatvar max_concurrent_tasks : intvar max_resources : intvar memory_limit_mb : floatvar memory_warn_mb : floatvar num_idle_resources : intvar ping_interval : floatvar resource_type : ResourceTypevar use_dedicated_inference_process : bool
class ResourceInfo (resource_id: str,
resource_type: ResourceType,
status: ResourceStatus,
current_load: float = 0.0,
memory_usage_mb: float = 0.0,
cpu_usage_percent: float = 0.0,
active_tasks: int = 0,
total_tasks_processed: int = 0,
last_heartbeat: float = 0.0,
metadata: Dict[str, Any] = <factory>)-
Expand source code
@dataclass
class ResourceInfo:
    """Information about a resource."""

    # Identity
    resource_id: str
    resource_type: ResourceType
    status: ResourceStatus

    # Runtime metrics
    current_load: float = 0.0
    memory_usage_mb: float = 0.0
    cpu_usage_percent: float = 0.0
    active_tasks: int = 0
    total_tasks_processed: int = 0
    last_heartbeat: float = 0.0

    # Resource-specific metadata
    metadata: Dict[str, Any] = field(default_factory=dict)
Instance variables
var active_tasks : intvar cpu_usage_percent : floatvar current_load : floatvar last_heartbeat : floatvar memory_usage_mb : floatvar metadata : Dict[str, Any]var resource_id : strvar resource_type : ResourceTypevar status : ResourceStatusvar total_tasks_processed : int
class ResourceManager (config: ResourceConfig)-
Expand source code
class ResourceManager:
    """
    Manages resources for task execution.

    This class handles:
    - Resource creation and lifecycle management
    - Load balancing across resources
    - Health monitoring and recovery
    - Resource allocation for tasks
    - Dedicated inference process management (legacy IPC compatibility)
    """

    def __init__(self, config: ResourceConfig):
        self.config = config
        # General-purpose job resources (processes/threads).
        self.resources: List[BaseResource] = []
        self._shutdown = False
        self._health_check_task: Optional[asyncio.Task] = None
        self._resource_creation_task: Optional[asyncio.Task] = None

        # Dedicated inference resource (legacy IPC compatibility)
        self.dedicated_inference_resource: Optional[DedicatedInferenceResource] = None

    async def start(self):
        """Start the resource manager: background loops and initial resources."""
        logger.info("Starting resource manager")

        # Create dedicated inference resource if enabled
        if self.config.use_dedicated_inference_process:
            await self._create_dedicated_inference_resource()

        # Start health monitoring
        self._health_check_task = asyncio.create_task(self._health_check_loop())

        # Start resource creation / scale-down loop
        self._resource_creation_task = asyncio.create_task(
            self._resource_lifecycle_loop()
        )

        # Initialize initial resources
        await self._create_initial_resources()

        logger.info("Resource manager started")

    async def stop(self):
        """Stop the resource manager and shut down all managed resources."""
        logger.info("Stopping resource manager")
        self._shutdown = True

        # Cancel background tasks AND await them; cancelling without awaiting
        # can leave "Task was destroyed but it is pending" warnings at loop close.
        background = [
            t
            for t in (self._health_check_task, self._resource_creation_task)
            if t is not None
        ]
        for task in background:
            task.cancel()
        if background:
            await asyncio.gather(*background, return_exceptions=True)

        # Shutdown all resources (including the dedicated inference one) concurrently.
        shutdown_tasks = [resource.shutdown() for resource in self.resources]
        if self.dedicated_inference_resource:
            shutdown_tasks.append(self.dedicated_inference_resource.shutdown())

        if shutdown_tasks:
            await asyncio.gather(*shutdown_tasks, return_exceptions=True)

        logger.info("Resource manager stopped")

    async def _create_dedicated_inference_resource(self):
        """Create the dedicated inference resource (legacy IPC compatibility)."""
        logger.info("Creating dedicated inference resource")

        inference_config = {
            "inference_process_timeout": self.config.inference_process_timeout,
            "inference_memory_warn_mb": self.config.inference_memory_warn_mb,
            "ping_interval": self.config.ping_interval,
            "close_timeout": self.config.close_timeout,
        }

        self.dedicated_inference_resource = DedicatedInferenceResource(
            resource_id="dedicated-inference", config=inference_config
        )

        await self.dedicated_inference_resource.initialize()
        logger.info("Dedicated inference resource created")

    async def _create_initial_resources(self):
        """Create initial resources based on configuration."""
        initial_count = self.config.num_idle_resources
        logger.info(
            f"Creating {initial_count} initial {self.config.resource_type.value} resources"
        )

        for _ in range(initial_count):
            await self._create_resource(self.config.resource_type)

    async def _create_resource(self, resource_type: ResourceType) -> BaseResource:
        """Create, initialize, and register a new resource of the given type."""
        resource_id = f"{resource_type.value}-{uuid.uuid4().hex[:8]}"

        config = {
            "max_concurrent_tasks": self.config.max_concurrent_tasks,
            "initialize_timeout": self.config.initialize_timeout,
            "close_timeout": self.config.close_timeout,
            "health_check_interval": self.config.health_check_interval,
        }

        if resource_type == ResourceType.PROCESS:
            resource = ProcessResource(resource_id, config)
        elif resource_type == ResourceType.THREAD:
            resource = ThreadResource(resource_id, config)
        else:
            raise ValueError(f"Unsupported resource type: {resource_type}")

        # Initialize the resource
        await resource.initialize()

        # Add to resources list
        self.resources.append(resource)

        logger.info(f"Created {resource_type.value} resource: {resource_id}")
        return resource

    async def _resource_lifecycle_loop(self):
        """Background loop for managing resource lifecycle (creation and cleanup)."""
        # Wait a bit longer before starting the loop to allow initial
        # resources to stabilize.
        await asyncio.sleep(5.0)

        while not self._shutdown:
            try:
                available_resources = [r for r in self.resources if r.is_available]
                available_count = len(available_resources)
                total_count = len(self.resources)

                # SCALE UP: keep num_idle_resources warm, bounded by max_resources.
                if (
                    available_count < self.config.num_idle_resources
                    and total_count < self.config.max_resources
                ):
                    logger.info(
                        f"Scaling up: Creating additional {self.config.resource_type.value} resource. "
                        f"Available: {available_count}, Target Idle: {self.config.num_idle_resources}"
                    )
                    await self._create_resource(self.config.resource_type)

                # SCALE DOWN: remove excess idle resources.
                elif available_count > self.config.num_idle_resources:
                    excess_count = available_count - self.config.num_idle_resources
                    # Take from the front of the list: new resources are
                    # appended, so this removes the oldest idle ones first.
                    resources_to_remove = available_resources[:excess_count]
                    if resources_to_remove:
                        logger.info(
                            f"Scaling down: Removing {len(resources_to_remove)} excess idle resources. "
                            f"Available: {available_count}, Target Idle: {self.config.num_idle_resources}"
                        )
                        for resource in resources_to_remove:
                            if resource in self.resources:
                                self.resources.remove(resource)
                                # Shutdown in background to not block loop
                                asyncio.create_task(resource.shutdown())

                await asyncio.sleep(5.0)  # Check every 5 seconds
            except Exception as e:
                logger.error(f"Error in resource lifecycle loop: {e}")
                await asyncio.sleep(5.0)

    async def _health_check_loop(self):
        """Background loop for health monitoring."""
        while not self._shutdown:
            try:
                # Copy list to avoid modification during iteration.
                for resource in self.resources[:]:
                    try:
                        is_healthy = await resource.health_check()
                        if not is_healthy:
                            logger.warning(
                                f"Unhealthy resource detected: {resource.resource_id}"
                            )
                            # Remove unhealthy resource
                            self.resources.remove(resource)
                            await resource.shutdown()

                            # Create replacement if needed
                            if len(self.resources) < self.config.num_idle_resources:
                                await self._create_resource(self.config.resource_type)
                    except Exception as e:
                        logger.error(
                            f"Health check failed for {resource.resource_id}: {e}"
                        )

                # Check dedicated inference resource
                if self.dedicated_inference_resource:
                    try:
                        is_healthy = (
                            await self.dedicated_inference_resource.health_check()
                        )
                        if not is_healthy:
                            logger.warning(
                                "Unhealthy dedicated inference resource detected"
                            )
                            # Recreate inference resource
                            await self.dedicated_inference_resource.shutdown()
                            await self._create_dedicated_inference_resource()
                    except Exception as e:
                        logger.error(
                            f"Health check failed for dedicated inference resource: {e}"
                        )

                await asyncio.sleep(self.config.health_check_interval)
            except Exception as e:
                logger.error(f"Error in health check loop: {e}")
                await asyncio.sleep(5.0)

    async def execute_task(
        self, task_config: TaskConfig, entrypoint: Callable, *args, **kwargs
    ) -> TaskResult:
        """Execute a task using an available resource.

        Inference tasks are routed to the dedicated inference resource when one
        exists; everything else goes to a general job resource.

        Raises:
            RuntimeError: when no resource is available for the task.
        """
        task_id = str(uuid.uuid4())

        # Route inference tasks to dedicated inference resource
        if (
            task_config.task_type == TaskType.INFERENCE
            and self.dedicated_inference_resource
        ):
            logger.info(
                f"Routing inference task {task_id} to dedicated inference resource"
            )
            return await self.dedicated_inference_resource.execute_task(
                task_id, task_config, entrypoint, args, kwargs
            )

        # Route other tasks to job resources
        resource = await self._get_available_resource(task_config.task_type)
        if not resource:
            raise RuntimeError("No available resources for task execution")

        # Execute the task
        return await resource.execute_task(
            task_id, task_config, entrypoint, args, kwargs
        )

    async def _get_available_resource(
        self, task_type: TaskType
    ) -> Optional[BaseResource]:
        """Get an available resource for task execution.

        Currently picks the first available resource; task_type is accepted for
        future load-balancing/priority strategies.
        """
        available_resources = [r for r in self.resources if r.is_available]
        if available_resources:
            return available_resources[0]
        return None

    @staticmethod
    def _info_as_dict(info: ResourceInfo) -> Dict[str, Any]:
        """Serialize a ResourceInfo into a plain dict for stats reporting."""
        return {
            "resource_id": info.resource_id,
            "resource_type": info.resource_type.value,
            "status": info.status.value,
            "current_load": info.current_load,
            "memory_usage_mb": info.memory_usage_mb,
            "cpu_usage_percent": info.cpu_usage_percent,
            "active_tasks": info.active_tasks,
            "total_tasks_processed": info.total_tasks_processed,
            "last_heartbeat": info.last_heartbeat,
            "metadata": info.metadata,
        }

    def get_stats(self) -> Dict[str, Any]:
        """Get resource manager statistics."""
        available_resources = [r for r in self.resources if r.is_available]
        active_resources = [
            r for r in self.resources if r.status != ResourceStatus.IDLE
        ]

        total_resources = len(self.resources)
        average_load = (
            len(active_resources) / total_resources if total_resources > 0 else 0.0
        )

        stats = {
            "total_resources": total_resources,
            "available_resources": len(available_resources),
            "active_resources": len(active_resources),
            "average_load": average_load,
            # get_info() is called once per resource instead of once per field.
            "resources": [self._info_as_dict(r.get_info()) for r in self.resources],
            "dedicated_inference": None,
        }

        # Dedicated inference resource stats
        if self.dedicated_inference_resource:
            stats["dedicated_inference"] = self._info_as_dict(
                self.dedicated_inference_resource.get_info()
            )

        return stats

    def get_resource_info(self) -> List[ResourceInfo]:
        """Get information about all resources."""
        resource_info = [resource.get_info() for resource in self.resources]

        # Dedicated inference resource
        if self.dedicated_inference_resource:
            resource_info.append(self.dedicated_inference_resource.get_info())

        return resource_info
This class handles: - Resource creation and lifecycle management - Load balancing across resources - Health monitoring and recovery - Resource allocation for tasks - Dedicated inference process management (legacy IPC compatibility)
Methods
async def execute_task(self,
task_config: TaskConfig,
entrypoint: Callable,
*args,
**kwargs) ‑> TaskResult-
Expand source code
async def execute_task( self, task_config: TaskConfig, entrypoint: Callable, *args, **kwargs ) -> TaskResult: """Execute a task using an available resource.""" task_id = str(uuid.uuid4()) # Route inference tasks to dedicated inference resource if ( task_config.task_type == TaskType.INFERENCE and self.dedicated_inference_resource ): logger.info( f"Routing inference task {task_id} to dedicated inference resource" ) return await self.dedicated_inference_resource.execute_task( task_id, task_config, entrypoint, args, kwargs ) # Route other tasks to job resources resource = await self._get_available_resource(task_config.task_type) if not resource: raise RuntimeError("No available resources for task execution") # Execute the task return await resource.execute_task( task_id, task_config, entrypoint, args, kwargs )Execute a task using an available resource.
def get_resource_info(self) ‑> List[ResourceInfo]-
Expand source code
def get_resource_info(self) -> List[ResourceInfo]: """Get information about all resources.""" resource_info = [] # Job resources for resource in self.resources: resource_info.append(resource.get_info()) # Dedicated inference resource if self.dedicated_inference_resource: resource_info.append(self.dedicated_inference_resource.get_info()) return resource_infoGet information about all resources.
def get_stats(self) ‑> Dict[str, Any]-
Expand source code
def get_stats(self) -> Dict[str, Any]: """Get resource manager statistics.""" available_resources = [r for r in self.resources if r.is_available] active_resources = [ r for r in self.resources if r.status != ResourceStatus.IDLE ] total_resources = len(self.resources) average_load = ( len(active_resources) / total_resources if total_resources > 0 else 0.0 ) stats = { "total_resources": total_resources, "available_resources": len(available_resources), "active_resources": len(active_resources), "average_load": average_load, "resources": [ { "resource_id": r.get_info().resource_id, "resource_type": r.get_info().resource_type.value, "status": r.get_info().status.value, "current_load": r.get_info().current_load, "memory_usage_mb": r.get_info().memory_usage_mb, "cpu_usage_percent": r.get_info().cpu_usage_percent, "active_tasks": r.get_info().active_tasks, "total_tasks_processed": r.get_info().total_tasks_processed, "last_heartbeat": r.get_info().last_heartbeat, "metadata": r.get_info().metadata, } for r in self.resources ], "dedicated_inference": None, } # Dedicated inference resource stats if self.dedicated_inference_resource: info = self.dedicated_inference_resource.get_info() stats["dedicated_inference"] = { "resource_id": info.resource_id, "resource_type": info.resource_type.value, "status": info.status.value, "current_load": info.current_load, "memory_usage_mb": info.memory_usage_mb, "cpu_usage_percent": info.cpu_usage_percent, "active_tasks": info.active_tasks, "total_tasks_processed": info.total_tasks_processed, "last_heartbeat": info.last_heartbeat, "metadata": info.metadata, } return statsGet resource manager statistics.
async def start(self)-
Expand source code
async def start(self):
    """Start the resource manager.

    Creates the dedicated inference resource (if enabled in config),
    starts the health-check and resource-lifecycle background loops,
    then creates the initial pool of resources.
    """
    logger.info("Starting resource manager")

    # Create dedicated inference resource if enabled
    if self.config.use_dedicated_inference_process:
        await self._create_dedicated_inference_resource()

    # Start health monitoring
    self._health_check_task = asyncio.create_task(self._health_check_loop())

    # Start resource creation
    self._resource_creation_task = asyncio.create_task(
        self._resource_lifecycle_loop()
    )

    # Initialize initial resources
    await self._create_initial_resources()

    logger.info("Resource manager started")
async def stop(self)-
Expand source code
async def stop(self):
    """Stop the resource manager.

    Cancels the background monitoring/lifecycle loops, then shuts down
    every resource (including the dedicated inference resource)
    concurrently; shutdown exceptions are collected, not raised.
    """
    logger.info("Stopping resource manager")
    self._shutdown = True

    # Cancel background loops first so no new resources are created.
    if self._health_check_task:
        self._health_check_task.cancel()
    if self._resource_creation_task:
        self._resource_creation_task.cancel()

    # Shut down all resources in parallel.
    shutdown_tasks = [resource.shutdown() for resource in self.resources]
    if self.dedicated_inference_resource:
        shutdown_tasks.append(self.dedicated_inference_resource.shutdown())
    if shutdown_tasks:
        await asyncio.gather(*shutdown_tasks, return_exceptions=True)

    logger.info("Resource manager stopped")
class ResourceStatus (*args, **kwds)-
Expand source code
class ResourceStatus(Enum):
    """Status of a resource.

    Used by the resource manager: anything other than IDLE is counted
    as "active" in its statistics.
    """
    IDLE = "idle"
    BUSY = "busy"
    INITIALIZING = "initializing"
    SHUTTING_DOWN = "shutting_down"
    ERROR = "error"
Ancestors
- enum.Enum
Class variables
var BUSYvar ERRORvar IDLEvar INITIALIZINGvar SHUTTING_DOWN
class ResourceType (*args, **kwds)-
Expand source code
class ResourceType(Enum):
    """Type of resource for task execution.

    PROCESS runs work in a separate OS process; THREAD runs it in a
    thread of the current process.
    """
    PROCESS = "process"
    THREAD = "thread"
Ancestors
- enum.Enum
Class variables
var PROCESSvar THREAD
class RoomOptions (transport_mode: str | TransportMode | None = None,
websocket: WebSocketConfig | None = None,
webrtc: WebRTCConfig | None = None,
traces: TracesOptions | None = None,
metrics: MetricsOptions | None = None,
logs: LoggingOptions | None = None,
**kwargs)-
Expand source code
@dataclass
class RoomOptions:
    """Configuration options for connecting to and managing a VideoSDK room, including transport, telemetry, and session settings."""
    room_id: Optional[str] = None
    auth_token: Optional[str] = None
    name: Optional[str] = "Agent"
    agent_participant_id: Optional[str] = None
    playground: bool = True
    vision: bool = False
    recording: bool = False
    # recording=True → always record audio (track API). Optional RecordingOptions.video /
    # RecordingOptions.screen_share for camera video and/or screen share (see validate/resolve).
    recording_options: Optional[RecordingOptions] = None
    avatar: Optional[Any] = None
    join_meeting: Optional[bool] = True
    on_room_error: Optional[Callable[[Any], None]] = None
    send_analytics_to_pubsub: Optional[bool] = False

    # Session management options
    auto_end_session: bool = True
    session_timeout_seconds: Optional[int] = 5
    no_participant_timeout_seconds: Optional[int] = 90

    # VideoSDK connection options
    signaling_base_url: Optional[str] = "api.videosdk.live"
    background_audio: bool = False
    send_logs_to_dashboard: bool = False
    dashboard_log_level: str = "INFO"

    # Telemetry and logging configurations
    traces: Optional[TracesOptions] = None
    metrics: Optional[MetricsOptions] = None
    logs: Optional[LoggingOptions] = None

    # New Configuration Fields
    # Internal storage for the transport_mode property below; excluded from
    # the dataclass __init__ and repr.
    _transport_mode: TransportMode = field(default=TransportMode.VIDEOSDK, init=False, repr=False)

    # Structured configs
    websocket: Optional[WebSocketConfig] = None
    webrtc: Optional[WebRTCConfig] = None

    # Alias properties for easier usage as requested
    @property
    def transport_mode(self) -> TransportMode:
        return self._transport_mode

    @transport_mode.setter
    def transport_mode(self, value):
        # Accepts either a TransportMode member or its string value
        # (case-insensitive).
        if isinstance(value, str):
            try:
                self._transport_mode = TransportMode(value.lower())
            except ValueError:
                # Fallback for compatibility or custom modes
                # NOTE(review): an unrecognized string is silently ignored and
                # the previous mode is kept — confirm this is intended.
                pass
        elif isinstance(value, TransportMode):
            self._transport_mode = value

    def __init__(
        self,
        transport_mode: Optional[str | TransportMode] = None,
        websocket: Optional[WebSocketConfig] = None,
        webrtc: Optional[WebRTCConfig] = None,
        traces: Optional[TracesOptions] = None,
        metrics: Optional[MetricsOptions] = None,
        logs: Optional[LoggingOptions] = None,
        **kwargs,
    ):
        # This custom __init__ replaces the dataclass-generated one: fields not
        # passed via kwargs are never set as instance attributes and attribute
        # access falls back to the class-level defaults declared above.

        # Initialize internal field
        self._transport_mode = TransportMode.VIDEOSDK

        # Handle telemetry options
        self.traces = traces or TracesOptions()
        self.metrics = metrics or MetricsOptions()
        self.logs = logs or LoggingOptions()

        # Handle connection mode
        if transport_mode:
            if isinstance(transport_mode, str):
                try:
                    self._transport_mode = TransportMode(transport_mode.lower())
                except ValueError:
                    pass
            elif isinstance(transport_mode, TransportMode):
                self._transport_mode = transport_mode

        self.websocket = websocket or WebSocketConfig()
        self.webrtc = webrtc or WebRTCConfig()

        # Handle standard fields
        # NOTE(review): only names already present on the class/instance are
        # accepted; unknown kwargs are silently dropped — confirm intended.
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)
Instance variables
var agent_participant_id : str | Nonevar auth_token : str | Nonevar auto_end_session : boolvar avatar : typing.Any | Nonevar background_audio : boolvar dashboard_log_level : strvar join_meeting : bool | Nonevar logs : LoggingOptions | Nonevar metrics : MetricsOptions | Nonevar name : str | Nonevar no_participant_timeout_seconds : int | Nonevar on_room_error : Callable[[Any], None] | Nonevar playground : boolvar recording : boolvar recording_options : RecordingOptions | Nonevar room_id : str | Nonevar send_analytics_to_pubsub : bool | Nonevar send_logs_to_dashboard : boolvar session_timeout_seconds : int | Nonevar signaling_base_url : str | Nonevar traces : TracesOptions | Noneprop transport_mode : TransportMode-
Expand source code
@property def transport_mode(self) -> TransportMode: return self._transport_mode var vision : boolvar webrtc : WebRTCConfig | Nonevar websocket : WebSocketConfig | None
class STT-
Expand source code
class STT(EventEmitter[Literal["error"]]):
    """Base class for Speech-to-Text implementations"""

    def __init__(
        self,
    ) -> None:
        super().__init__()
        self._label = f"{type(self).__module__}.{type(self).__name__}"
        # Single consumer callback; stream_transcribe() swaps it out temporarily.
        self._transcript_callback: Optional[Callable[[STTResponse], Awaitable[None]]] = None

    @property
    def label(self) -> str:
        """Get the STT provider label"""
        return self._label

    def on_stt_transcript(self, callback: Callable[[STTResponse], Awaitable[None]]) -> None:
        """Set callback for receiving STT transcripts"""
        self._transcript_callback = callback

    @abstractmethod
    async def process_audio(
        self,
        audio_frames: bytes,
        language: Optional[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Process audio frames and convert to text

        Args:
            audio_frames: Iterator of bytes to process
            language: Optional language code for recognition
            **kwargs: Additional provider-specific arguments

        Returns:
            AsyncIterator yielding STTResponse objects
        """
        raise NotImplementedError

    async def stream_transcribe(
        self,
        audio_stream: AsyncIterator[bytes],
        **kwargs: Any
    ) -> AsyncIterator[STTResponse]:
        """
        Process audio stream and yield STT events (simulated via process_audio).

        This default implementation allows using the streaming hook pattern
        even if the STT subclass doesn't implement a native streaming method,
        by redirecting callbacks to a queue.

        Args:
            audio_stream: Async iterator of audio bytes
            **kwargs: Additional provider-specific arguments

        Yields:
            STTResponse objects

        Raises:
            Whatever process_audio / the audio stream raises (propagated from
            the background feeder task).
        """
        event_queue: asyncio.Queue = asyncio.Queue()

        async def capture_callback(response):
            await event_queue.put(response)

        original_callback = self._transcript_callback
        self.on_stt_transcript(capture_callback)

        async def feed_audio():
            # Pump the audio stream through process_audio; transcripts arrive
            # via capture_callback.
            async for chunk in audio_stream:
                await self.process_audio(chunk, **kwargs)

        feed_task = asyncio.create_task(feed_audio())
        get_task = None
        try:
            while True:
                # Reuse a single pending queue-get task. (The previous version
                # created a fresh get() task every iteration and abandoned the
                # unfinished ones, leaking tasks that could also consume — and
                # thus drop — queued events.)
                if get_task is None:
                    get_task = asyncio.create_task(event_queue.get())
                done, _ = await asyncio.wait(
                    [get_task, feed_task],
                    return_when=asyncio.FIRST_COMPLETED
                )
                if get_task in done:
                    yield get_task.result()
                    get_task = None
                if feed_task.done():
                    # Surface feeder errors instead of silently swallowing them.
                    exc = feed_task.exception()
                    if exc is not None:
                        raise exc
                    if event_queue.empty():
                        break
        finally:
            if get_task is not None and not get_task.done():
                get_task.cancel()
            self._transcript_callback = original_callback
            if not feed_task.done():
                feed_task.cancel()

    async def aclose(self) -> None:
        """Cleanup resources"""
        logger.info(f"Cleaning up STT: {self.label}")
        self._transcript_callback = None
        try:
            import gc
            gc.collect()
            logger.info(f"STT garbage collection completed: {self.label}")
        except Exception as e:
            logger.error(f"Error during STT garbage collection: {e}")
        logger.info(f"STT cleanup completed: {self.label}")
Ancestors
- EventEmitter
- typing.Generic
Subclasses
Instance variables
prop label : str-
Expand source code
@property def label(self) -> str: """Get the STT provider label""" return self._labelGet the STT provider label
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None: """Cleanup resources""" logger.info(f"Cleaning up STT: {self.label}") self._transcript_callback = None try: import gc gc.collect() logger.info(f"STT garbage collection completed: {self.label}") except Exception as e: logger.error(f"Error during STT garbage collection: {e}") logger.info(f"STT cleanup completed: {self.label}")Cleanup resources
def on_stt_transcript(self,
callback: Callable[[STTResponse], Awaitable[None]]) ‑> None-
Expand source code
def on_stt_transcript(self, callback: Callable[[STTResponse], Awaitable[None]]) -> None: """Set callback for receiving STT transcripts""" self._transcript_callback = callbackSet callback for receiving STT transcripts
async def process_audio(self, audio_frames: bytes, language: Optional[str] = None, **kwargs: Any) ‑> None-
Expand source code
@abstractmethod async def process_audio( self, audio_frames: bytes, language: Optional[str] = None, **kwargs: Any ) -> None: """ Process audio frames and convert to text Args: audio_frames: Iterator of bytes to process language: Optional language code for recognition **kwargs: Additional provider-specific arguments Returns: AsyncIterator yielding STTResponse objects """ raise NotImplementedErrorProcess audio frames and convert to text
Args
audio_frames- Iterator of bytes to process
language- Optional language code for recognition
**kwargs- Additional provider-specific arguments
Returns
AsyncIterator yielding STTResponse objects
async def stream_transcribe(self, audio_stream: AsyncIterator[bytes], **kwargs: Any) ‑> AsyncIterator[STTResponse]-
Expand source code
async def stream_transcribe( self, audio_stream: AsyncIterator[bytes], **kwargs: Any ) -> AsyncIterator[STTResponse]: """ Process audio stream and yield STT events (simulated via process_audio). This default implementation allows using the streaming hook pattern even if the STT subclass doesn't implement a native streaming method, by redirecting callbacks to a queue. Args: audio_stream: Async iterator of audio bytes **kwargs: Additional provider-specific arguments Yields: STTResponse objects """ event_queue = asyncio.Queue() async def capture_callback(response): await event_queue.put(response) original_callback = self._transcript_callback self.on_stt_transcript(capture_callback) async def feed_audio(): try: async for chunk in audio_stream: await self.process_audio(chunk, **kwargs) finally: pass feed_task = asyncio.create_task(feed_audio()) try: while True: done, pending = await asyncio.wait( [asyncio.create_task(event_queue.get()), feed_task], return_when=asyncio.FIRST_COMPLETED ) for task in done: if task == feed_task: pass else: event = task.result() yield event if feed_task.done() and event_queue.empty(): break finally: self._transcript_callback = original_callback if not feed_task.done(): feed_task.cancel()Process audio stream and yield STT events (simulated via process_audio).
This default implementation allows using the streaming hook pattern even if the STT subclass doesn't implement a native streaming method, by redirecting callbacks to a queue.
Args
audio_stream- Async iterator of audio bytes
**kwargs- Additional provider-specific arguments
Yields
STTResponse objects
Inherited members
class STTResponse (**data: Any)-
Expand source code
class STTResponse(BaseModel):
    """Response from STT processing

    Attributes:
        event_type: The type of speech event.
        data: The data from the speech event.
        metadata: Additional metadata from the speech event.
    """
    event_type: SpeechEventType
    data: SpeechData
    # Optional extra information; None when the event carries nothing extra.
    metadata: Optional[dict[str, Any]] = None
Attributes
event_type- The type of speech event.
data- The data from the speech event.
metadata- Additional metadata from the speech event.
Create a new model by parsing and validating input data from keyword arguments.
Raises a ValidationError (pydantic_core.ValidationError) if the input data cannot be validated to form a valid model. `self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var data : SpeechDatavar event_type : SpeechEventTypevar metadata : dict[str, typing.Any] | Nonevar model_config
class SpeechData (text: str,
confidence: float = 0.0,
language: Optional[str] = None,
start_time: float = 0.0,
end_time: float = 0.0,
duration: float = 0.0)-
Expand source code
@dataclass
class SpeechData:
    """Data structure for speech recognition results

    Attributes:
        text: The recognized text.
        confidence: The confidence level of the recognition.
        language: The language of the recognized text.
        start_time: The start time of the speech.
        end_time: The end time of the speech.
        duration: The duration of the speech segment.
    """
    text: str
    confidence: float = 0.0
    language: Optional[str] = None
    start_time: float = 0.0
    end_time: float = 0.0
    duration: float = 0.0
Attributes
text- The recognized text.
confidence- The confidence level of the recognition.
language- The language of the recognized text.
start_time- The start time of the speech.
end_time- The end time of the speech.
Instance variables
var confidence : floatvar duration : floatvar end_time : floatvar language : str | Nonevar start_time : floatvar text : str
class SpeechEventType (*args, **kwds)-
Expand source code
class SpeechEventType(str, Enum):
    """Type of speech event

    str-valued enum so members compare equal to their string values.
    """
    START = "start_of_speech"
    INTERIM = "interim_transcript"
    PREFLIGHT = "preflight_transcript"
    FINAL = "final_transcript"
    END = "end_of_speech"
Ancestors
- builtins.str
- enum.Enum
Class variables
var ENDvar FINALvar INTERIMvar PREFLIGHTvar START
class TTS (sample_rate: int = 16000, num_channels: int = 1)-
Expand source code
class TTS(EventEmitter[Literal["error"]]):
    """Base class for Text-to-Speech implementations"""

    def __init__(
        self,
        sample_rate: int = 16000,
        num_channels: int = 1
    ) -> None:
        super().__init__()
        self._label = f"{type(self).__module__}.{type(self).__name__}"
        self._sample_rate = sample_rate
        self._num_channels = num_channels
        # Fired when the first audio byte of a synthesis is produced
        # (implementations use this for TTFB metrics).
        self._first_audio_callback: Optional[Callable[[], Awaitable[None]]] = None
        # Output track; assigned externally, temporarily mocked by
        # stream_synthesize().
        self.audio_track = None

    @property
    def label(self) -> str:
        """Get the TTS provider label"""
        return self._label

    @property
    def sample_rate(self) -> int:
        """Get audio sample rate"""
        return self._sample_rate

    @property
    def num_channels(self) -> int:
        """Get number of audio channels"""
        return self._num_channels

    def on_first_audio_byte(self, callback: Callable[[], Awaitable[None]]) -> None:
        """Set callback for when first audio byte is produced"""
        self._first_audio_callback = callback

    def reset_first_audio_tracking(self) -> None:
        """Reset the first audio tracking state for next TTS task"""
        # To be overridden by implementations for TTFB metrics
        pass

    async def pause(self) -> None:
        """Pause audio playback if the audio track supports it, otherwise interrupt."""
        if self.audio_track and hasattr(self.audio_track, 'pause'):
            await self.audio_track.pause()
        else:
            await self.interrupt()

    async def resume(self) -> None:
        """Resume audio playback if the audio track supports it."""
        if self.audio_track and hasattr(self.audio_track, 'resume'):
            await self.audio_track.resume()

    @property
    def can_pause(self) -> bool:
        """Return whether the current audio track supports pausing."""
        # bool(...) so callers always get a real bool (the previous version
        # could return None or the track object via `and` short-circuiting).
        return bool(
            self.audio_track
            and hasattr(self.audio_track, 'can_pause')
            and self.audio_track.can_pause
        )

    @abstractmethod
    async def synthesize(
        self,
        text: AsyncIterator[str] | str,
        voice_id: Optional[str] = None,
        **kwargs: Any
    ) -> None:
        """
        Convert text to speech

        Args:
            text: Text to convert to speech (either string or async iterator of strings)
            voice_id: Optional voice identifier
            **kwargs: Additional provider-specific arguments

        Returns:
            None
        """
        raise NotImplementedError

    @abstractmethod
    async def interrupt(self) -> None:
        """Interrupt the TTS process"""
        raise NotImplementedError

    async def stream_synthesize(
        self,
        text_stream: AsyncIterator[str],
        **kwargs: Any
    ) -> AsyncIterator[bytes]:
        """
        Synthesize text stream to audio stream.

        This default implementation mocks the audio track to capture frames.

        Args:
            text_stream: Async iterator of text
            **kwargs: Additional arguments

        Yields:
            Audio bytes
        """
        original_track = self.audio_track
        frame_queue: asyncio.Queue = asyncio.Queue()

        class QueueTrack:
            # Minimal stand-in for the real audio track: forwards synthesized
            # bytes into frame_queue; all other hooks are no-ops.
            def __init__(self):
                self.hooks = None

            async def add_new_bytes(self, audio_data: bytes):
                await frame_queue.put(audio_data)

            def on_last_audio_byte(self, cb):
                pass

            def set_pipeline_hooks(self, hooks):
                self.hooks = hooks

            def enable_audio_input(self, manual_control=False):
                pass

            def interrupt(self):
                pass

        mock_track = QueueTrack()
        self.audio_track = mock_track

        async def synthesize_task():
            try:
                await self.synthesize(text_stream, **kwargs)
            finally:
                # Sentinel marks end-of-stream for the consumer loop.
                await frame_queue.put(None)

        task = asyncio.create_task(synthesize_task())
        get_task = None
        try:
            while True:
                # Reuse one pending queue-get task. (The previous version
                # created a new get() task each iteration and could leave the
                # old one pending — leaking tasks and losing frames to the
                # abandoned consumer.)
                if get_task is None:
                    get_task = asyncio.create_task(frame_queue.get())
                done, _ = await asyncio.wait(
                    [get_task, task],
                    return_when=asyncio.FIRST_COMPLETED
                )
                if get_task in done:
                    data = get_task.result()
                    get_task = None
                    if data is None:
                        break
                    yield data
                if task in done and task.exception():
                    raise task.exception()
        finally:
            if get_task is not None and not get_task.done():
                get_task.cancel()
            self.audio_track = original_track
            if not task.done():
                task.cancel()

    async def aclose(self) -> None:
        """Cleanup resources"""
        logger.info(f"Cleaning up TTS: {self.label}")
        self._first_audio_callback = None
        try:
            import gc
            gc.collect()
        except Exception as e:
            logger.error(f"Error during TTS garbage collection: {e}")
        logger.info(f"TTS cleanup completed: {self.label}")

    async def __aenter__(self) -> TTS:
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self.aclose()
Ancestors
- EventEmitter
- typing.Generic
Subclasses
Instance variables
prop can_pause : bool-
Expand source code
@property def can_pause(self) -> bool: """Return whether the current audio track supports pausing.""" return self.audio_track and hasattr(self.audio_track, 'can_pause') and self.audio_track.can_pauseReturn whether the current audio track supports pausing.
prop label : str-
Expand source code
@property def label(self) -> str: """Get the TTS provider label""" return self._labelGet the TTS provider label
prop num_channels : int-
Expand source code
@property def num_channels(self) -> int: """Get number of audio channels""" return self._num_channelsGet number of audio channels
prop sample_rate : int-
Expand source code
@property def sample_rate(self) -> int: """Get audio sample rate""" return self._sample_rateGet audio sample rate
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None: """Cleanup resources""" logger.info(f"Cleaning up TTS: {self.label}") self._first_audio_callback = None try: import gc gc.collect() except Exception as e: logger.error(f"Error during TTS garbage collection: {e}") logger.info(f"TTS cleanup completed: {self.label}")Cleanup resources
async def interrupt(self) ‑> None-
Expand source code
@abstractmethod async def interrupt(self) -> None: """Interrupt the TTS process""" raise NotImplementedErrorInterrupt the TTS process
def on_first_audio_byte(self, callback: Callable[[], Awaitable[None]]) ‑> None-
Expand source code
def on_first_audio_byte(self, callback: Callable[[], Awaitable[None]]) -> None: """Set callback for when first audio byte is produced""" self._first_audio_callback = callbackSet callback for when first audio byte is produced
async def pause(self) ‑> None-
Expand source code
async def pause(self) -> None: """Pause audio playback if the audio track supports it, otherwise interrupt.""" if self.audio_track and hasattr(self.audio_track, 'pause'): await self.audio_track.pause() else: await self.interrupt()Pause audio playback if the audio track supports it, otherwise interrupt.
def reset_first_audio_tracking(self) ‑> None-
Expand source code
def reset_first_audio_tracking(self) -> None: """Reset the first audio tracking state for next TTS task""" # To be overridden by implementations for TTFB metrics passReset the first audio tracking state for next TTS task
async def resume(self) ‑> None-
Expand source code
async def resume(self) -> None: """Resume audio playback if the audio track supports it.""" if self.audio_track and hasattr(self.audio_track, 'resume'): await self.audio_track.resume()Resume audio playback if the audio track supports it.
async def stream_synthesize(self, text_stream: AsyncIterator[str], **kwargs: Any) ‑> AsyncIterator[bytes]-
Expand source code
async def stream_synthesize( self, text_stream: AsyncIterator[str], **kwargs: Any ) -> AsyncIterator[bytes]: """ Synthesize text stream to audio stream. This default implementation mocks the audio track to capture frames. Args: text_stream: Async iterator of text **kwargs: Additional arguments Yields: Audio bytes """ original_track = self.audio_track frame_queue = asyncio.Queue() class QueueTrack: def __init__(self): self.hooks = None async def add_new_bytes(self, audio_data: bytes): await frame_queue.put(audio_data) def on_last_audio_byte(self, cb): pass def set_pipeline_hooks(self, hooks): self.hooks = hooks def enable_audio_input(self, manual_control=False): pass def interrupt(self): pass mock_track = QueueTrack() self.audio_track = mock_track async def synthesize_task(): try: await self.synthesize(text_stream, **kwargs) finally: await frame_queue.put(None) task = asyncio.create_task(synthesize_task()) try: while True: get_task = asyncio.create_task(frame_queue.get()) done, pending = await asyncio.wait( [get_task, task], return_when=asyncio.FIRST_COMPLETED ) if get_task in done: data = get_task.result() if data is None: break yield data if task in done: if task.exception(): raise task.exception() finally: self.audio_track = original_track if not task.done(): task.cancel()Synthesize text stream to audio stream.
This default implementation mocks the audio track to capture frames.
Args
text_stream- Async iterator of text
**kwargs- Additional arguments
Yields
Audio bytes
async def synthesize(self,
text: AsyncIterator[str] | str,
voice_id: Optional[str] = None,
**kwargs: Any) ‑> None-
Expand source code
@abstractmethod async def synthesize( self, text: AsyncIterator[str] | str, voice_id: Optional[str] = None, **kwargs: Any ) -> None: """ Convert text to speech Args: text: Text to convert to speech (either string or async iterator of strings) voice_id: Optional voice identifier **kwargs: Additional provider-specific arguments Returns: None """ raise NotImplementedErrorConvert text to speech
Args
text- Text to convert to speech (either string or async iterator of strings)
voice_id- Optional voice identifier
**kwargs- Additional provider-specific arguments
Returns
None
Inherited members
class TaskConfig (task_type: TaskType,
timeout: float = 300.0,
retry_count: int = 3,
priority: int = 0,
required_memory_mb: float = 100.0,
required_cpu_cores: int = 1,
data: Dict[str, Any] = <factory>)-
Expand source code
@dataclass
class TaskConfig:
    """Configuration for task execution.

    Attributes:
        task_type: Kind of work (inference / meeting / job).
        timeout: Per-task timeout in seconds.
        retry_count: Number of retries on failure.
        priority: Scheduling priority; higher number = higher priority.
        required_memory_mb: Memory requirement hint for resource selection.
        required_cpu_cores: CPU requirement hint for resource selection.
        data: Task-specific payload.
    """
    task_type: TaskType
    timeout: float = 300.0  # 5 minutes default
    retry_count: int = 3
    priority: int = 0  # Higher number = higher priority

    # Resource requirements
    required_memory_mb: float = 100.0
    required_cpu_cores: int = 1

    # Task-specific data
    data: Dict[str, Any] = field(default_factory=dict)
Instance variables
var data : Dict[str, Any]var priority : intvar required_cpu_cores : intvar required_memory_mb : floatvar retry_count : intvar task_type : TaskTypevar timeout : float
class TaskExecutor (config: ResourceConfig)-
Expand source code
class TaskExecutor:
    """
    High-level task executor that manages task execution using resources.

    This class provides a simple interface for executing tasks while
    handling resource management, retries, and monitoring internally.
    """

    def __init__(self, config: ResourceConfig):
        self.config = config
        self.resource_manager = ResourceManager(config)
        self._shutdown = False
        # Stats are recorded once per execute() call (final outcome only).
        self._total_tasks = 0
        self._completed_tasks = 0
        self._failed_tasks = 0
        self._total_execution_time = 0.0

    async def start(self):
        """Start the task executor."""
        logger.info("Starting task executor")
        await self.resource_manager.start()
        logger.info("Task executor started")

    async def stop(self):
        """Stop the task executor."""
        logger.info("Stopping task executor")
        self._shutdown = True

        # Stop resource manager
        await self.resource_manager.stop()
        logger.info("Task executor stopped")

    async def execute(
        self,
        entrypoint: Callable,
        task_type: TaskType = TaskType.JOB,
        timeout: float = 300.0,
        retry_count: int = 3,
        priority: int = 0,
        *args,
        **kwargs,
    ) -> TaskResult:
        """
        Execute a task using the resource manager.

        Args:
            entrypoint: Function to execute
            task_type: Type of task (inference, meeting, job)
            timeout: Task timeout in seconds
            retry_count: Number of retries on failure
            priority: Task priority (higher = higher priority)
            *args, **kwargs: Arguments to pass to entrypoint

        Returns:
            TaskResult with execution results
        """
        task_config = TaskConfig(
            task_type=task_type,
            timeout=timeout,
            retry_count=retry_count,
            priority=priority,
        )

        # Execute with retries. Statistics are updated exactly once per call
        # (the previous version counted every failed attempt AND the final
        # synthetic failure, inflating total_tasks/failed_tasks).
        last_error = None
        for attempt in range(retry_count + 1):
            try:
                result = await self.resource_manager.execute_task(
                    task_config, entrypoint, *args, **kwargs
                )
                if result.status == TaskStatus.COMPLETED:
                    self._update_stats(result)
                    return result
                last_error = result.error
            except Exception as e:
                last_error = str(e)
                logger.warning(f"Task execution attempt {attempt + 1} failed: {e}")
            if attempt < retry_count:
                # Linear backoff between attempts: 1s, 2s, 3s, ...
                await asyncio.sleep(1.0 * (attempt + 1))

        # All retries failed
        failed_result = TaskResult(
            task_id=task_config.task_type.value,
            status=TaskStatus.FAILED,
            error=f"All {retry_count + 1} attempts failed. Last error: {last_error}",
            execution_time=0.0,
        )
        self._update_stats(failed_result)
        return failed_result

    def _update_stats(self, result: TaskResult):
        """Update execution statistics with one final task outcome."""
        self._total_tasks += 1
        if result.status == TaskStatus.COMPLETED:
            self._completed_tasks += 1
            self._total_execution_time += result.execution_time
        elif result.status == TaskStatus.FAILED:
            self._failed_tasks += 1

    def get_stats(self) -> Dict[str, Any]:
        """Get executor statistics."""
        resource_stats = self.resource_manager.get_stats()
        average_execution_time = (
            self._total_execution_time / self._completed_tasks
            if self._completed_tasks > 0
            else 0.0
        )
        return {
            "executor_stats": {
                "total_tasks": self._total_tasks,
                "completed_tasks": self._completed_tasks,
                "failed_tasks": self._failed_tasks,
                "pending_tasks": 0,
                "average_execution_time": average_execution_time,
                "total_execution_time": self._total_execution_time,
            },
            "resource_stats": resource_stats,
        }

    def get_resource_info(self) -> List[ResourceInfo]:
        """Get information about all resources."""
        return self.resource_manager.get_resource_info()
This class provides a simple interface for executing tasks while handling resource management, retries, and monitoring internally.
Methods
async def execute(self,
entrypoint: Callable,
task_type: TaskType = TaskType.JOB,
timeout: float = 300.0,
retry_count: int = 3,
priority: int = 0,
*args,
**kwargs) ‑> TaskResult-
Expand source code
async def execute( self, entrypoint: Callable, task_type: TaskType = TaskType.JOB, timeout: float = 300.0, retry_count: int = 3, priority: int = 0, *args, **kwargs, ) -> TaskResult: """ Execute a task using the resource manager. Args: entrypoint: Function to execute task_type: Type of task (inference, meeting, job) timeout: Task timeout in seconds retry_count: Number of retries on failure priority: Task priority (higher = higher priority) *args, **kwargs: Arguments to pass to entrypoint Returns: TaskResult with execution results """ task_config = TaskConfig( task_type=task_type, timeout=timeout, retry_count=retry_count, priority=priority, ) # Execute with retries last_error = None for attempt in range(retry_count + 1): try: result = await self.resource_manager.execute_task( task_config, entrypoint, *args, **kwargs ) # Update stats self._update_stats(result) if result.status == TaskStatus.COMPLETED: return result else: last_error = result.error except Exception as e: last_error = str(e) logger.warning(f"Task execution attempt {attempt + 1} failed: {e}") if attempt < retry_count: await asyncio.sleep(1.0 * (attempt + 1)) # Exponential backoff # All retries failed failed_result = TaskResult( task_id=task_config.task_type.value, status=TaskStatus.FAILED, error=f"All {retry_count + 1} attempts failed. Last error: {last_error}", execution_time=0.0, ) self._update_stats(failed_result) return failed_resultExecute a task using the resource manager.
Args
entrypoint- Function to execute
task_type- Type of task (inference, meeting, job)
timeout- Task timeout in seconds
retry_count- Number of retries on failure
priority- Task priority (higher = higher priority)
*args, **kwargs- Arguments to pass to entrypoint
Returns
TaskResult with execution results
def get_resource_info(self) ‑> List[ResourceInfo]-
Expand source code
def get_resource_info(self) -> List[ResourceInfo]: """Get information about all resources.""" return self.resource_manager.get_resource_info()Get information about all resources.
def get_stats(self) ‑> Dict[str, Any]-
Expand source code
def get_stats(self) -> Dict[str, Any]: """Get executor statistics.""" resource_stats = self.resource_manager.get_stats() average_execution_time = ( self._total_execution_time / self._completed_tasks if self._completed_tasks > 0 else 0.0 ) return { "executor_stats": { "total_tasks": self._total_tasks, "completed_tasks": self._completed_tasks, "failed_tasks": self._failed_tasks, "pending_tasks": 0, "average_execution_time": average_execution_time, "total_execution_time": self._total_execution_time, }, "resource_stats": resource_stats, }Get executor statistics.
async def start(self)-
Expand source code
async def start(self): """Start the task executor.""" logger.info("Starting task executor") await self.resource_manager.start() logger.info("Task executor started")Start the task executor.
async def stop(self)-
Expand source code
async def stop(self): """Stop the task executor.""" logger.info("Stopping task executor") self._shutdown = True # Stop resource manager await self.resource_manager.stop() logger.info("Task executor stopped")Stop the task executor.
class TaskResult (task_id: str,
status: TaskStatus,
result: typing.Any | None = None,
error: str | None = None,
execution_time: float = 0.0,
memory_used_mb: float = 0.0)-
Expand source code
@dataclass
class TaskResult:
    """Result of a task execution.

    Attributes:
        task_id: Identifier of the executed task.
        status: Final TaskStatus of the execution.
        result: Return value on success, if any.
        error: Error description on failure, if any.
        execution_time: Wall-clock execution time in seconds.
        memory_used_mb: Peak/used memory in megabytes.
    """
    task_id: str
    status: TaskStatus
    result: Optional[Any] = None
    error: Optional[str] = None
    execution_time: float = 0.0
    memory_used_mb: float = 0.0
Instance variables
var error : str | Nonevar execution_time : floatvar memory_used_mb : floatvar result : typing.Any | Nonevar status : TaskStatusvar task_id : str
class TaskStatus (*args, **kwds)-
Expand source code
class TaskStatus(Enum):
    """Status of a task."""

    PENDING = "pending"      # queued, not yet started
    RUNNING = "running"      # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"        # finished with an error
    CANCELLED = "cancelled"  # stopped before completion
Ancestors
- enum.Enum
Class variables
var CANCELLEDvar COMPLETEDvar FAILEDvar PENDINGvar RUNNING
class TaskType (*args, **kwds)-
Expand source code
class TaskType(Enum):
    """Type of task to be executed."""

    INFERENCE = "inference"  # For AI model inference
    MEETING = "meeting"      # For video meeting tasks
    JOB = "job"              # For general job execution
Ancestors
- enum.Enum
Class variables
var INFERENCEvar JOBvar MEETING
class TeeCustomAudioStreamTrack (loop, sinks=None, pipeline=None)-
Expand source code
class TeeCustomAudioStreamTrack(CustomAudioStreamTrack):
    """Audio track that duplicates outgoing audio bytes to registered sinks
    such as avatar plugins or local speakers."""

    def __init__(self, loop, sinks=None, pipeline=None):
        super().__init__(loop)
        # Each track owns its own sink list (avoid a shared mutable default).
        self.sinks = sinks if sinks is not None else []
        self.pipeline = pipeline

    def add_sink(self, sink):
        """Add a new sink (callback or object)"""
        if sink not in self.sinks:
            self.sinks.append(sink)

    def remove_sink(self, sink):
        """Remove a previously registered sink; no-op if it was never added."""
        if sink in self.sinks:
            self.sinks.remove(sink)

    async def _fan_out(self, audio_data: bytes) -> None:
        """Forward audio bytes to every sink; a failing sink never stops audio."""
        for sink in self.sinks:
            try:
                if hasattr(sink, "handle_audio_input"):
                    await sink.handle_audio_input(audio_data)
                elif callable(sink):
                    if asyncio.iscoroutinefunction(sink):
                        await sink(audio_data)
                    else:
                        sink(audio_data)
            except Exception as e:
                # Use the module-level logger (as the rest of this module does)
                # instead of re-importing logging on every failure.
                logger.warning("Avatar sink error (audio will continue): %s", e)

    def _silence_like(self, frame: AudioFrame) -> AudioFrame:
        """Build a silent frame carrying *frame*'s timing metadata."""
        silence = AudioFrame(format="s16", layout="mono", samples=self.samples)
        for p in silence.planes:
            p.update(bytes(p.buffer_size))
        silence.pts = frame.pts
        silence.time_base = frame.time_base
        silence.sample_rate = frame.sample_rate
        return silence

    async def add_new_bytes(self, audio_data: bytes):
        await super().add_new_bytes(audio_data)
        # Route audio to sinks (avatars, etc.)
        await self._fan_out(audio_data)

    async def recv(self) -> AudioFrame:
        """
        When avatar sinks are present the Avatar Server publishes audio to the
        meeting, so we must NOT publish the raw TTS audio directly (it would
        cause double audio). We still call super().recv() so that all internal
        state tracking and callbacks (is_speaking, on_first_audio_byte,
        on_last_audio_byte) continue to work correctly. The actual audio
        payload is replaced with silence before handing back to WebRTC.
        """
        frame = await super().recv()
        if self.sinks and frame is not None:
            return self._silence_like(frame)
        return frame
Ancestors
- CustomAudioStreamTrack
- videosdk.custom_audio_track.CustomAudioTrack
- vsaiortc.mediastreams.MediaStreamTrack
- pyee.asyncio.AsyncIOEventEmitter
- pyee.base.EventEmitter
Subclasses
Methods
def add_sink(self, sink)-
Expand source code
def add_sink(self, sink):
    """Register *sink* to receive copies of outgoing audio (idempotent)."""
    if sink in self.sinks:
        return
    self.sinks.append(sink)
async def recv(self) ‑> av.audio.frame.AudioFrame-
Expand source code
async def recv(self) -> AudioFrame:
    """
    Return the next audio frame for WebRTC, muted while avatar sinks are active.

    When avatar sinks are present the Avatar Server publishes audio to the
    meeting, so the raw TTS audio must not also be published directly (that
    would cause double audio). super().recv() is still awaited so internal
    state tracking and callbacks (is_speaking, on_first_audio_byte,
    on_last_audio_byte) keep working; only the payload is replaced with
    silence before handing back to WebRTC.
    """
    frame = await super().recv()
    if not self.sinks or frame is None:
        return frame
    muted = AudioFrame(format="s16", layout="mono", samples=self.samples)
    for plane in muted.planes:
        plane.update(bytes(plane.buffer_size))
    muted.pts = frame.pts
    muted.time_base = frame.time_base
    muted.sample_rate = frame.sample_rate
    return muted
def remove_sink(self, sink)-
Expand source code
def remove_sink(self, sink):
    """Deregister *sink*; silently ignores sinks that were never added."""
    try:
        self.sinks.remove(sink)
    except ValueError:
        pass
Inherited members
class TeeMixingCustomAudioStreamTrack (loop, sinks=None, pipeline=None)-
Expand source code
class TeeMixingCustomAudioStreamTrack(MixingCustomAudioStreamTrack):
    """Combines mixing and tee functionality, mixing background audio while
    also forwarding audio bytes to registered sinks."""

    def __init__(self, loop, sinks=None, pipeline=None):
        super().__init__(loop)
        # Each track owns its own sink list (avoid a shared mutable default).
        self.sinks = sinks if sinks is not None else []
        self.pipeline = pipeline

    async def add_new_bytes(self, audio_data: bytes):
        await super().add_new_bytes(audio_data)
        # Route audio to sinks (avatars, etc.); one bad sink must not stop audio.
        for sink in self.sinks:
            try:
                if hasattr(sink, "handle_audio_input"):
                    await sink.handle_audio_input(audio_data)
                elif callable(sink):
                    if asyncio.iscoroutinefunction(sink):
                        await sink(audio_data)
                    else:
                        sink(audio_data)
            except Exception as e:
                # Use the module-level logger (as the rest of this module does)
                # instead of re-importing logging on every failure.
                logger.warning("Avatar sink error (audio will continue): %s", e)

    async def recv(self) -> AudioFrame:
        """Silence the direct WebRTC output when avatar sinks are handling playback."""
        frame = await super().recv()
        if self.sinks and frame is not None:
            silence = AudioFrame(format="s16", layout="mono", samples=self.samples)
            for p in silence.planes:
                p.update(bytes(p.buffer_size))
            silence.pts = frame.pts
            silence.time_base = frame.time_base
            silence.sample_rate = frame.sample_rate
            return silence
        return frame
Ancestors
- MixingCustomAudioStreamTrack
- CustomAudioStreamTrack
- videosdk.custom_audio_track.CustomAudioTrack
- vsaiortc.mediastreams.MediaStreamTrack
- pyee.asyncio.AsyncIOEventEmitter
- pyee.base.EventEmitter
Methods
async def recv(self) ‑> av.audio.frame.AudioFrame-
Expand source code
async def recv(self) -> AudioFrame:
    """Silence the direct WebRTC output when avatar sinks are handling playback."""
    frame = await super().recv()
    if not self.sinks or frame is None:
        return frame
    muted = AudioFrame(format="s16", layout="mono", samples=self.samples)
    for plane in muted.planes:
        plane.update(bytes(plane.buffer_size))
    muted.pts = frame.pts
    muted.time_base = frame.time_base
    muted.sample_rate = frame.sample_rate
    return muted
Inherited members
class ThreadResource (resource_id: str, config: Dict[str, Any])-
Expand source code
class ThreadResource(BaseResource):
    """
    Thread-based resource for task execution.

    Uses threading for concurrent task execution within the same process.
    """

    def __init__(self, resource_id: str, config: Dict[str, Any]):
        super().__init__(resource_id, config)
        # Worker thread (created lazily in _initialize_impl).
        self.thread: Optional[threading.Thread] = None
        # NOTE(review): these queues are created on the caller's event loop but
        # consumed from a different event loop inside the worker thread;
        # asyncio.Queue is not thread-safe across loops — verify intended.
        self.task_queue: asyncio.Queue = asyncio.Queue()
        self.result_queue: asyncio.Queue = asyncio.Queue()
        self.control_queue: asyncio.Queue = asyncio.Queue()
        self._thread_ready = False
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    @property
    def resource_type(self) -> ResourceType:
        # Identifies this resource as thread-backed to the resource manager.
        return ResourceType.THREAD

    async def _initialize_impl(self) -> None:
        """Initialize the thread resource."""
        # Start the thread
        self.thread = threading.Thread(
            target=self._thread_worker,
            args=(
                self.resource_id,
                self.task_queue,
                self.result_queue,
                self.control_queue,
                self.config,
            ),
            daemon=True,  # do not block interpreter shutdown
        )
        self.thread.start()

        # Simple readiness check - just wait a bit for thread to start
        await asyncio.sleep(0.5)

        # Mark as ready if thread is alive
        if self.thread.is_alive():
            self._thread_ready = True
        else:
            raise RuntimeError(f"Thread {self.resource_id} failed to start")

    async def _execute_task_impl(
        self, task_id: str, config, entrypoint: Callable, args: tuple, kwargs: dict
    ) -> Any:
        """Execute task in the thread."""
        if not self._thread_ready:
            raise RuntimeError(f"Thread {self.resource_id} is not ready")

        # Send task to thread
        task_data = {
            "task_id": task_id,
            "config": config,
            "entrypoint": entrypoint,
            "args": args,
            "kwargs": kwargs,
        }
        await self.task_queue.put(task_data)

        # Wait for result: poll the result queue, bounded by the task timeout.
        timeout = config.timeout
        start_time = time.time()

        while (time.time() - start_time) < timeout:
            try:
                if not self.result_queue.empty():
                    result_data = await self.result_queue.get()
                    # NOTE(review): results whose task_id does not match are
                    # consumed and dropped here — verify that concurrent tasks
                    # on the same thread resource are not expected.
                    if result_data.get("task_id") == task_id:
                        if result_data.get("status") == "success":
                            return result_data.get("result")
                        else:
                            raise RuntimeError(
                                result_data.get("error", "Unknown error")
                            )
            except RuntimeError:
                # Task failure: propagate to the caller, do not swallow.
                raise
            except Exception as e:
                logger.warning(f"Error checking task result: {e}")

            await asyncio.sleep(0.1)

        raise TimeoutError(f"Task {task_id} timed out after {timeout}s")

    async def _shutdown_impl(self) -> None:
        """Shutdown the thread resource."""
        if self.thread and self.thread.is_alive():
            # Send shutdown signal
            await self.control_queue.put({"type": "shutdown"})

            # Wait for graceful shutdown
            timeout = self.config.get("close_timeout", 60.0)
            start_time = time.time()

            while self.thread.is_alive() and (time.time() - start_time) < timeout:
                await asyncio.sleep(0.1)

    @staticmethod
    def _thread_worker(
        resource_id: str,
        task_queue: asyncio.Queue,
        result_queue: asyncio.Queue,
        control_queue: asyncio.Queue,
        config: Dict[str, Any],
    ):
        """Worker function that runs in the thread."""
        try:
            # Set up event loop for this thread
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

            logger.info(f"Thread worker {resource_id} started")

            async def worker_main():
                # Main task processing loop
                while True:
                    try:
                        # Check for shutdown signal
                        if not control_queue.empty():
                            message = await control_queue.get()
                            if message.get("type") == "shutdown":
                                break

                        # Check for tasks
                        if not task_queue.empty():
                            task_data = await task_queue.get()
                            task_id = task_data["task_id"]
                            entrypoint = task_data["entrypoint"]
                            args = task_data.get("args", ())
                            kwargs = task_data.get("kwargs", {})

                            try:
                                # Execute the task (await coroutines in-loop).
                                if asyncio.iscoroutinefunction(entrypoint):
                                    result = await entrypoint(*args, **kwargs)
                                else:
                                    result = entrypoint(*args, **kwargs)

                                await result_queue.put(
                                    {
                                        "task_id": task_id,
                                        "status": "success",
                                        "result": result,
                                    }
                                )
                            except Exception as e:
                                await result_queue.put(
                                    {
                                        "task_id": task_id,
                                        "status": "error",
                                        "error": str(e),
                                    }
                                )
                        else:
                            # Idle: short sleep to avoid a hot busy-spin.
                            await asyncio.sleep(0.1)
                    except Exception as e:
                        logger.error(f"Error in thread worker {resource_id}: {e}")
                        await asyncio.sleep(1.0)

                logger.info(f"Thread worker {resource_id} shutting down")

            # Run the worker
            loop.run_until_complete(worker_main())

        except Exception as e:
            logger.error(f"Fatal error in thread worker {resource_id}: {e}")
        finally:
            if loop and not loop.is_closed():
                loop.close()
Uses threading for concurrent task execution within the same process.
Ancestors
- BaseResource
- abc.ABC
Inherited members
class TracesOptions (enabled: bool = True,
export_url: str | None = None,
export_headers: Dict[str, str] | None = None)-
Expand source code
@dataclass
class TracesOptions:
    """Configuration for OpenTelemetry trace export settings."""

    # Master switch for trace export.
    enabled: bool = True
    # Export endpoint URL; behavior when None is decided by the exporter — confirm.
    export_url: Optional[str] = None
    # Extra HTTP headers (e.g. authorization) attached to export requests.
    export_headers: Optional[Dict[str, str]] = None
Instance variables
var enabled : boolvar export_headers : Dict[str, str] | Nonevar export_url : str | None
class UserState (*args, **kwds)-
Expand source code
@enum.unique
class UserState(enum.Enum):
    """Represents the current state of the user in a conversation session."""

    IDLE = "idle"            # no user activity
    SPEAKING = "speaking"    # user is currently speaking
    LISTENING = "listening"  # user is listening to the agent
Ancestors
- enum.Enum
Class variables
var IDLEvar LISTENINGvar SPEAKING
class UtteranceHandle (utterance_id: str, interruptible: bool = True)-
Expand source code
class UtteranceHandle:
    """Tracks the lifecycle of one agent utterance via two futures:
    one resolved on completion, one resolved on interruption."""

    def __init__(self, utterance_id: str, interruptible: bool = True):
        self._id = utterance_id
        self._done_fut = asyncio.Future()
        self._interrupt_fut = asyncio.Future()
        self._interruptible = interruptible

    @property
    def id(self) -> str:
        """Unique identifier of this utterance."""
        return self._id

    @property
    def is_interruptible(self) -> bool:
        """Whether interrupt() may be called without force=True."""
        return self._interruptible

    def done(self) -> bool:
        """True once the utterance is complete (played out or interrupted)."""
        return self._done_fut.done()

    @property
    def interrupted(self) -> bool:
        """True once the utterance has been interrupted."""
        return self._interrupt_fut.done()

    def interrupt(self, *, force: bool = False) -> None:
        """Mark the utterance as interrupted.

        Raises:
            RuntimeError: if the utterance is non-interruptible and *force*
                is not set.
        """
        if not (force or self.is_interruptible):
            raise RuntimeError("This utterance does not allow interruptions.")
        if self._interrupt_fut.done():
            return  # already interrupted; idempotent
        self._interrupt_fut.set_result(None)
        self._mark_done()

    def _mark_done(self) -> None:
        """Resolve the completion future exactly once."""
        if not self._done_fut.done():
            self._done_fut.set_result(None)

    def __await__(self) -> Generator[Any, None, None]:
        """Awaiting the handle waits for the utterance to finish."""
        return self._done_fut.__await__()
Instance variables
prop id : str-
Expand source code
@property
def id(self) -> str:
    """Return the unique identifier of this utterance."""
    return self._id
Expand source code
@property
def interrupted(self) -> bool:
    """Returns True if the utterance was interrupted."""
    # Interruption is recorded by resolving the dedicated interrupt future.
    return self._interrupt_fut.done()
prop is_interruptible : bool-
Expand source code
@property
def is_interruptible(self) -> bool:
    """Whether interrupt() may be called without force=True."""
    return self._interruptible
Methods
def done(self) ‑> bool-
Expand source code
def done(self) -> bool:
    """Returns True if the utterance is complete (played out or interrupted)."""
    return self._done_fut.done()
def interrupt(self, *, force: bool = False) ‑> None-
Expand source code
def interrupt(self, *, force: bool = False) -> None:
    """Marks the utterance as interrupted.

    Raises RuntimeError when the utterance is non-interruptible and
    *force* is not set.
    """
    if not (force or self.is_interruptible):
        raise RuntimeError("This utterance does not allow interruptions.")
    if self._interrupt_fut.done():
        return  # already interrupted; idempotent
    self._interrupt_fut.set_result(None)
    self._mark_done()
class VAD (sample_rate: int = 16000,
threshold: float = 0.5,
min_speech_duration: float = 0.5,
min_silence_duration: float = 0.5)-
Expand source code
class VAD(EventEmitter[Literal["error", "info"]]):
    """Base class for Voice Activity Detection implementations"""

    def __init__(
        self,
        sample_rate: int = 16000,
        threshold: float = 0.5,
        min_speech_duration: float = 0.5,
        min_silence_duration: float = 0.5,
    ) -> None:
        super().__init__()
        # Provider label, e.g. "plugins.silero.SileroVAD".
        self._label = f"{type(self).__module__}.{type(self).__name__}"
        self._sample_rate = sample_rate
        # Detection threshold; presumably a probability cutoff in [0, 1] —
        # confirm semantics per provider.
        self._threshold = threshold
        self._min_speech_duration = min_speech_duration
        self._min_silence_duration = min_silence_duration
        # Async callback implementations invoke for each VADResponse.
        self._vad_callback: Optional[Callable[[VADResponse], Awaitable[None]]] = None

    @property
    def label(self) -> str:
        """Get the VAD provider label"""
        return self._label

    @property
    def sample_rate(self) -> int:
        """Get audio sample rate"""
        return self._sample_rate

    @abstractmethod
    async def process_audio(
        self, audio_frames: bytes, **kwargs: Any
    ) -> None:
        """
        Process audio frames and detect voice activity.

        Args:
            audio_frames: Raw audio bytes to process
            **kwargs: Additional provider-specific arguments

        Returns:
            None; detected activity is presumably reported via the callback
            registered with on_vad_event() — confirm against implementations.
        """
        raise NotImplementedError

    async def aclose(self) -> None:
        """Cleanup resources"""
        logger.info(f"Cleaning up VAD: {self.label}")
        # Drop the callback so no further events are delivered after close.
        self._vad_callback = None
        try:
            import gc

            gc.collect()
            logger.info(f"VAD garbage collection completed: {self.label}")
        except Exception as e:
            logger.error(f"Error during VAD garbage collection: {e}")
        logger.info(f"VAD cleanup completed: {self.label}")

    async def __aenter__(self) -> VAD:
        # Supports `async with provider:` usage.
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self.aclose()

    def on_vad_event(self, callback: Callable[[VADResponse], Awaitable[None]]) -> None:
        """Set callback for receiving VAD events"""
        self._vad_callback = callback
Ancestors
- EventEmitter
- typing.Generic
Instance variables
prop label : str-
Expand source code
@property
def label(self) -> str:
    """Get the VAD provider label"""
    return self._label
prop sample_rate : int-
Expand source code
@property
def sample_rate(self) -> int:
    """Get audio sample rate"""
    return self._sample_rate
Methods
async def aclose(self) ‑> None-
Expand source code
async def aclose(self) -> None:
    """Cleanup resources"""
    logger.info(f"Cleaning up VAD: {self.label}")
    # Drop the callback so no further events are delivered after close.
    self._vad_callback = None
    try:
        import gc

        gc.collect()
        logger.info(f"VAD garbage collection completed: {self.label}")
    except Exception as e:
        logger.error(f"Error during VAD garbage collection: {e}")
    logger.info(f"VAD cleanup completed: {self.label}")
def on_vad_event(self,
callback: Callable[[VADResponse], Awaitable[None]]) ‑> None-
Expand source code
def on_vad_event(self, callback: Callable[[VADResponse], Awaitable[None]]) -> None:
    """Set callback for receiving VAD events"""
    self._vad_callback = callback
async def process_audio(self, audio_frames: bytes, **kwargs: Any) ‑> None-
Expand source code
@abstractmethod
async def process_audio(
    self, audio_frames: bytes, **kwargs: Any
) -> None:
    """
    Process audio frames and detect voice activity.

    Args:
        audio_frames: Raw audio bytes to process
        **kwargs: Additional provider-specific arguments

    Returns:
        None; detected activity is presumably reported via the callback
        registered with on_vad_event() — confirm against implementations.
    """
    raise NotImplementedError
Args
audio_frames- Iterator of audio frames to process
**kwargs- Additional provider-specific arguments
Returns
AsyncIterator yielding VADResponse objects
Inherited members
class VADEventType (*args, **kwds)-
Expand source code
class VADEventType(str, Enum):
    """VAD event kinds; str-valued so they serialize naturally."""

    START_OF_SPEECH = "start_of_speech"  # speech onset detected
    END_OF_SPEECH = "end_of_speech"      # speech offset detected
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
Ancestors
- builtins.str
- enum.Enum
Class variables
var END_OF_SPEECHvar START_OF_SPEECH
class VADResponse (**data: Any)-
Expand source code
class VADResponse(BaseModel):
    """Response from VAD processing"""

    # Which VAD event occurred (start/end of speech).
    event_type: VADEventType
    # Event payload; see VADData for the fields it carries.
    data: VADData
    # Optional provider-specific extras.
    metadata: Optional[dict[str, Any]] = None
Create a new model by parsing and validating input data from keyword arguments.
Raises [ValidationError][pydantic_core.ValidationError] if the input data cannot be validated to form a valid model.
`self` is explicitly positional-only to allow `self` as a field name.
Ancestors
- pydantic.main.BaseModel
Class variables
var data : VADDatavar event_type : VADEventTypevar metadata : dict[str, typing.Any] | Nonevar model_config
class VoiceMailDetector (llm: LLM,
callback: Callable,
duration: float = 2.0,
custom_prompt: str | None = None)-
Expand source code
class VoiceMailDetector:
    """
    Detects if the initial speech is a voicemail/answering machine.
    """

    SYSTEM_PROMPT = """You are a voicemail detection classifier for an OUTBOUND calling system. A bot has called a phone number and you need to determine if a human answered or if the call went to voicemail based on the provided text. Answer in one word yes or no."""

    def __init__(self, llm: LLM, callback: Callable, duration: float = 2.0, custom_prompt: Optional[str] = None) -> None:
        """
        Args:
            llm: The LLM instance to use for classification.
            callback: Callback function to run if voicemail is detected.
            duration: Time in seconds to buffer speech before checking (default 2.0s).
            custom_prompt: Optional system prompt that replaces SYSTEM_PROMPT.
        """
        self.llm = llm
        self.duration = duration
        self.CUSTOM_PROMPT = custom_prompt
        self.callback = callback

    async def detect(self, transcript: str) -> bool:
        """Classify the given transcript using the LLM to determine if it is a voicemail greeting."""
        # Nothing to classify: blank input is never treated as voicemail.
        if not (transcript and transcript.strip()):
            return False
        try:
            context = ChatContext()
            system_prompt = self.CUSTOM_PROMPT or self.SYSTEM_PROMPT
            context.add_message(ChatRole.SYSTEM, system_prompt)
            context.add_message(ChatRole.USER, f"Text to classify: {transcript}")
            # Accumulate the streamed LLM reply, then normalize it.
            pieces = []
            async for chunk in self.llm.chat(context):
                if chunk.content:
                    pieces.append(chunk.content)
            result = "".join(pieces).strip().lower()
            logger.info(f"Voice Mail Detection Result: '{result}' for text: '{transcript}'")
            return "yes" in result
        except Exception as e:
            # Best effort: on any LLM failure, assume a human answered.
            logger.error(f"Error during voice mail detection: {e}")
            return False
Args
llm- The LLM instance to use for classification.
callback- Callback function to run if voicemail is detected.
duration- Time in seconds to buffer speech before checking (default 2.0s).
Class variables
var SYSTEM_PROMPT
Methods
async def detect(self, transcript: str) ‑> bool-
Expand source code
async def detect(self, transcript: str) -> bool:
    """Classify the given transcript using the LLM to determine if it is a voicemail greeting."""
    # Blank input is never treated as voicemail.
    if not (transcript and transcript.strip()):
        return False
    try:
        context = ChatContext()
        prompt = self.CUSTOM_PROMPT or self.SYSTEM_PROMPT
        context.add_message(ChatRole.SYSTEM, prompt)
        context.add_message(ChatRole.USER, f"Text to classify: {transcript}")
        pieces = []
        async for chunk in self.llm.chat(context):
            if chunk.content:
                pieces.append(chunk.content)
        result = "".join(pieces).strip().lower()
        logger.info(f"Voice Mail Detection Result: '{result}' for text: '{transcript}'")
        return "yes" in result
    except Exception as e:
        # Best effort: on any LLM failure, assume a human answered.
        logger.error(f"Error during voice mail detection: {e}")
        return False
class WebRTCConfig (signaling_url: str | None = None,
signaling_type: str = 'websocket',
ice_servers: list | None = None)-
Expand source code
@dataclass
class WebRTCConfig:
    """Configuration for WebRTC transport including signaling and ICE server settings."""

    # Signaling server endpoint; None presumably means signaling is configured elsewhere.
    signaling_url: Optional[str] = None
    # Signaling mechanism identifier.
    signaling_type: str = "websocket"
    # ICE servers; defaults to Google's public STUN server when omitted.
    ice_servers: Optional[list] = None

    def __post_init__(self):
        # Fall back to a public STUN server when no ICE servers are supplied.
        self.ice_servers = (
            [{"urls": "stun:stun.l.google.com:19302"}]
            if self.ice_servers is None
            else self.ice_servers
        )
Instance variables
var ice_servers : list | Nonevar signaling_type : strvar signaling_url : str | None
class WebSocketConfig (port: int = 8080, path: str = '/ws')-
Expand source code
@dataclass
class WebSocketConfig:
    """Configuration for WebSocket transport including port and endpoint path."""

    # TCP port the WebSocket server listens on.
    port: int = 8080
    # URL path of the WebSocket endpoint.
    path: str = "/ws"
Instance variables
var path : strvar port : int
class Worker (options: WorkerOptions,
default_room_options: RoomOptions | None = None)-
Expand source code
class Worker:
    """
    VideoSDK worker that manages job execution and backend registration.

    Automatically selects the appropriate executor type based on platform.
    """

    def __init__(
        self, options: WorkerOptions, default_room_options: Optional[RoomOptions] = None
    ):
        """Initialize the worker."""
        self.options = options
        self.default_room_options = default_room_options
        # Lifecycle flags: _shutdown stops loops, _draining refuses new jobs.
        self._shutdown = False
        self._draining = False
        self._worker_load = 0.0
        # Jobs currently owned by this worker, keyed by job id.
        self._current_jobs: Dict[str, RunningJobInfo] = {}
        self._tasks: Set[asyncio.Task] = set()
        self.backend_connection: Optional[BackendConnection] = None
        self.process_manager: Optional[TaskExecutor] = (
            None  # Changed from ProcessManager
        )
        self._http_server: Optional[HttpServer] = None
        # Add debounce mechanism for status updates
        self._last_status_update = 0.0
        self._status_update_debounce_seconds = (
            2.0  # Minimum 2 seconds between status updates
        )
        # Initialize tracing
        self._tracing = Tracing.with_handle("worker")
        self._worker_load_graph = Tracing.add_graph(
            title="worker_load",
            x_label="time",
            y_label="load",
            x_type="time",
            y_range=(0, 1),
            max_data_points=1000,
        )
        # Validate configuration
        if not self.options.auth_token:
            raise ValueError(
                "auth_token is required, or add VIDEOSDK_AUTH_TOKEN in your environment"
            )

    @staticmethod
    def run_worker(
        options: WorkerOptions, default_room_options: Optional[RoomOptions] = None
    ):
        """
        Run a VideoSDK worker with the given options.

        This is the main entry point for running a VideoSDK worker, providing
        a high-level interface for worker initialization, job management, and
        lifecycle control.

        Args:
            options: Worker configuration options
            default_room_options: Optional default room options

        Example:
            ```python
            from videosdk.agents import Worker, WorkerOptions

            def my_agent(job_ctx):
                # Your agent code here
                pass

            # Configure worker with custom log level - logging is automatically configured!
            options = WorkerOptions(
                entrypoint_fnc=my_agent,
                log_level="DEBUG"  # Options: DEBUG, INFO, WARNING, ERROR
            )

            # Run the worker - no manual logging setup needed!
            Worker.run_worker(options)
            ```
        """
        worker = Worker(options, default_room_options=default_room_options)
        # Own a fresh event loop so signal handling and shutdown are fully controlled here.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        async def main_task():
            try:
                await worker.initialize()
                if options.register:
                    # Backend registration mode
                    await worker._run_backend_mode()
                else:
                    # Default mode - just keep alive
                    while not worker._shutdown:
                        await asyncio.sleep(1)
            except asyncio.CancelledError:
                logger.info("Main task cancelled")
            except Exception as e:
                logger.error(f"Worker error: {e}")
                raise
            finally:
                await worker.shutdown()

        main_future = loop.create_task(main_task())
        shutting_down = False

        def signal_handler(signum, frame):
            nonlocal shutting_down
            if shutting_down:
                # If already shutting down, cancel all tasks more aggressively
                for task in asyncio.all_tasks(loop):
                    task.cancel()
                return
            shutting_down = True
            logger.info(f"Received signal {signum}. Initiating graceful shutdown...")
            # Cancel the main task
            loop.call_soon_threadsafe(main_future.cancel)
            # Set a timeout for graceful shutdown
            loop.call_later(
                3.0, lambda: [task.cancel() for task in asyncio.all_tasks(loop)]
            )

        try:
            signal.signal(signal.SIGINT, signal_handler)
            signal.signal(signal.SIGTERM, signal_handler)
            loop.run_until_complete(main_future)
        except KeyboardInterrupt:
            logger.info("Keyboard interrupt received")
            if not shutting_down:
                shutting_down = True
                if not main_future.done():
                    main_future.cancel()
                loop.run_until_complete(worker.shutdown())
        finally:
            try:
                loop.close()
            except Exception as e:
                logger.error(f"Error closing event loop: {e}")
            if loop.is_closed():
                logger.info("Event loop closed successfully")

    async def initialize(self, default_room_options: Optional[RoomOptions] = None):
        """Initialize the worker."""
        # NOTE(review): the default_room_options parameter is accepted but not
        # used in this body — confirm whether it is intentional or vestigial.
        logger.info("Initializing VideoSDK worker")
        # Initialize task executor with new execution architecture
        # Convert ExecutorType to ResourceType
        resource_type = (
            ResourceType.THREAD
            if self.options.executor_type == ExecutorType.THREAD
            else ResourceType.PROCESS
        )
        config = ResourceConfig(
            resource_type=resource_type,
            num_idle_resources=self.options.num_idle_processes,
            max_resources=self.options.max_processes,
            initialize_timeout=self.options.initialize_timeout,
            close_timeout=self.options.close_timeout,
            memory_warn_mb=self.options.memory_warn_mb,
            memory_limit_mb=self.options.memory_limit_mb,
            ping_interval=self.options.ping_interval,
            load_threshold=self.options.load_threshold,
            max_concurrent_tasks=1,  # Each resource handles one task at a time
            executor_type=self.options.executor_type,
            # Legacy IPC compatibility - dedicated inference process
            use_dedicated_inference_process=False,  # Disable dedicated inference process for now
            inference_process_timeout=30.0,  # Longer timeout for AI model loading
            inference_memory_warn_mb=1000.0,  # Higher threshold for AI models
        )
        self.process_manager = TaskExecutor(config)
        await self.process_manager.start()
        # Initialize backend connection if registering
        if self.options.register:
            await self._initialize_backend_connection()
        # Initialize and start debug HTTP server
        self._http_server = HttpServer(
            host=self.options.host,
            port=self.options.port,
        )
        self._http_server.set_worker(self)
        await self._http_server.start()
        logger.info("VideoSDK worker initialized successfully")

    async def _initialize_backend_connection(self):
        """Initialize connection to the backend registry."""
        if not self.options.register:
            return
        # Fetch agent init config to get registry URL
        try:
            logger.info("Fetching agent init config...")
            registry_url = await fetch_agent_init_config(
                auth_token=self.options.auth_token,
                api_base_url=f"https://{self.options.signaling_base_url}",
            )
            logger.info(f"Using registry URL: {registry_url}")
        except Exception as e:
            logger.error(f"Failed to fetch agent init config: {e}")
            raise RuntimeError(f"Agent init config is mandatory. Error: {e}")
        self.backend_connection = BackendConnection(
            auth_token=self.options.auth_token,
            agent_id=self.options.agent_id,
            worker_type=self.options.worker_type.value,
            max_retry=self.options.max_retry,
            backend_url=registry_url,
            load_threshold=self.options.load_threshold,
            max_processes=self.options.max_processes,
        )
        # Set up message handlers
        self.backend_connection.on_register(self._handle_register)
        self.backend_connection.on_availability(self._handle_availability)
        self.backend_connection.on_assignment(self._handle_assignment)
        self.backend_connection.on_termination(self._handle_termination)
        # Connect to backend
        await self.backend_connection.connect()

    async def _run_backend_mode(self):
        """Run the worker in backend registration mode."""
        logger.info("Running in backend registration mode")
        # Start status update loop
        status_task = asyncio.create_task(self._status_update_loop())
        self._tasks.add(status_task)
        try:
            # Keep the worker running
            while not self._shutdown:
                await asyncio.sleep(1)
        finally:
            status_task.cancel()
            self._tasks.discard(status_task)

    def _handle_register(self, worker_id: str, server_info: Dict[str, Any]):
        """Handle registration response from backend."""
        logger.info(f"Registered with backend: {worker_id}")
        logger.info(f"Server info: {server_info}")

    def _handle_availability(self, request: AvailabilityRequest):
        """Handle availability request from backend."""
        logger.info(f"Received availability request for job {request.job_id}")
        asyncio.create_task(self._answer_availability(request))

    async def _answer_availability(self, request: AvailabilityRequest):
        """Answer availability request."""
        try:
            # Check if we can accept the job
            can_accept = (
                not self._draining
                and self._worker_load < self.options.load_threshold
                and len(self._current_jobs) < self.options.max_processes
            )
            if can_accept:
                # Accept the job and provide our auth token
                response = AvailabilityResponse(
                    job_id=request.job_id,
                    available=True,
                    token=self.options.auth_token,  # Provide worker's auth token
                )
                logger.info(f"Accepting job {request.job_id}")
            else:
                # Reject the job
                response = AvailabilityResponse(
                    job_id=request.job_id,
                    available=False,
                    error="Worker at capacity or draining",
                )
                logger.info(f"Rejecting job {request.job_id}")
            # Send response
            await self.backend_connection.send_message(response)
        except Exception as e:
            logger.error(f"Error handling availability request: {e}")
            # Send rejection on error
            response = AvailabilityResponse(
                job_id=request.job_id,
                available=False,
                error=str(e),
            )
            await self.backend_connection.send_message(response)

    def _handle_assignment(self, assignment: JobAssignment):
        """Handle job assignment from backend."""
        logger.info(f"Received job assignment: {assignment.job_id}")
        asyncio.create_task(self._handle_job_assignment(assignment))

    async def _handle_job_assignment(self, assignment: JobAssignment):
        """Handle job assignment."""
        try:
            # Create job accept arguments
            args = JobAcceptArguments(
                identity=f"agent_{assignment.job_id}",
                name=self.options.agent_id,
                metadata="",
            )
            # Launch the job
            await self._launch_job_from_assignment(assignment, args)
        except Exception as e:
            logger.error(f"Error handling job assignment: {e}")
            # Send job update with error
            job_update = JobUpdate(
                job_id=assignment.job_id,
                status="failed",
                error=str(e),
            )
            await self.backend_connection.send_message(job_update)

    async def _handle_termination(self, termination: JobTermination):
        """Handle job termination request."""
        logger.info(f"Received job termination: {termination.job_id}")
        if termination.job_id in self._current_jobs:
            job_info = self._current_jobs[termination.job_id]
            try:
                await job_info.job.shutdown()
                logger.info(f"Successfully terminated job {termination.job_id}")
            except Exception as e:
                logger.error(f"Error terminating job {termination.job_id}: {e}")
            # Remove job from current jobs
            del self._current_jobs[termination.job_id]
            logger.info(
                f"Removed job {termination.job_id} from current jobs. Remaining jobs: {len(self._current_jobs)}"
            )
            # Notify registry about job completion
            if self.backend_connection and self.backend_connection.is_connected:
                try:
                    job_update = JobUpdate(
                        job_id=termination.job_id,
                        status="completed",
                        error="Job terminated by registry",
                    )
                    await self.backend_connection.send_message(job_update)
                    logger.info(
                        f"Sent job completion update for terminated job {termination.job_id}"
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to send job completion update for terminated job {termination.job_id}: {e}"
                    )
            # IMMEDIATELY send status update to reflect reduced job count
            # This bypasses the debounce mechanism to ensure registry gets correct info
            await self._send_immediate_status_update()
        else:
            logger.warning(
                f"Job {termination.job_id} not found in current jobs for termination"
            )

    async def _handle_meeting_end(self, job_id: str, reason: str = "meeting_ended"):
        """Handle meeting end/leave events and inform registry."""
        logger.info(f"Meeting ended for job {job_id}, reason: {reason}")
        logger.info(
            f"Checking if job {job_id} is in current_jobs: {job_id in self._current_jobs}"
        )
        logger.info(f"Current jobs: {list(self._current_jobs.keys())}")
        if job_id in self._current_jobs:
            # Remove job from worker's current jobs
            job_info = self._current_jobs.pop(job_id, None)
            if job_info:
                logger.info(
                    f"Removed job {job_id} from worker's current jobs. Remaining jobs: {len(self._current_jobs)}"
                )
            # Inform registry about job completion
            if self.backend_connection and self.backend_connection.is_connected:
                try:
                    job_update = JobUpdate(
                        job_id=job_id,
                        status="completed",
                        error=f"Meeting ended: {reason}",
                    )
                    await self.backend_connection.send_message(job_update)
                    logger.info(
                        f"Sent job completion update to registry for job {job_id}"
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to send job completion update to registry: {e}"
                    )
            # IMMEDIATELY send status update to reflect reduced job count
            # This bypasses the debounce mechanism to ensure registry gets correct info
            await self._send_immediate_status_update()
        else:
            logger.warning(f"Job {job_id} not found in current jobs when meeting ended")

    async def _send_immediate_status_update(self):
        """Send an immediate status update, bypassing debounce mechanism."""
        if not self.backend_connection or not self.backend_connection.is_connected:
            return
        try:
            # Calculate current load
            job_count = len(self._current_jobs)
            load = min(job_count / self.options.max_processes, 1.0)
            self._worker_load = load
            logger.info(
                f"Sending immediate status update - job_count: {job_count}, load: {load}, max_processes: {self.options.max_processes}"
            )
            # Log the actual job IDs for debugging
            if job_count > 0:
                job_ids = list(self._current_jobs.keys())
                logger.info(f"Active job IDs: {job_ids}")
            else:
                logger.info("No active jobs")
            # Send status update
            status_msg = WorkerMessage(
                type="status_update",
                worker_id=self.backend_connection.worker_id,
                agent_name=self.options.agent_id,
                status="available" if not self._draining else "draining",
                load=load,
                job_count=job_count,
            )
            await self.backend_connection.send_message(status_msg)
            logger.info("Immediate status update sent successfully")
        except Exception as e:
            logger.error(f"Error sending immediate status update: {e}")

    def setup_meeting_event_handlers(self, job_context, job_id: str):
        """Set up meeting event handlers for a specific job."""
        if not job_context.room:
            logger.warning(
                f"Cannot set up meeting handlers for job {job_id}: room not available"
            )
            # Set up a delayed handler setup that will be called when room becomes available
            original_connect = job_context.connect

            def delayed_handler_setup():
                if job_context.room:
                    self._setup_meeting_event_handlers_impl(job_context, job_id)
                else:
                    logger.warning(
                        f"Room still not available for job {job_id} after connect"
                    )

            # Override connect method to set up handlers after room is created
            async def connect_with_handlers():
                result = await original_connect()
                delayed_handler_setup()
                return result

            job_context.connect = connect_with_handlers
            logger.info(f"Set up delayed meeting event handlers for job {job_id}")
            return
        # Room is available, set up handlers immediately
        self._setup_meeting_event_handlers_impl(job_context, job_id)

    def _setup_meeting_event_handlers_impl(self, job_context, job_id: str):
        """Internal method to set up the actual meeting event handlers."""
        if not job_context.room:
            logger.warning(f"Room not available for job {job_id} in handler setup")
            return
        # Store original event handler
        original_on_meeting_left = job_context.room.on_meeting_left

        # Create wrapper that calls original and then handles cleanup
        def on_meeting_left_wrapper(data=None):
            # Call original handler first
            if original_on_meeting_left and callable(original_on_meeting_left):
                try:
                    # Call as a method with self bound
                    import inspect

                    sig = inspect.signature(original_on_meeting_left)
                    # Check if it's a bound method or function
                    if hasattr(original_on_meeting_left, "__self__"):
                        # It's a bound method
                        if len(sig.parameters) > 1:
                            # self + data
                            original_on_meeting_left(data)
                        else:
                            # just self
                            original_on_meeting_left()
                    else:
                        # It's a function
                        if len(sig.parameters) > 0:
                            original_on_meeting_left(data)
                        else:
                            original_on_meeting_left()
                except Exception as e:
                    logger.warning(f"Error calling original on_meeting_left: {e}")
            # Handle meeting end for this specific job
            logger.info(f"Meeting left event - triggering job cleanup for {job_id}")
            asyncio.create_task(self._handle_meeting_end(job_id, "meeting_left"))

        # Replace the handler with our wrapper
        job_context.room.on_meeting_left = on_meeting_left_wrapper
        logger.info(f"Set up meeting end handler for job {job_id}")

    async def _launch_job_from_assignment(
        self, assignment: JobAssignment, args: JobAcceptArguments
    ):
        """Launch a job from backend assignment."""
        try:
            # Use assignment token if available, otherwise fall back to worker's auth token
            auth_token = (
                assignment.token if assignment.token else self.options.auth_token
            )
            # Create room options from assignment (this was already done in _handle_job_assignment)
            # NOTE(review): this dereferences self.default_room_options unconditionally —
            # confirm a default_room_options is always supplied in register mode.
            room_options = RoomOptions(
                room_id=assignment.room_id,
                name=self.default_room_options.name,
                auth_token=auth_token,
                signaling_base_url=self.options.signaling_base_url,
                recording=self.default_room_options.recording,
                recording_options=getattr(
                    self.default_room_options, "recording_options", None
                ),
                background_audio=self.default_room_options.background_audio,
                agent_participant_id=self.default_room_options.agent_participant_id,
                join_meeting=self.default_room_options.join_meeting,
                auto_end_session=self.default_room_options.auto_end_session,
                session_timeout_seconds=self.default_room_options.session_timeout_seconds,
                send_logs_to_dashboard=self.default_room_options.send_logs_to_dashboard,
                dashboard_log_level=self.default_room_options.dashboard_log_level,
                traces=self.default_room_options.traces,
                metrics=self.default_room_options.metrics,
                logs=self.default_room_options.logs,
            )
            # Apply RoomOptions from assignment if provided
            if assignment.room_options:
                logger.info(
                    f"Received room_options from assignment: {assignment.room_options}"
                )
                if "auto_end_session" in assignment.room_options:
                    room_options.auto_end_session = assignment.room_options[
                        "auto_end_session"
                    ]
                    logger.info(
                        f"Set auto_end_session: {room_options.auto_end_session}"
                    )
                if "session_timeout_seconds" in assignment.room_options:
                    room_options.session_timeout_seconds = assignment.room_options[
                        "session_timeout_seconds"
                    ]
                    logger.info(
                        f"Set session_timeout_seconds: {room_options.session_timeout_seconds}"
                    )
                if "playground" in assignment.room_options:
                    room_options.playground = assignment.room_options["playground"]
                    logger.info(f"Set playground: {room_options.playground}")
                if "vision" in assignment.room_options:
                    room_options.vision = assignment.room_options["vision"]
                    logger.info(f"Set vision: {room_options.vision}")
                if "join_meeting" in assignment.room_options:
                    room_options.join_meeting = assignment.room_options["join_meeting"]
                    logger.info(f"Set join_meeting: {room_options.join_meeting}")
                if "recording" in assignment.room_options:
                    room_options.recording = assignment.room_options["recording"]
                    logger.info(f"Set recording: {room_options.recording}")
                if "recording_options" in assignment.room_options:
                    ro = assignment.room_options["recording_options"]
                    if isinstance(ro, dict):
                        room_options.recording_options = _coerce_recording_options_dict(ro)
                    else:
                        room_options.recording_options = ro
                    logger.info(f"Set recording_options: {room_options.recording_options}")
                if "background_audio" in assignment.room_options:
                    room_options.background_audio = assignment.room_options[
                        "background_audio"
                    ]
                    logger.info(
                        f"Set background_audio: {room_options.background_audio}"
                    )
                if "agent_participant_id" in assignment.room_options:
                    room_options.agent_participant_id = assignment.room_options[
                        "agent_participant_id"
                    ]
                    logger.info(
                        f"Set agent_participant_id: {room_options.agent_participant_id}"
                    )
                if "traces" in assignment.room_options:
                    room_options.traces = assignment.room_options["traces"]
                    logger.info(f"Set traces options via worker assignment")
                if "metrics" in assignment.room_options:
                    room_options.metrics = assignment.room_options["metrics"]
                    logger.info(f"Set metrics options via worker assignment")
                if "logs" in assignment.room_options:
                    room_options.logs = assignment.room_options["logs"]
                    logger.info(f"Set logs options via worker assignment")
            else:
                logger.warning("No room_options received from assignment")
            # Create job context
            job_context = JobContext(
                room_options=room_options,
                metadata=assignment.metadata,
            )
            # Create running job info with correct parameters
            job_info = RunningJobInfo(
                accept_arguments=args,
                job=job_context,
                url=assignment.url,
                token=auth_token,
                worker_id=self.backend_connection.worker_id,
            )
            # Store job info BEFORE executing entrypoint
            self._current_jobs[assignment.job_id] = job_info
            logger.info(
                f"Added job {assignment.job_id} to worker's current jobs. Total jobs: {len(self._current_jobs)}"
            )
            # Execute the job using the process manager
            logger.info(f"Executing job {assignment.job_id} via process manager")
            if not self.process_manager:
                raise RuntimeError("Task executor not initialized")
            # If wait=True, create a cross-process event to wait for meeting join
            # before sending the "running" status to registry
            import multiprocessing

            mp_meeting_joined_event = None
            logger.info(
                f"Job {assignment.job_id}: assignment.wait={assignment.wait} (type={type(assignment.wait).__name__})"
            )
            if assignment.wait:
                manager = multiprocessing.Manager()
                mp_meeting_joined_event = manager.Event()
                logger.info(
                    f"Job {assignment.job_id}: wait=True, deferring 'running' status until meeting joined"
                )
            else:
                # Send immediate "running" status when wait is not requested
                job_update = JobUpdate(
                    job_id=assignment.job_id,
                    status="running",
                )
                await self.backend_connection.send_message(job_update)
            try:
                # Execute in separate process/thread
                # Pass arguments positionally to ensure they are captured by *args in TaskExecutor
                timeout_val = (
                    assignment.timeout
                    if hasattr(assignment, "timeout") and assignment.timeout
                    else 3600.0
                )
                # Start execution as a background task so we can await meeting join in parallel
                execute_task = asyncio.create_task(
                    self.process_manager.execute(
                        _execute_job_entrypoint,  # entrypoint
                        TaskType.JOB,  # task_type
                        timeout_val,  # timeout
                        0,  # retry_count (entrypoint errors are not retriable)
                        0,  # priority
                        self.options.entrypoint_fnc,  # arg1
                        room_options,  # arg2
                        assignment.metadata,  # arg3
                        assignment.wait,  # arg4: wait_for_meeting_join
                        mp_meeting_joined_event,  # arg5: cross-process event
                    )
                )
                # If wait=True, poll the multiprocessing event until meeting joins or execution fails
                if mp_meeting_joined_event is not None:
                    meeting_join_timeout = 30.0
                    poll_start = asyncio.get_event_loop().time()
                    joined = False
                    while not joined:
                        if mp_meeting_joined_event.is_set():
                            joined = True
                            break
                        if execute_task.done():
                            # Entrypoint finished/failed before meeting join
                            break
                        if (
                            asyncio.get_event_loop().time() - poll_start
                        ) > meeting_join_timeout:
                            logger.warning(
                                f"Job {assignment.job_id}: timeout waiting for meeting join, sending 'running' anyway"
                            )
                            break
                        await asyncio.sleep(0.1)
                    if joined:
                        logger.info(
                            f"Job {assignment.job_id}: meeting joined, sending 'running' status"
                        )
                    if joined:
                        job_update = JobUpdate(
                            job_id=assignment.job_id,
                            status="running",
                        )
                        await self.backend_connection.send_message(job_update)
                    elif execute_task.done():
                        # Entrypoint failed before meeting join — let the result
                        # handler below send the final status to avoid duplicates
                        logger.warning(
                            f"Job {assignment.job_id}: entrypoint failed before meeting join"
                        )
                    else:
                        # Timeout waiting for meeting join but task still running
                        job_update = JobUpdate(
                            job_id=assignment.job_id,
                            status="running",
                        )
                        await self.backend_connection.send_message(job_update)
                result = await execute_task
                logger.info(
                    f"Job {assignment.job_id} execution completed: {result.status}"
                )
                if result.status.value == "completed":
                    final_update = JobUpdate(
                        job_id=assignment.job_id,
                        status="completed",
                    )
                    await self.backend_connection.send_message(final_update)
                else:
                    final_update = JobUpdate(
                        job_id=assignment.job_id,
                        status="failed",
                        error=result.error or "Unknown error during execution",
                    )
                    await self.backend_connection.send_message(final_update)
            except Exception as execution_error:
                logger.error(
                    f"Execution failed for job {assignment.job_id}: {execution_error}"
                )
                error_update = JobUpdate(
                    job_id=assignment.job_id,
                    status="failed",
                    error=str(execution_error),
                )
                await self.backend_connection.send_message(error_update)
            self._current_jobs.pop(assignment.job_id, None)
            logger.info(f"Removed job {assignment.job_id} from current jobs")
            # Clean up the multiprocessing manager if we created one
            if mp_meeting_joined_event is not None:
                try:
                    manager.shutdown()
                except Exception:
                    pass
            await self._send_immediate_status_update()
        except Exception as e:
            logger.error(f"Error launching job {assignment.job_id}: {e}")
            # Send error update
            job_update = JobUpdate(
                job_id=assignment.job_id,
                status="failed",
                error=str(e),
            )
            await self.backend_connection.send_message(job_update)
            # Remove job from current jobs since it failed to launch
            self._current_jobs.pop(assignment.job_id, None)
            logger.info(f"Removed failed job {assignment.job_id} from current jobs")
            # Send immediate status update to reflect reduced job count
            await self._send_immediate_status_update()

    def setup_session_end_callback(self, job_context, job_id: str):
        """Set up session end callback for automatic session ending."""
        if not job_context.room:
            logger.warning(
                f"Cannot set up session end callback for job {job_id}: room not available"
            )
            # Set up a delayed callback setup that will be called when room becomes available
            original_connect = job_context.connect

            def delayed_callback_setup():
                if job_context.room:
                    self._setup_session_end_callback_impl(job_context, job_id)
                else:
                    logger.warning(
                        f"Room still not available for job {job_id} after connect"
                    )

            # Override connect method to set up callback after room is created
            async def connect_with_callback():
                result = await original_connect()
                delayed_callback_setup()
                return result

            job_context.connect = connect_with_callback
            logger.info(f"Set up delayed session end callback for job {job_id}")
            return
        # Room is available, set up callback immediately
        self._setup_session_end_callback_impl(job_context, job_id)

    def _setup_session_end_callback_impl(self, job_context, job_id: str):
        """Internal method to set up the actual session end callback."""
        if not job_context.room:
            logger.warning(f"Room not available for job {job_id} in callback setup")
            return
        # Store original callback if it exists
        original_on_session_end = job_context.room.on_session_end

        def on_session_end_wrapper(reason: str):
            logger.info(f"Session ended for job {job_id}, reason: {reason}")
            # Call original callback if it exists
            if original_on_session_end:
                try:
                    original_on_session_end(reason)
                except Exception as e:
                    logger.error(f"Error in original session end callback: {e}")
            logger.info(f"Calling _handle_meeting_end for job {job_id}")
            # Handle meeting end asynchronously
            asyncio.create_task(
                self._handle_meeting_end(job_id, f"session_ended: {reason}")
            )

        # Set the wrapped session end callback
        job_context.room.on_session_end = on_session_end_wrapper
        logger.info(f"Session end callback set up for job {job_id}")

    async def _status_update_loop(self):
        """Periodic status update loop."""
        while not self._shutdown:
            try:
                await self._update_worker_status()
                await asyncio.sleep(self.options.ping_interval)
            except Exception as e:
                logger.error(f"Error in status update loop: {e}")
                await asyncio.sleep(5)  # Wait before retrying

    async def _update_worker_status(self):
        """Update worker status with backend."""
        if not self.backend_connection or not self.backend_connection.is_connected:
            return
        # Check debounce - don't send status updates too frequently
        current_time = time.time()
        if (
            current_time - self._last_status_update
            < self._status_update_debounce_seconds
        ):
            logger.debug("Skipping status update due to debounce")
            return
        try:
            # Calculate current load
            job_count = len(self._current_jobs)
            load = min(job_count / self.options.max_processes, 1.0)
            self._worker_load = load
            # Add detailed logging to track job count changes
            logger.info(
                f"Updating worker status - job_count: {job_count}, load: {load}, max_processes: {self.options.max_processes}"
            )
            # Log the actual job IDs for debugging
            if job_count > 0:
                job_ids = list(self._current_jobs.keys())
                logger.info(f"Active job IDs: {job_ids}")
            else:
                logger.info("No active jobs")
            # Send status update
            status_msg = WorkerMessage(
                type="status_update",
                worker_id=self.backend_connection.worker_id,
                agent_name=self.options.agent_id,  # Include agent_id
                status="available" if not self._draining else "draining",
                load=load,
                job_count=job_count,
            )
            await self.backend_connection.send_message(status_msg)
            # Update last status update time
            self._last_status_update = current_time
            # Update tracing
            self._worker_load_graph.add_point(load)
        except Exception as e:
            logger.error(f"Error updating worker status: {e}")

    async def execute_job(self, job_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute a job using the task executor."""
        if not self.process_manager:
            raise RuntimeError("Task executor not initialized")
        # Extract entrypoint function from job data
        entrypoint = job_data.get("entrypoint", self.options.entrypoint_fnc)
        # Execute using new task executor
        result = await self.process_manager.execute(
            entrypoint=entrypoint,
            task_type=TaskType.JOB,
            timeout=job_data.get("timeout", 300.0),
            retry_count=job_data.get("retry_count", 3),
            priority=job_data.get("priority", 0),
            *job_data.get("args", ()),
            **job_data.get("kwargs", {}),
        )
        # Convert TaskResult to expected format
        return {
            "status": result.status.value,
            "result": result.result,
            "error": result.error,
            "execution_time": result.execution_time,
            "task_id": result.task_id,
        }

    async def execute_inference(self, inference_data: Dict[str, Any]) -> Dict[str, Any]:
        """Execute an inference using the task executor."""
        if not self.process_manager:
            raise RuntimeError("Task executor not initialized")
        # Extract entrypoint function from inference data
        entrypoint = inference_data.get("entrypoint", self.options.entrypoint_fnc)
        # Execute using new task executor
        result = await self.process_manager.execute(
            entrypoint=entrypoint,
            task_type=TaskType.INFERENCE,
            timeout=inference_data.get("timeout", 300.0),
            retry_count=inference_data.get("retry_count", 3),
            priority=inference_data.get("priority", 0),
            *inference_data.get("args", ()),
            **inference_data.get("kwargs", {}),
        )
        # Convert TaskResult to expected format
        return {
            "status": result.status.value,
            "result": result.result,
            "error": result.error,
            "execution_time": result.execution_time,
            "task_id": result.task_id,
        }

    def get_stats(self) -> Dict[str, Any]:
        """Get worker statistics."""
        # Calculate current load dynamically
        job_count = len(self._current_jobs)
        current_load = min(job_count / self.options.max_processes, 1.0)
        stats = {
            "worker_load": current_load,
            "draining": self._draining,
            "current_jobs": job_count,
            "max_processes": self.options.max_processes,
            "agent_id": self.options.agent_id,
            "register": self.options.register,
        }
        if self.backend_connection:
            stats.update(
                {
                    "backend_connected": self.backend_connection.is_connected,
                    "worker_id": self.backend_connection.worker_id,
                }
            )
        if self.process_manager:
            try:
                process_stats = self.process_manager.get_stats()
                logger.debug(f"Process manager stats: {process_stats}")
                # Get current resource stats and dedicated inference status
                if "resource_stats" in process_stats:
                    stats["resource_stats"] = process_stats["resource_stats"]
                    logger.debug(f"Resource stats: {process_stats['resource_stats']}")
                if "dedicated_inference" in process_stats:
                    stats["dedicated_inference"] = process_stats["dedicated_inference"]
                # Also get current resource info for more detailed stats
                try:
                    resource_info = self.process_manager.get_resource_info()
                    logger.debug(
                        f"Resource info count: {len(resource_info) if resource_info else 0}"
                    )
                    if resource_info:
                        stats["resource_info"] = [
                            {
                                "resource_id": info.resource_id,
                                "resource_type": info.resource_type.value,
                                "status": info.status.value,
                                "current_load": info.current_load,
                                "memory_usage_mb": info.memory_usage_mb,
                                "cpu_usage_percent": info.cpu_usage_percent,
                                "active_tasks": info.active_tasks,
                                "total_tasks_processed": info.total_tasks_processed,
                                "last_heartbeat": info.last_heartbeat,
                                "metadata": info.metadata,
                            }
                            for info in resource_info
                        ]
                        # Add summary of resource status
                        # NOTE(review): these compare against plain strings ("IDLE",
                        # "DEDICATED_INFERENCE") while the list above uses .value —
                        # confirm the comparisons match the enum representation.
                        resource_summary = {
                            "total_resources": len(resource_info),
                            "available_resources": len(
                                [r for r in resource_info if r.status == "IDLE"]
                            ),
                            "active_resources": len(
                                [r for r in resource_info if r.status != "IDLE"]
                            ),
                            "dedicated_inference_active": any(
                                r.resource_type == "DEDICATED_INFERENCE"
                                and r.status != "IDLE"
                                for r in resource_info
                            ),
                        }
                        stats["resource_summary"] = resource_summary
                        logger.debug(f"Resource summary: {resource_summary}")
                except Exception as e:
                    logger.debug(f"Could not get detailed resource info: {e}")
            except Exception as e:
                logger.error(f"Error getting process manager stats: {e}")
                stats["resource_stats"] = {"error": str(e)}
                stats["dedicated_inference"] = None
        return stats

    async def drain(self, timeout: Optional[float] = None) -> None:
        """Drain the worker - wait for current jobs to finish before shutting down."""
        if self._draining:
            return
        logger.info("Draining VideoSDK worker")
        self._draining = True
        await self._update_worker_status()
        # Wait for current jobs to complete
        if self._current_jobs:
            logger.info(
                f"Waiting for {len(self._current_jobs)} active jobs to complete"
            )
            if timeout:
                try:
                    await asyncio.wait_for(self._wait_for_jobs(), timeout)
                except asyncio.TimeoutError:
                    logger.warning(
                        f"Timeout waiting for jobs to complete after {timeout}s"
                    )
            else:
                await self._wait_for_jobs()

    async def _wait_for_jobs(self) -> None:
        """Wait for all current jobs to complete."""
        while self._current_jobs:
            # Wait a bit and check again
            await asyncio.sleep(1)
            logger.info(f"Still waiting for {len(self._current_jobs)} jobs to complete")

    async def _cleanup_all_jobs(self):
        """Clean up all current jobs and notify registry."""
        if not self._current_jobs:
            return
        logger.info(f"Cleaning up {len(self._current_jobs)} jobs during shutdown")
        # Create a copy of jobs to iterate over, as they will be modified
        jobs_to_clean = list(self._current_jobs.items())
        for job_id, job_info in jobs_to_clean:
            try:
                logger.info(f"Terminating job {job_id}...")
                await job_info.job.shutdown()  # This calls job.shutdown()
                logger.info(f"Job {job_id} terminated successfully.")
            except Exception as e:
                logger.error(f"Error terminating job {job_id}: {e}")
            try:
                if self.backend_connection and self.backend_connection.is_connected:
                    job_update = JobUpdate(
                        job_id=job_id,
                        status="completed",
                        error="Worker shutdown",
                    )
                    await self.backend_connection.send_message(job_update)
                    logger.info(
                        f"Sent job completion update for job {job_id} during shutdown"
                    )
            except Exception as e:
                logger.error(f"Failed to send job completion update for {job_id}: {e}")
        # Clear all jobs from the worker's state
        self._current_jobs.clear()
        logger.info("All jobs cleared from worker")
        # Send a final status update reflecting zero jobs
        if self.backend_connection and self.backend_connection.is_connected:
            await self._send_immediate_status_update()

    async def shutdown(self):
        """Shutdown the worker."""
        logger.info("Shutting down VideoSDK worker")
        self._shutdown = True
        self._draining = True
        try:
            # Clean up all jobs first to ensure proper room cleanup
            await self._cleanup_all_jobs()
        except Exception as e:
            logger.error(f"Error during job cleanup: {e}")
        try:
            # Send final status update to registry
            if self.backend_connection and self.backend_connection.is_connected:
                try:
                    await self._update_worker_status()
                    logger.info("Sent final status update to registry")
                except Exception as e:
                    logger.warning(f"Failed to send final status update: {e}")
            # Disconnect from backend
            if self.backend_connection:
                logger.info("Disconnecting from backend")
                await self.backend_connection.disconnect()
        except Exception as e:
            logger.error(f"Error during backend cleanup: {e}")
        try:
            # Cancel all tasks
            for task in self._tasks:
                if not task.done():
                    task.cancel()
            # Wait briefly for tasks to complete
            if self._tasks:
                done, pending = await asyncio.wait(self._tasks, timeout=2.0)
                for task in pending:
                    task.cancel()
        except Exception as e:
            logger.error(f"Error during task cleanup: {e}")
        try:
            # Shutdown task executor
            if self.process_manager:
                await self.process_manager.stop()
        except Exception as e:
            logger.error(f"Error stopping process manager: {e}")
        try:
            # Stop debug HTTP server
            if self._http_server:
                await self._http_server.aclose()
        except Exception as e:
            logger.error(f"Error stopping HTTP server: {e}")
        logger.info("VideoSDK worker shutdown complete")

    async def __aenter__(self):
        """Async context manager entry."""
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.shutdown()
def run(self): job_context = functools.partial(self.job.jobctx) entrypoint = functools.partial(self.job.entrypoint) p = multiprocessing.Process( target=_job_runner, args=(entrypoint, job_context) Automatically selects the appropriate executor type based on platform.
Initialize the worker.
Static methods
def run_worker(options: WorkerOptions,
default_room_options: RoomOptions | None = None)-
Expand source code
@staticmethod
def run_worker(
    options: WorkerOptions, default_room_options: Optional[RoomOptions] = None
):
    """
    Run a VideoSDK worker with the given options.

    This is the main entry point for running a VideoSDK worker, providing
    a high-level interface for worker initialization, job management, and
    lifecycle control. Blocks the calling thread until the worker shuts down.

    Args:
        options: Worker configuration options
        default_room_options: Optional default room options

    Example:
        ```python
        from videosdk.agents import Worker, WorkerOptions

        def my_agent(job_ctx):
            # Your agent code here
            pass

        # Configure worker with custom log level - logging is automatically configured!
        options = WorkerOptions(
            entrypoint_fnc=my_agent,
            log_level="DEBUG"  # Options: DEBUG, INFO, WARNING, ERROR
        )

        # Run the worker - no manual logging setup needed!
        Worker.run_worker(options)
        ```
    """
    worker = Worker(options, default_room_options=default_room_options)
    # Own a dedicated event loop rather than reusing any ambient one, so
    # signal handling and shutdown stay under this function's control.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async def main_task():
        # Lifecycle driver: initialize, then run until shutdown/cancel.
        try:
            await worker.initialize()
            if options.register:
                # Backend registration mode
                await worker._run_backend_mode()
            else:
                # Default mode - just keep alive
                while not worker._shutdown:
                    await asyncio.sleep(1)
        except asyncio.CancelledError:
            logger.info("Main task cancelled")
        except Exception as e:
            logger.error(f"Worker error: {e}")
            raise
        finally:
            # Always attempt cleanup, even on cancellation or error.
            await worker.shutdown()

    main_future = loop.create_task(main_task())
    shutting_down = False

    def signal_handler(signum, frame):
        # First signal: graceful shutdown; second signal: cancel everything.
        nonlocal shutting_down
        if shutting_down:
            # If already shutting down, cancel all tasks more aggressively
            for task in asyncio.all_tasks(loop):
                task.cancel()
            return
        shutting_down = True
        logger.info(f"Received signal {signum}. Initiating graceful shutdown...")
        # Cancel the main task (thread-safe: signal handlers run outside the loop)
        loop.call_soon_threadsafe(main_future.cancel)
        # Set a timeout for graceful shutdown - after 3s, force-cancel all tasks
        loop.call_later(
            3.0, lambda: [task.cancel() for task in asyncio.all_tasks(loop)]
        )

    try:
        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)
        loop.run_until_complete(main_future)
    except KeyboardInterrupt:
        # Fallback path when SIGINT escapes the handler (e.g. very early Ctrl-C).
        logger.info("Keyboard interrupt received")
        if not shutting_down:
            shutting_down = True
            if not main_future.done():
                main_future.cancel()
            loop.run_until_complete(worker.shutdown())
    finally:
        try:
            loop.close()
        except Exception as e:
            logger.error(f"Error closing event loop: {e}")
        if loop.is_closed():
            logger.info("Event loop closed successfully")
This is the main entry point for running a VideoSDK worker, providing a high-level interface for worker initialization, job management, and lifecycle control.
Args
options – Worker configuration options
default_room_options – Optional default room options
Example
from videosdk.agents import Worker, WorkerOptions def my_agent(job_ctx): # Your agent code here pass # Configure worker with custom log level - logging is automatically configured! options = WorkerOptions( entrypoint_fnc=my_agent, log_level="DEBUG" # Options: DEBUG, INFO, WARNING, ERROR ) # Run the worker - no manual logging setup needed! Worker.run_worker(options)
Methods
async def drain(self, timeout: float | None = None) ‑> None-
Expand source code
async def drain(self, timeout: Optional[float] = None) -> None: """Drain the worker - wait for current jobs to finish before shutting down.""" if self._draining: return logger.info("Draining VideoSDK worker") self._draining = True await self._update_worker_status() # Wait for current jobs to complete if self._current_jobs: logger.info( f"Waiting for {len(self._current_jobs)} active jobs to complete" ) if timeout: try: await asyncio.wait_for(self._wait_for_jobs(), timeout) except asyncio.TimeoutError: logger.warning( f"Timeout waiting for jobs to complete after {timeout}s" ) else: await self._wait_for_jobs()Drain the worker - wait for current jobs to finish before shutting down.
async def execute_inference(self, inference_data: Dict[str, Any]) ‑> Dict[str, Any]-
Expand source code
async def execute_inference(self, inference_data: Dict[str, Any]) -> Dict[str, Any]: """Execute an inference using the task executor.""" if not self.process_manager: raise RuntimeError("Task executor not initialized") # Extract entrypoint function from inference data entrypoint = inference_data.get("entrypoint", self.options.entrypoint_fnc) # Execute using new task executor result = await self.process_manager.execute( entrypoint=entrypoint, task_type=TaskType.INFERENCE, timeout=inference_data.get("timeout", 300.0), retry_count=inference_data.get("retry_count", 3), priority=inference_data.get("priority", 0), *inference_data.get("args", ()), **inference_data.get("kwargs", {}), ) # Convert TaskResult to expected format return { "status": result.status.value, "result": result.result, "error": result.error, "execution_time": result.execution_time, "task_id": result.task_id, }Execute an inference using the task executor.
async def execute_job(self, job_data: Dict[str, Any]) ‑> Dict[str, Any]-
Expand source code
async def execute_job(self, job_data: Dict[str, Any]) -> Dict[str, Any]: """Execute a job using the task executor.""" if not self.process_manager: raise RuntimeError("Task executor not initialized") # Extract entrypoint function from job data entrypoint = job_data.get("entrypoint", self.options.entrypoint_fnc) # Execute using new task executor result = await self.process_manager.execute( entrypoint=entrypoint, task_type=TaskType.JOB, timeout=job_data.get("timeout", 300.0), retry_count=job_data.get("retry_count", 3), priority=job_data.get("priority", 0), *job_data.get("args", ()), **job_data.get("kwargs", {}), ) # Convert TaskResult to expected format return { "status": result.status.value, "result": result.result, "error": result.error, "execution_time": result.execution_time, "task_id": result.task_id, }Execute a job using the task executor.
def get_stats(self) ‑> Dict[str, Any]-
Expand source code
def get_stats(self) -> Dict[str, Any]:
    """Get worker statistics.

    Returns a snapshot dict with load, job counts and configuration, plus
    backend-connection and process-manager details when those components
    exist. Best-effort: failures while querying the process manager are
    logged and reported inside the dict rather than raised.
    """
    # Calculate current load dynamically: jobs per max processes, capped at 1.0
    job_count = len(self._current_jobs)
    current_load = min(job_count / self.options.max_processes, 1.0)
    stats = {
        "worker_load": current_load,
        "draining": self._draining,
        "current_jobs": job_count,
        "max_processes": self.options.max_processes,
        "agent_id": self.options.agent_id,
        "register": self.options.register,
    }
    if self.backend_connection:
        stats.update(
            {
                "backend_connected": self.backend_connection.is_connected,
                "worker_id": self.backend_connection.worker_id,
            }
        )
    if self.process_manager:
        try:
            process_stats = self.process_manager.get_stats()
            logger.debug(f"Process manager stats: {process_stats}")
            # Get current resource stats and dedicated inference status
            if "resource_stats" in process_stats:
                stats["resource_stats"] = process_stats["resource_stats"]
                logger.debug(f"Resource stats: {process_stats['resource_stats']}")
            if "dedicated_inference" in process_stats:
                stats["dedicated_inference"] = process_stats["dedicated_inference"]
            # Also get current resource info for more detailed stats
            try:
                resource_info = self.process_manager.get_resource_info()
                logger.debug(
                    f"Resource info count: {len(resource_info) if resource_info else 0}"
                )
                if resource_info:
                    stats["resource_info"] = [
                        {
                            "resource_id": info.resource_id,
                            "resource_type": info.resource_type.value,
                            "status": info.status.value,
                            "current_load": info.current_load,
                            "memory_usage_mb": info.memory_usage_mb,
                            "cpu_usage_percent": info.cpu_usage_percent,
                            "active_tasks": info.active_tasks,
                            "total_tasks_processed": info.total_tasks_processed,
                            "last_heartbeat": info.last_heartbeat,
                            "metadata": info.metadata,
                        }
                        for info in resource_info
                    ]
                    # Add summary of resource status.
                    # NOTE(review): the comparisons below test `r.status` and
                    # `r.resource_type` against plain strings, while the dict
                    # above uses `.value` — if these are enums, the string
                    # comparisons may never match. TODO confirm against the
                    # resource info type.
                    resource_summary = {
                        "total_resources": len(resource_info),
                        "available_resources": len(
                            [r for r in resource_info if r.status == "IDLE"]
                        ),
                        "active_resources": len(
                            [r for r in resource_info if r.status != "IDLE"]
                        ),
                        "dedicated_inference_active": any(
                            r.resource_type == "DEDICATED_INFERENCE"
                            and r.status != "IDLE"
                            for r in resource_info
                        ),
                    }
                    stats["resource_summary"] = resource_summary
                    logger.debug(f"Resource summary: {resource_summary}")
            except Exception as e:
                logger.debug(f"Could not get detailed resource info: {e}")
        except Exception as e:
            logger.error(f"Error getting process manager stats: {e}")
            stats["resource_stats"] = {"error": str(e)}
            stats["dedicated_inference"] = None
    return stats
async def initialize(self,
default_room_options: RoomOptions | None = None)-
Expand source code
async def initialize(self, default_room_options: Optional[RoomOptions] = None): """Initialize the worker.""" logger.info("Initializing VideoSDK worker") # Initialize task executor with new execution architecture # Convert ExecutorType to ResourceType resource_type = ( ResourceType.THREAD if self.options.executor_type == ExecutorType.THREAD else ResourceType.PROCESS ) config = ResourceConfig( resource_type=resource_type, num_idle_resources=self.options.num_idle_processes, max_resources=self.options.max_processes, initialize_timeout=self.options.initialize_timeout, close_timeout=self.options.close_timeout, memory_warn_mb=self.options.memory_warn_mb, memory_limit_mb=self.options.memory_limit_mb, ping_interval=self.options.ping_interval, load_threshold=self.options.load_threshold, max_concurrent_tasks=1, # Each resource handles one task at a time executor_type=self.options.executor_type, # Legacy IPC compatibility - dedicated inference process use_dedicated_inference_process=False, # Disable dedicated inference process for now inference_process_timeout=30.0, # Longer timeout for AI model loading inference_memory_warn_mb=1000.0, # Higher threshold for AI models ) self.process_manager = TaskExecutor(config) await self.process_manager.start() # Initialize backend connection if registering if self.options.register: await self._initialize_backend_connection() # Initialize and start debug HTTP server self._http_server = HttpServer( host=self.options.host, port=self.options.port, ) self._http_server.set_worker(self) await self._http_server.start() logger.info("VideoSDK worker initialized successfully")Initialize the worker.
def setup_meeting_event_handlers(self, job_context, job_id: str)-
Expand source code
def setup_meeting_event_handlers(self, job_context, job_id: str): """Set up meeting event handlers for a specific job.""" if not job_context.room: logger.warning( f"Cannot set up meeting handlers for job {job_id}: room not available" ) # Set up a delayed handler setup that will be called when room becomes available original_connect = job_context.connect def delayed_handler_setup(): if job_context.room: self._setup_meeting_event_handlers_impl(job_context, job_id) else: logger.warning( f"Room still not available for job {job_id} after connect" ) # Override connect method to set up handlers after room is created async def connect_with_handlers(): result = await original_connect() delayed_handler_setup() return result job_context.connect = connect_with_handlers logger.info(f"Set up delayed meeting event handlers for job {job_id}") return # Room is available, set up handlers immediately self._setup_meeting_event_handlers_impl(job_context, job_id)Set up meeting event handlers for a specific job.
def setup_session_end_callback(self, job_context, job_id: str)-
Expand source code
def setup_session_end_callback(self, job_context, job_id: str): """Set up session end callback for automatic session ending.""" if not job_context.room: logger.warning( f"Cannot set up session end callback for job {job_id}: room not available" ) # Set up a delayed callback setup that will be called when room becomes available original_connect = job_context.connect def delayed_callback_setup(): if job_context.room: self._setup_session_end_callback_impl(job_context, job_id) else: logger.warning( f"Room still not available for job {job_id} after connect" ) # Override connect method to set up callback after room is created async def connect_with_callback(): result = await original_connect() delayed_callback_setup() return result job_context.connect = connect_with_callback logger.info(f"Set up delayed session end callback for job {job_id}") return # Room is available, set up callback immediately self._setup_session_end_callback_impl(job_context, job_id)Set up session end callback for automatic session ending.
async def shutdown(self)-
Expand source code
async def shutdown(self): """Shutdown the worker.""" logger.info("Shutting down VideoSDK worker") self._shutdown = True self._draining = True try: # Clean up all jobs first to ensure proper room cleanup await self._cleanup_all_jobs() except Exception as e: logger.error(f"Error during job cleanup: {e}") try: # Send final status update to registry if self.backend_connection and self.backend_connection.is_connected: try: await self._update_worker_status() logger.info("Sent final status update to registry") except Exception as e: logger.warning(f"Failed to send final status update: {e}") # Disconnect from backend if self.backend_connection: logger.info("Disconnecting from backend") await self.backend_connection.disconnect() except Exception as e: logger.error(f"Error during backend cleanup: {e}") try: # Cancel all tasks for task in self._tasks: if not task.done(): task.cancel() # Wait briefly for tasks to complete if self._tasks: done, pending = await asyncio.wait(self._tasks, timeout=2.0) for task in pending: task.cancel() except Exception as e: logger.error(f"Error during task cleanup: {e}") try: # Shutdown task executor if self.process_manager: await self.process_manager.stop() except Exception as e: logger.error(f"Error stopping process manager: {e}") try: # Stop debug HTTP server if self._http_server: await self._http_server.aclose() except Exception as e: logger.error(f"Error stopping HTTP server: {e}") logger.info("VideoSDK worker shutdown complete")Shutdown the worker.
class WorkerJob (entrypoint,
jobctx=None,
options: Options | None = None)-
Expand source code
class WorkerJob:
    """Wraps an async entrypoint function and manages its execution either directly or via a Worker process."""

    def __init__(self, entrypoint, jobctx=None, options: Optional[Options] = None):
        """
        :param entrypoint: An async function accepting one argument: jobctx
        :param jobctx: A static object or a callable that returns a context per job
        :param options: Configuration options for job execution
        """
        if not asyncio.iscoroutinefunction(entrypoint):
            raise TypeError("entrypoint must be a coroutine function")
        self.entrypoint = entrypoint
        self.jobctx = jobctx
        # Fall back to default Options when none supplied.
        self.options = options or Options()

    def start(self):
        """Run the job: backend-registration mode when options.register is
        set, otherwise execute the entrypoint directly (or run a plain
        worker when no job context is available)."""
        # Local import avoids a circular dependency with the worker module.
        from .worker import Worker, WorkerOptions

        # Convert JobOptions to WorkerOptions for compatibility
        worker_options = WorkerOptions(
            entrypoint_fnc=self.entrypoint,
            agent_id=self.options.agent_id,
            auth_token=self.options.auth_token,
            executor_type=self.options.executor_type,
            num_idle_processes=self.options.num_idle_processes,
            initialize_timeout=self.options.initialize_timeout,
            close_timeout=self.options.close_timeout,
            memory_warn_mb=self.options.memory_warn_mb,
            memory_limit_mb=self.options.memory_limit_mb,
            ping_interval=self.options.ping_interval,
            max_processes=self.options.max_processes,
            permissions=self.options.permissions,
            max_retry=self.options.max_retry,
            load_threshold=self.options.load_threshold,
            register=self.options.register,
            signaling_base_url=self.options.signaling_base_url,
            host=self.options.host,
            port=self.options.port,
            log_level=self.options.log_level,
        )

        # If register=True, run the worker in backend mode (don't execute entrypoint immediately)
        if self.options.register:
            default_room_options = None
            if self.jobctx:
                # jobctx may be a factory callable or a ready-made context.
                if callable(self.jobctx):
                    job_context = self.jobctx()
                else:
                    job_context = self.jobctx
                default_room_options = job_context.room_options
            # Run the worker normally (for backend registration mode)
            Worker.run_worker(
                options=worker_options, default_room_options=default_room_options
            )
        else:
            # Direct mode - run entrypoint immediately if we have a job context
            if self.jobctx:
                # jobctx may be a factory callable or a ready-made context.
                if callable(self.jobctx):
                    job_context = self.jobctx()
                else:
                    job_context = self.jobctx
                # Set the current job context and run the entrypoint
                token = _set_current_job_context(job_context)
                try:
                    asyncio.run(self.entrypoint(job_context))
                finally:
                    # Always restore the previous context, even on failure.
                    _reset_current_job_context(token)
            else:
                # No job context provided, run worker normally
                Worker.run_worker(worker_options)
:param entrypoint: An async function accepting one argument: jobctx :param jobctx: A static object or a callable that returns a context per job :param options: Configuration options for job execution
Methods
def start(self)-
Expand source code
def start(self): from .worker import Worker, WorkerOptions # Convert JobOptions to WorkerOptions for compatibility worker_options = WorkerOptions( entrypoint_fnc=self.entrypoint, agent_id=self.options.agent_id, auth_token=self.options.auth_token, executor_type=self.options.executor_type, num_idle_processes=self.options.num_idle_processes, initialize_timeout=self.options.initialize_timeout, close_timeout=self.options.close_timeout, memory_warn_mb=self.options.memory_warn_mb, memory_limit_mb=self.options.memory_limit_mb, ping_interval=self.options.ping_interval, max_processes=self.options.max_processes, permissions=self.options.permissions, max_retry=self.options.max_retry, load_threshold=self.options.load_threshold, register=self.options.register, signaling_base_url=self.options.signaling_base_url, host=self.options.host, port=self.options.port, log_level=self.options.log_level, ) # If register=True, run the worker in backend mode (don't execute entrypoint immediately) if self.options.register: default_room_options = None if self.jobctx: if callable(self.jobctx): job_context = self.jobctx() else: job_context = self.jobctx default_room_options = job_context.room_options # Run the worker normally (for backend registration mode) Worker.run_worker( options=worker_options, default_room_options=default_room_options ) else: # Direct mode - run entrypoint immediately if we have a job context if self.jobctx: if callable(self.jobctx): job_context = self.jobctx() else: job_context = self.jobctx # Set the current job context and run the entrypoint token = _set_current_job_context(job_context) try: asyncio.run(self.entrypoint(job_context)) finally: _reset_current_job_context(token) else: # No job context provided, run worker normally Worker.run_worker(worker_options)
class WorkerOptions (entrypoint_fnc: Callable[[JobContext], Any],
request_fnc: Callable[[ForwardRef('JobRequest')], Any] | None = None,
initialize_process_fnc: Callable[[Any], Any] | None = None,
executor_type: ExecutorType = ExecutorType.PROCESS,
num_idle_processes: int = 2,
initialize_timeout: float = 10.0,
close_timeout: float = 60.0,
memory_warn_mb: float = 500.0,
memory_limit_mb: float = 0.0,
ping_interval: float = 30.0,
max_processes: int = 10,
agent_id: str = 'VideoSDKAgent',
auth_token: str | None = None,
worker_type: WorkerType = WorkerType.ROOM,
permissions: WorkerPermissions = <factory>,
max_retry: int = 16,
load_threshold: float = 0.75,
register: bool = False,
signaling_base_url: str = 'api.videosdk.live',
host: str = '0.0.0.0',
port: int = 8081,
log_level: str = 'INFO')-
Expand source code
@dataclass
class WorkerOptions:
    """Configuration options for the VideoSDK worker."""

    entrypoint_fnc: Callable[[JobContext], Any]
    """Entrypoint function that will be called when a job is assigned to this worker."""
    request_fnc: Optional[Callable[["JobRequest"], Any]] = None
    """Function to handle job requests and decide whether to accept them."""
    initialize_process_fnc: Optional[Callable[[Any], Any]] = None
    """A function to perform any necessary initialization before the job starts."""
    executor_type: ExecutorType = _default_executor_type
    """Which executor to use to run jobs. Automatically selected based on platform."""
    num_idle_processes: int = 2
    """Number of idle processes/threads to keep warm."""
    initialize_timeout: float = 10.0
    """Maximum amount of time to wait for a process/thread to initialize/prewarm"""
    close_timeout: float = 60.0
    """Maximum amount of time to wait for a job to shut down gracefully"""
    memory_warn_mb: float = 500.0
    """Memory warning threshold in MB."""
    memory_limit_mb: float = 0.0
    """Maximum memory usage for a job in MB. Defaults to 0 (disabled)."""
    ping_interval: float = 30.0
    """Interval between health check pings."""
    max_processes: int = 10
    """Maximum number of processes/threads."""
    agent_id: str = "VideoSDKAgent"
    """ID of the agent."""
    auth_token: Optional[str] = None
    """VideoSDK authentication token. Uses VIDEOSDK_AUTH_TOKEN env var if not provided.
    This token is used for both VideoSDK services and registry authentication."""
    worker_type: WorkerType = WorkerType.ROOM
    """Type of worker (room or publisher)."""
    permissions: WorkerPermissions = field(default_factory=WorkerPermissions)
    """Permissions for the agent participant."""
    max_retry: int = 16
    """Maximum number of times to retry connecting to VideoSDK."""
    load_threshold: float = 0.75
    """Load threshold above which worker is marked as unavailable."""
    register: bool = False
    """Whether to register with the backend. Defaults to False for local development."""
    signaling_base_url: str = "api.videosdk.live"
    """Signaling base URL for VideoSDK services. Defaults to api.videosdk.live."""
    host: str = "0.0.0.0"
    """Host for the debug HTTP server."""
    port: int = 8081
    """Port for the debug HTTP server."""
    log_level: str = "INFO"
    """Log level for SDK logging. Options: DEBUG, INFO, WARNING, ERROR. Defaults to INFO."""

    def __post_init__(self):
        """Post-initialization setup: resolve the auth token from the
        environment when not provided explicitly."""
        if not self.auth_token:
            self.auth_token = os.getenv("VIDEOSDK_AUTH_TOKEN")
        # Log the selected executor type
        logger.info(f"Worker configured with {self.executor_type.value} executor")
Instance variables
var agent_id : str-
ID of the agent.
var auth_token : str | None-
VideoSDK authentication token. Uses VIDEOSDK_AUTH_TOKEN env var if not provided. This token is used for both VideoSDK services and registry authentication.
var close_timeout : float-
Maximum amount of time to wait for a job to shut down gracefully
var entrypoint_fnc : Callable[[JobContext], Any]-
Entrypoint function that will be called when a job is assigned to this worker.
var executor_type : ExecutorType-
Which executor to use to run jobs. Automatically selected based on platform.
var host : str-
Host for the debug HTTP server.
var initialize_process_fnc : Callable[[Any], Any] | None-
A function to perform any necessary initialization before the job starts.
var initialize_timeout : float-
Maximum amount of time to wait for a process/thread to initialize/prewarm
var load_threshold : float-
Load threshold above which worker is marked as unavailable.
var log_level : str-
Log level for SDK logging. Options: DEBUG, INFO, WARNING, ERROR. Defaults to INFO.
var max_processes : int-
Maximum number of processes/threads.
var max_retry : int-
Maximum number of times to retry connecting to VideoSDK.
var memory_limit_mb : float-
Maximum memory usage for a job in MB. Defaults to 0 (disabled).
var memory_warn_mb : float-
Memory warning threshold in MB.
var num_idle_processes : int-
Number of idle processes/threads to keep warm.
var permissions : WorkerPermissions-
Permissions for the agent participant.
var ping_interval : float-
Interval between health check pings.
var port : int-
Port for the debug HTTP server.
var register : bool-
Whether to register with the backend. Defaults to False for local development.
var request_fnc : Callable[[JobRequest], Any] | None-
Function to handle job requests and decide whether to accept them.
var signaling_base_url : str-
Signaling base URL for VideoSDK services. Defaults to api.videosdk.live.
var worker_type : WorkerType-
Type of worker (room or publisher).
class WorkerType (*args, **kwds)-
Expand source code
class WorkerType(Enum): ROOM = "room"Create a collection of name/value pairs.
Example enumeration:
>>> class Color(Enum):
...     RED = 1
...     BLUE = 2
...     GREEN = 3

Access them by:
- attribute access:
Color.RED
- value lookup:
Color(1)
- name lookup:
Color['RED']
Enumerations can be iterated over, and know how many members they have:
>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]

Methods can be added to enumerations, and members can have their own attributes – see the documentation for details.
Ancestors
- enum.Enum
Class variables
var ROOM