Module agents.videosdk_eval

Sub-modules

agents.videosdk_eval.agent_wrapper
agents.videosdk_eval.audio_track
agents.videosdk_eval.cascading_flow
agents.videosdk_eval.components
agents.videosdk_eval.eval_logger

Custom logging utility for evaluation pipeline. All logs are prefixed with 'Eval:' to distinguish from regular application logs.

agents.videosdk_eval.evaluation
agents.videosdk_eval.factory
agents.videosdk_eval.metrics
agents.videosdk_eval.turn

Classes

class EvalMetric (*args, **kwds)
Expand source code
class EvalMetric(Enum):
    # Latency metric keys reported by the evaluation pipeline. The string
    # values are the metric-dict keys emitted by the metrics collector — note
    # the LLM/TTS values use the collector's naming ("llm_ttft", "ttfb"),
    # which intentionally differs from the member names.
    STT_LATENCY = "stt_latency"
    LLM_LATENCY = "llm_ttft"
    TTS_LATENCY = "ttfb"
    END_TO_END_LATENCY = "e2e_latency"

Create a collection of name/value pairs.

Example enumeration:

>>> class Color(Enum):
...     RED = 1
...     BLUE = 2
...     GREEN = 3

Access them by:

  • attribute access:

Color.RED

  • value lookup:

Color(1)

  • name lookup:

Color['RED']

Enumerations can be iterated over, and know how many members they have:

>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]

Methods can be added to enumerations, and members can have their own attributes – see the documentation for details.

Ancestors

  • enum.Enum

Class variables

var END_TO_END_LATENCY
var LLM_LATENCY
var STT_LATENCY
var TTS_LATENCY
class EvalTurn (stt: Any | None = None,
llm: Any | None = None,
tts: Any | None = None,
judge: Any | None = None,
id: str | None = None,
name: str | None = None)
Expand source code
@dataclass
class EvalTurn:
    """One conversational turn to evaluate; every stage is optional.

    stt/llm/tts/judge are (provider_name, config) pairs — the evaluation
    pipeline unpacks them as ``provider, config = turn.stt`` etc.
    """
    stt: Optional[Any] = None     # (provider, config) for speech-to-text
    llm: Optional[Any] = None     # (provider, config) for the language model
    tts: Optional[Any] = None     # (provider, config) for text-to-speech
    judge: Optional[Any] = None   # (provider, config) for LLM-as-judge scoring
    id: Optional[str] = None      # unique turn id; auto-generated when omitted
    name: Optional[str] = None    # optional human-readable label

    def __post_init__(self):
        # Assign a random UUID when the caller did not provide an id.
        if self.id is None:
            self.id = str(uuid.uuid4())

EvalTurn(stt: Optional[Any] = None, llm: Optional[Any] = None, tts: Optional[Any] = None, judge: Optional[Any] = None, id: Optional[str] = None, name: Optional[str] = None)

Instance variables

var id : str | None
var judge : Any | None
var llm : Any | None
var name : str | None
var stt : Any | None
var tts : Any | None
class Evaluation (name: str,
metrics: List[videosdk.agents.videosdk_eval.metrics.EvalMetric] = None,
include_context: bool = False,
output_dir: str = './eval_reports')
Expand source code
class Evaluation:
    """Orchestrates a multi-turn STT -> LLM -> TTS evaluation run.

    Each queued :class:`EvalTurn` is executed through an EvalConversationFlow,
    per-stage metrics are collected, an optional LLM judge scores the turn,
    and the aggregate is returned as an :class:`EvaluationResult`.
    """

    def __init__(
        self,
        name: str,
        metrics: List[EvalMetric] = None,
        include_context: bool = False,
        output_dir: str = "./eval_reports"
    ):
        """
        Args:
            name: Display name for this evaluation run.
            metrics: Optional whitelist of EvalMetric values to report
                (effectively Optional; None reports all latency metrics).
            include_context: When True, the full chat history is included in
                the judge prompt.
            output_dir: Directory for reports and synthesized TTS audio files.
        """
        self.name = name
        self.metrics = metrics
        self.include_context = include_context
        self.output_dir = output_dir
        self.turns: List[EvalTurn] = []

    def add_turn(self, turn: EvalTurn):
        """Queue a turn for execution by :meth:`run`."""
        self.turns.append(turn)

    def run(self) -> EvaluationResult:
        """Execute all queued turns synchronously and return the result.

        Falls back to a manually created event loop when ``asyncio.run()``
        raises RuntimeError (a loop is already installed in this thread).
        The fallback loop is closed afterwards so it does not leak.
        """
        try:
            return asyncio.run(self._run_async())
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                return loop.run_until_complete(self._run_async())
            finally:
                # Fix: the original never closed this loop, leaking its resources.
                loop.close()

    async def _run_async(self) -> EvaluationResult:
        """Run every queued turn, log formatted results, and aggregate them."""
        eval_logger.evaluation_start(self.name, len(self.turns))
        results = []

        # Seed the shared agent with the first available system prompt.
        agent_instructions = "You are a helpful assistant."
        agent_tools = []
        if self.turns:
            for turn in self.turns:
                if turn.llm:
                    _, config = turn.llm
                    if config.get("system_prompt"):
                        agent_instructions = config.get("system_prompt")
                    break

        agent = EvalAgent(instructions=agent_instructions, tools=agent_tools)

        try:
            for i, turn in enumerate(self.turns):
                eval_logger.turn_start(i+1, len(self.turns), str(turn.id or 'N/A'))
                turn_start_time = time.perf_counter()
                turn_result = await self._process_turn(turn, i, agent, self.output_dir)
                turn_elapsed = (time.perf_counter() - turn_start_time) * 1000  # ms
                eval_logger.turn_end(i+1, turn_elapsed)
                results.append(turn_result)
        finally:
            # Always release agent resources even when a turn raised.
            await agent.cleanup()

        # Display formatted logs at the end
        eval_logger.log("=" * 40)
        eval_logger.log("EVALUATION RESULTS")
        eval_logger.log("=" * 40)

        for res in results:
            turn_idx = res.get("turn_index", 0)
            turn_id = str(res.get("turn_id", "N/A"))
            metrics = res.get("metrics", {})

            # Get LLM input (what was actually sent to LLM)
            llm_input = metrics.get("llm_input") or ""

            # Get STT transcript
            stt_transcripts = metrics.get("collected_transcripts", [])
            stt_transcript = ""
            if stt_transcripts:
                stt_transcript = " ".join(stt_transcripts)

            # Fallback to user_speech ONLY if it's not the same as llm_input
            if not stt_transcript:
                user_speech = metrics.get("user_speech")
                if user_speech and user_speech != llm_input:
                    stt_transcript = user_speech
                else:
                    stt_transcript = "N/A"

            # Get LLM response
            llm_response = res.get("response_text", "") or "N/A"

            # Get TTS audio file
            tts_audio_file = metrics.get("tts_audio_file", "")

            # Display turn logs
            eval_logger.display_turn_logs(turn_index=turn_idx, turn_id=turn_id, stt_transcript=stt_transcript,
                                        llm_response=llm_response, tts_audio_file=tts_audio_file, llm_input=llm_input)

        eval_logger.display_judge_results(results)
        eval_logger.display_latency_table(results, self.metrics)
        eval_logger.evaluation_end()
        return EvaluationResult(results=results, metrics_filter=self.metrics, output_dir=self.output_dir)

    async def tts_audio_file(self, audio_track: MockAudioTrack, output_dir: str):
        """Write captured TTS audio to a uniquely named WAV and return its path.

        Returns None when the track is absent or produced no audio bytes.
        """
        if audio_track:
            audio_bytes = audio_track.get_audio_bytes()
            if audio_bytes:
                # uuid is available at module scope (see _process_turn); the
                # original redundant local `import uuid` has been removed.
                unique_id = uuid.uuid4()
                tts_filename = f"tts_output_{unique_id}.wav"
                os.makedirs(output_dir, exist_ok=True)
                tts_filepath = os.path.join(output_dir, tts_filename)
                import wave
                # Mono, 16-bit samples, 24 kHz.
                # NOTE(review): assumes the TTS engine emits 24 kHz PCM — confirm per provider.
                with wave.open(tts_filepath, "wb") as wf:
                    wf.setnchannels(1)
                    wf.setsampwidth(2)
                    wf.setframerate(24000)
                    wf.writeframes(audio_bytes)

                eval_logger.log(f"TTS Audio saved to {tts_filepath}")
                return tts_filepath

    async def _process_turn(self, turn: EvalTurn, index: int, agent: EvalAgent, output_dir: str) -> dict:
        """Execute one turn through the STT/LLM/TTS flow and (optionally) judge it.

        Returns a dict with the turn's index/id, collected metrics, the LLM
        response text, and the judge verdict.
        """
        stt_instance = None
        llm_instance = None
        tts_instance = None

        # Instantiate only the components this turn configures.
        if turn.stt:
            provider, config = turn.stt
            stt_instance = create_stt(provider, config)

        if turn.llm:
            provider, config = turn.llm
            llm_instance = create_llm(provider, config)
            system_prompt = config.get("system_prompt")
            if system_prompt:
                agent.instructions = system_prompt
            if config.get("tools"):
                agent.update_tools(config.get("tools"))

        if turn.tts:
            provider, config = turn.tts
            tts_instance = create_tts(provider, config)

        flow = EvalConversationFlow(
            agent=agent,
            stt=stt_instance,
            llm=llm_instance,
            tts=tts_instance
        )
        flow.enable_stt_processing = turn.stt is not None
        flow.enable_llm_processing = turn.llm is not None
        flow.enable_tts_processing = turn.tts is not None

        computed_metrics = {}

        try:
            # Initialize metrics for this turn
            cascading_metrics_collector.start_new_interaction()

            await flow.start()
            if turn.stt:
                cascading_metrics_collector.on_stt_start()

            # Config Extraction
            stt_filepath = None
            stt_chunk_size = 96000
            if turn.stt:
                _, stt_config = turn.stt
                stt_filepath = stt_config.get("file_path")
                stt_chunk_size = stt_config.get("chunk_size", 96000)

            use_stt_output = True
            llm_mock_input = None
            if turn.llm:
                _, llm_config = turn.llm
                use_stt_output = llm_config.get("use_stt_output", True)
                llm_mock_input = llm_config.get("mock_input")

            use_llm_output = True
            tts_mock_input = None
            if turn.tts:
                _, tts_config = turn.tts
                use_llm_output = tts_config.get("use_llm_output", True)
                tts_mock_input = tts_config.get("mock_input")

            # Flow Configuration
            if turn.tts and not use_llm_output and tts_mock_input:
                flow.enable_tts_processing = True
                flow.tts_mock_input = tts_mock_input
                eval_logger.log(f"Configured Mock TTS Input (will override LLM output): {tts_mock_input}")
            else:
                # Standard behavior: use LLM output for TTS if configured
                flow.enable_tts_processing = use_llm_output
                flow.tts_mock_input = None

            # STT: stream the audio file into the flow in fixed-size chunks.
            if stt_filepath and os.path.exists(stt_filepath):
                eval_logger.component_start("STT")
                should_suppress_llm_during_stt = False
                if not use_stt_output and llm_mock_input:
                    should_suppress_llm_during_stt = True

                if should_suppress_llm_during_stt:
                    flow.enable_llm_processing = False
                    eval_logger.log("Suppressing LLM for STT streaming (Waiting for transcript)...")

                with open(stt_filepath, "rb") as f:
                    data = f.read()
                    for j in range(0, len(data), stt_chunk_size):
                        chunk = data[j:j+stt_chunk_size]
                        await flow.send_audio_delta(chunk)
                        # Pace chunks to approximate real-time streaming.
                        await asyncio.sleep(0.075)

                if hasattr(flow.stt, 'flush'):
                    await flow.stt.flush()

                cascading_metrics_collector.on_user_speech_end()

                if should_suppress_llm_during_stt:
                    eval_logger.log("Waiting for STT events to settle (draining buffer)...")
                    await asyncio.sleep(15.0)

                    flow.allowed_mock_input = llm_mock_input
                    eval_logger.log(f"STT skipped for LLM. Processing mock input: {llm_mock_input}")
                    flow.generation_done_event.clear()
                    await flow.process_text_input(llm_mock_input)
            else:
                if turn.llm and llm_mock_input:
                    eval_logger.log(f"No STT file. Processing mock input directly: {llm_mock_input}")
                    flow.allowed_mock_input = llm_mock_input
                    flow.generation_done_event.clear()
                    await flow.process_text_input(llm_mock_input)

            # Wait for generation if LLM is configured
            if turn.llm:
                try:
                    await asyncio.wait_for(flow.generation_done_event.wait(), timeout=30.0)
                except asyncio.TimeoutError:
                    eval_logger.log("Timed out waiting for LLM generation.")
            elif turn.stt:
                try:
                    await asyncio.wait_for(flow.stt_done_event.wait(), timeout=30.0)
                except asyncio.TimeoutError:
                    eval_logger.log("Timed out waiting for STT transcription.")

            # For TTS-only turns (no LLM), manually synthesize
            if turn.tts and not turn.llm and tts_mock_input:
                eval_logger.log(f"TTS-only turn. Synthesizing: {tts_mock_input}")
                from videosdk.agents.utterance_handle import UtteranceHandle
                from videosdk.agents.utils import AgentState

                if flow.agent and flow.agent.session:
                    mock_handle_id = f"mock_tts_{uuid.uuid4()}"
                    mock_handle = UtteranceHandle(utterance_id=mock_handle_id, interruptible=False)
                    flow.agent.session.current_utterance = mock_handle
                    flow.agent.session._emit_agent_state(AgentState.SPEAKING)

                async def text_iterator():
                    yield tts_mock_input

                await flow._synthesize_with_tts(text_iterator())
                cascading_metrics_collector.on_agent_speech_end()

            computed_metrics = flow.metrics
            metrics_data = flow.metrics
            actual_llm_response = metrics_data.get("agent_speech", "") or "N/A"
            spoken_response = actual_llm_response

            # For TTS-override turns, keep track of what was actually spoken vs generated
            if turn.tts and not use_llm_output and tts_mock_input:
                spoken_response = tts_mock_input
                if not actual_llm_response or actual_llm_response == "N/A":
                    if flow.agent and flow.agent.chat_context:
                        flow.agent.chat_context.add_message(role=ChatRole.ASSISTANT, content=tts_mock_input)
            cascading_metrics_collector.complete_current_turn()
            # Re-read metrics after the turn is finalized.
            computed_metrics = flow.metrics

        except Exception as e:
            eval_logger.log(f"Error executing turn: {e}")
            actual_llm_response = f"Error: {str(e)}"
            spoken_response = f"Error: {str(e)}"
            import traceback
            eval_logger.log(traceback.format_exc())

        finally:
            # Always release flow/component resources, even on failure.
            await flow.cleanup()
            if stt_instance and hasattr(stt_instance, 'aclose'):
                await stt_instance.aclose()
            if llm_instance and hasattr(llm_instance, 'aclose'):
                await llm_instance.aclose()
            if tts_instance and hasattr(tts_instance, 'aclose'):
                await tts_instance.aclose()

        # 3. Judge
        judge_result = {"status": "skipped", "reason": "No judge configured"}
        if turn.judge:
            provider, config = turn.judge
            from .components.llm import LLMEvalConfig
            # NOTE(review): the judge API key is always read from GOOGLE_API_KEY_LLM,
            # even for openai/anthropic judges — confirm this is intended.
            judge_llm_config = LLMEvalConfig(model=config.model, api_key=os.getenv("GOOGLE_API_KEY_LLM"))
            judge_llm = create_llm(provider, judge_llm_config)

            # Construct Judge History from agent's chat context
            history_str = ""
            if agent.chat_context and self.include_context:
                for msg in agent.chat_context._items:
                    role = "User" if msg.role == ChatRole.USER else "Agent" if msg.role == ChatRole.ASSISTANT else "System"
                    history_str += f"{role}: {msg.content}\n"

            current_user_input = "Audio/Unknown"

            if turn.llm:
                _, llm_cfg = turn.llm
                if llm_cfg.get('mock_input'):
                    current_user_input = llm_cfg.get('mock_input')
                elif turn.stt:
                    stt_transcripts = computed_metrics.get('collected_transcripts', [])
                    if stt_transcripts:
                        current_user_input = " ".join(stt_transcripts)
            elif turn.stt:
                stt_transcripts = computed_metrics.get('collected_transcripts', [])
                if stt_transcripts:
                    current_user_input = " ".join(stt_transcripts)

            prompt = f"{config.prompt}\n\nFull Conversation History:\n{history_str}\n\nFocus more on the following Current Turn Details:\nUser Input: {current_user_input}\nAgent LLM Response (Actual Output): {actual_llm_response}\nAgent Spoken Output (TTS): {spoken_response}\n\n. Evaluate based on: {config.checks}. It should follow the output format. That will have all keys from checks and one conclusion key that will be the summary of it "

            judge_response_text = ""
            try:
                judge_context = ChatContext()
                judge_context.add_message(role=ChatRole.USER, content=prompt)

                async for chunk in judge_llm.chat(judge_context):
                    if chunk.content:
                        judge_response_text += chunk.content
                    if chunk.metadata:
                        if not judge_response_text:
                            judge_response_text = json.dumps(chunk.metadata, indent=2)

                judge_result = {"passed": True, "evaluation": judge_response_text}
                try:
                    # Strip optional Markdown code fences before JSON parsing.
                    cleaned_text = judge_response_text.strip()
                    if cleaned_text.startswith("```json"):
                        cleaned_text = cleaned_text.split("```json")[1].split("```")[0].strip()
                    elif cleaned_text.startswith("```"):
                        cleaned_text = cleaned_text.split("```")[1].split("```")[0].strip()

                    eval_data = json.loads(cleaned_text)
                    if isinstance(eval_data, dict):
                        score = eval_data.get("score")
                        if score is not None:
                            try:
                                # A score of 3 or more counts as a pass.
                                if float(score) >= 3:
                                    judge_result["passed"] = True
                                else:
                                    judge_result["passed"] = False
                            except (TypeError, ValueError):
                                # Fix: was a bare `except:`; only a non-numeric
                                # score should be ignored here.
                                pass

                        # An explicit boolean verdict overrides the score heuristic.
                        if "passed" in eval_data:
                            judge_result["passed"] = bool(eval_data["passed"])
                        elif "conclusion" in eval_data:
                            pass
                except Exception:
                    # Fix: was a bare `except:`. Unparseable response — fall back
                    # to crude text matching on the reported score.
                    if "score: 1" in judge_response_text or "score: 2" in judge_response_text:
                        judge_result["passed"] = False
                    elif "score:" in judge_response_text:
                        judge_result["passed"] = True
            except Exception as e:
                eval_logger.log(f"Judge evaluation failed: {e}")
                judge_result = {"passed": False, "reason": str(e)}
            if judge_llm and hasattr(judge_llm, 'aclose'):
                await judge_llm.aclose()

        # Persist synthesized audio (if any) alongside the other reports.
        if turn.tts and flow.audio_track:
            tts_filepath = await self.tts_audio_file(flow.audio_track, output_dir)
            computed_metrics["tts_audio_file"] = tts_filepath

        return {
            "turn_index": index,
            "turn_id": turn.id,
            "metrics": computed_metrics,
            "response_text": actual_llm_response,
            "judge": judge_result
        }

Methods

def add_turn(self, turn: videosdk.agents.videosdk_eval.turn.EvalTurn)
Expand source code
def add_turn(self, turn: EvalTurn):
    """Queue a turn for execution by run()."""
    self.turns.append(turn)
def run(self) ‑> EvaluationResult
Expand source code
def run(self) -> EvaluationResult:
    """Execute all queued turns synchronously and return the aggregated result."""
    try:
        return asyncio.run(self._run_async())
    except RuntimeError:
        # asyncio.run() raises RuntimeError when a loop is already installed
        # in this thread; fall back to a manually managed loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(self._run_async())
async def tts_audio_file(self,
audio_track: videosdk.agents.videosdk_eval.audio_track.MockAudioTrack,
output_dir: str)
Expand source code
async def tts_audio_file(self, audio_track: MockAudioTrack,output_dir:str):
    """Write captured TTS audio to a uniquely named WAV file and return its path.

    Returns None when the track is absent or produced no audio bytes.
    """
    if audio_track:
        audio_bytes = audio_track.get_audio_bytes()
        if audio_bytes:
            import uuid
            unique_id = uuid.uuid4()
            tts_filename = f"tts_output_{unique_id}.wav"
            os.makedirs(output_dir, exist_ok=True)
            tts_filepath = os.path.join(output_dir, tts_filename)
            import wave
            # Mono, 16-bit samples, 24 kHz.
            # NOTE(review): assumes the TTS engine emits 24 kHz PCM — confirm per provider.
            with wave.open(tts_filepath, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(24000)
                wf.writeframes(audio_bytes)
            
            eval_logger.log(f"TTS Audio saved to {tts_filepath}")
            return tts_filepath
class EvaluationResult (results: List[Dict[str, Any]] = <factory>,
metrics_filter: List = <factory>,
output_dir: str = './eval_reports')
Expand source code
@dataclass
class EvaluationResult:
    """Aggregated output of an evaluation run, with report-writing helpers."""
    # Per-turn result dicts (turn_index, turn_id, metrics, response_text, judge).
    results: List[Dict[str, Any]] = field(default_factory=list)
    # Optional EvalMetric whitelist; None (the default) keeps every latency metric.
    # Fix: the original used `field(default_factory=lambda: None)` — a constant
    # None default does not need a factory.
    metrics_filter: List = None
    # Destination directory for the generated report files.
    output_dir: str = "./eval_reports"

    def save(self) -> None:
        """Write the metrics CSV, transcripts, and judge results under output_dir."""
        run_id = uuid.uuid4()
        os.makedirs(self.output_dir, exist_ok=True)

        self._save_metrics_csv(run_id)
        self._save_transcripts(run_id)
        self._save_judge_results(run_id)

    def _save_metrics_csv(self, run_id: uuid.UUID) -> None:
        """Write one CSV row per turn containing only latency-style metrics."""
        rows = []
        fieldnames = {"turn_index", "turn_id"}
        if self.metrics_filter:
            allowed_metrics = {m.value for m in self.metrics_filter}
        else:
            allowed_metrics = None  # no filter: keep every latency metric

        for res in self.results:
            metrics = res.get("metrics", {})

            row = {
                "turn_index": res.get("turn_index"),
                "turn_id": res.get("turn_id"),
            }

            for key, value in metrics.items():
                # Heuristic: latency metrics are keyed by a stage prefix or "time_to".
                is_latency_metric = (
                    key.startswith(("stt_latency", "llm_ttft", "ttfb", "e2e_latency"))
                    or "time_to" in key
                )
                if is_latency_metric:
                    if allowed_metrics is None or key in allowed_metrics:
                        fieldnames.add(key)
                        row[key] = value

            # Keep only rows that gained at least one metric beyond the two ids.
            if len(row) > 2:
                rows.append(row)

        if not rows:
            return

        path = os.path.join(self.output_dir, f"metrics_{run_id}.csv")
        # Explicit encoding keeps the output deterministic across platforms.
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=sorted(fieldnames))
            writer.writeheader()
            writer.writerows(rows)

        eval_logger.log(f"Metrics saved to {path}")

    def _save_transcripts(self, run_id: uuid.UUID) -> None:
        """Write a plain-text transcript (user STT, LLM input, agent reply) per turn."""
        path = os.path.join(self.output_dir, f"transcripts_{run_id}.txt")

        try:
            with open(path, "w", encoding="utf-8") as f:
                for res in self.results:
                    metrics = res.get("metrics", {})
                    user_transcripts = metrics.get("collected_transcripts", [])
                    agent_response = res.get("response_text", "")

                    # Skip turns with nothing to record.
                    if not user_transcripts and not agent_response:
                        continue

                    f.write(f"Turn ID: {res.get('turn_id', 'Unknown')}\n")
                    stt_text = ' '.join(user_transcripts) if user_transcripts else 'N/A'
                    f.write(f"User STT Transcript: {stt_text}\n")

                    # Record the LLM input only when it diverged from the STT text.
                    llm_input = metrics.get("llm_input")
                    if llm_input and llm_input != stt_text:
                        f.write(f"LLM Input (Overridden): {llm_input}\n")

                    f.write(f"Agent: {agent_response if agent_response else 'N/A'}\n")
                    f.write("-" * 40 + "\n")

            eval_logger.log(f"Transcripts saved to {path}")
        except Exception as e:
            eval_logger.log(f"Failed to save transcripts: {e}")

    def _save_judge_results(self, run_id: uuid.UUID) -> None:
        """Write the raw judge verdict for every turn to a text file."""
        path = os.path.join(self.output_dir, f"judge_results_{run_id}.txt")

        try:
            with open(path, "w", encoding="utf-8") as f:
                for res in self.results:
                    judge = res.get("judge", {})
                    f.write(f"Turn ID: {res.get('turn_id', 'Unknown')}\n")
                    f.write(f"Judge Passed: {judge.get('passed', False)}\n")
                    f.write(f"LLM-as-Judge Response: {judge}\n")
                    f.write("-" * 40 + "\n")

            eval_logger.log(f"Judge results saved to {path}")
        except Exception as e:
            eval_logger.log(f"Failed to save judge results: {e}")

EvaluationResult(results: List[Dict[str, Any]] = <factory>, metrics_filter: List = <factory>, output_dir: str = './eval_reports')

Instance variables

var metrics_filter : List
var output_dir : str
var results : List[Dict[str, Any]]

Methods

def save(self) ‑> None
Expand source code
def save(self) -> None:
    run_id = uuid.uuid4()
    os.makedirs(self.output_dir, exist_ok=True)

    self._save_metrics_csv(run_id)
    self._save_transcripts(run_id)
    self._save_judge_results(run_id)
class LLMAsJudge
Expand source code
class LLMAsJudge:
    """Factory helpers producing (provider_name, LLMAsJudgeConfig) judge specs."""

    @staticmethod
    def _build(provider: str, model: str, prompt: str, checks: List[str]):
        # Single construction point shared by every provider helper.
        return (provider, LLMAsJudgeConfig(model=model, prompt=prompt, checks=checks))

    @staticmethod
    def openai(model: str, prompt: str, checks: List[str]):
        """Judge spec backed by an OpenAI model."""
        return LLMAsJudge._build("openai", model, prompt, checks)

    @staticmethod
    def anthropic(model: str, prompt: str, checks: List[str]):
        """Judge spec backed by an Anthropic model."""
        return LLMAsJudge._build("anthropic", model, prompt, checks)

    @staticmethod
    def google(model: str, prompt: str, checks: List[str]):
        """Judge spec backed by a Google model."""
        return LLMAsJudge._build("google", model, prompt, checks)

Static methods

def anthropic(model: str, prompt: str, checks: List[str])
Expand source code
@staticmethod
def anthropic(model: str, prompt: str, checks: List[str]):
    # Returns ("anthropic", config) for use as an EvalTurn judge spec.
    return ("anthropic", LLMAsJudgeConfig(model=model, prompt=prompt, checks=checks))
def google(model: str, prompt: str, checks: List[str])
Expand source code
@staticmethod
def google(model: str, prompt: str, checks: List[str]):
    # Returns ("google", config) for use as an EvalTurn judge spec.
    return ("google", LLMAsJudgeConfig(model=model, prompt=prompt, checks=checks))
def openai(model: str, prompt: str, checks: List[str])
Expand source code
@staticmethod
def openai(model: str, prompt: str, checks: List[str]):
    # Returns ("openai", config) for use as an EvalTurn judge spec.
    return ("openai", LLMAsJudgeConfig(model=model, prompt=prompt, checks=checks))
class LLMAsJudgeConfig (model: str, prompt: str | None = None, checks: List[str] = <factory>)
Expand source code
@dataclass
class LLMAsJudgeConfig:
    """Judge configuration: model, base prompt, and the checks to evaluate."""
    model: str  # judge LLM model identifier
    prompt: Optional[str] = None  # base instruction prefixed to the judge prompt
    checks: List[str] = field(default_factory=list)  # criteria appended to the judge prompt

LLMAsJudgeConfig(model: str, prompt: Optional[str] = None, checks: List[str] = <factory>)

Instance variables

var checks : List[str]
var model : str
var prompt : str | None
class LLMAsJudgeMetric (*args, **kwds)
Expand source code
class LLMAsJudgeMetric(Enum):
    # Ready-made judge checks; each value is a "key: question" string intended
    # to be passed to the judge via LLMAsJudgeConfig.checks.
    REASONING = "reasoning: why did the agent respond in this way?"
    RELEVANCE = "relevance: was the agent's response relevant to the user's request?"
    CLARITY = "clarity: was the agent's response clear and easy to understand?"
    SCORE = "score: how would you rate the agent's response out of 10?"

Create a collection of name/value pairs.

Example enumeration:

>>> class Color(Enum):
...     RED = 1
...     BLUE = 2
...     GREEN = 3

Access them by:

  • attribute access:

Color.RED

  • value lookup:

Color(1)

  • name lookup:

Color['RED']

Enumerations can be iterated over, and know how many members they have:

>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]

Methods can be added to enumerations, and members can have their own attributes – see the documentation for details.

Ancestors

  • enum.Enum

Class variables

var CLARITY
var REASONING
var RELEVANCE
var SCORE
class LLMComponent
Expand source code
class LLMComponent:
    """Builds (provider_name, config_dict) tuples for LLM providers.

    The provider's environment API key takes precedence over the key supplied
    in the config object.
    """

    @staticmethod
    def _as_dict(env_var: str, config: LLMEvalConfig) -> dict:
        # Shared config-dict construction; env var wins over config.api_key.
        return {
            "model": config.model,
            "api_key": os.getenv(env_var) or config.api_key,
            "system_prompt": config.system_prompt,
            "tools": config.tools,
            "use_stt_output": config.use_stt_output,
            "mock_input": config.mock_input
        }

    @staticmethod
    def openai(config: LLMEvalConfig):
        """LLM spec for an OpenAI model."""
        return ("openai", LLMComponent._as_dict("OPENAI_API_KEY", config))

    @staticmethod
    def anthropic(config: LLMEvalConfig):
        """LLM spec for an Anthropic model."""
        return ("anthropic", LLMComponent._as_dict("ANTHROPIC_API_KEY", config))

    @staticmethod
    def google(config: LLMEvalConfig):
        """LLM spec for a Google model."""
        return ("google", LLMComponent._as_dict("GOOGLE_API_KEY_LLM", config))

Static methods

def anthropic(config: LLMEvalConfig)
Expand source code
@staticmethod
def anthropic(config: LLMEvalConfig):
    # Build ("anthropic", config_dict); ANTHROPIC_API_KEY env var wins over config.api_key.
    config = {
        "model": config.model,
        "api_key": os.getenv("ANTHROPIC_API_KEY") or config.api_key,
        "system_prompt": config.system_prompt,
        "tools": config.tools,
        "use_stt_output": config.use_stt_output,
        "mock_input": config.mock_input
    }
    return ("anthropic", config)
def google(config: LLMEvalConfig)
Expand source code
@staticmethod
def google(config: LLMEvalConfig):
    # Build ("google", config_dict); GOOGLE_API_KEY_LLM env var wins over config.api_key.
    config = {
        "model": config.model,
        "api_key": os.getenv("GOOGLE_API_KEY_LLM") or config.api_key,
        "system_prompt": config.system_prompt,
        "tools": config.tools,
        "use_stt_output": config.use_stt_output,
        "mock_input": config.mock_input
    }
    return ("google", config)
def openai(config: LLMEvalConfig)
Expand source code
@staticmethod
def openai(config: LLMEvalConfig):
    # Build ("openai", config_dict); OPENAI_API_KEY env var wins over config.api_key.
    config = {
        "model": config.model,
        "api_key": os.getenv("OPENAI_API_KEY") or config.api_key,
        "system_prompt": config.system_prompt,
        "tools": config.tools,
        "use_stt_output": config.use_stt_output,
        "mock_input": config.mock_input
    }
    return ("openai", config)
class LLMEvalConfig (model: str,
api_key: str | None = None,
system_prompt: str | None = None,
tools: List[Any] = None,
use_stt_output: bool = True,
mock_input: str | None = None)
Expand source code
@dataclass
class LLMEvalConfig:
    """Configuration for an LLM component under evaluation."""
    model: str                            # provider model identifier
    api_key: Optional[str] = None         # LLMComponent prefers the provider env var over this
    system_prompt: Optional[str] = None   # system instructions applied to the agent
    # Fix: was `List[Any] = field(default=None)` — the annotation did not admit
    # None; make it Optional with a plain None default (same runtime behavior).
    tools: Optional[List[Any]] = None
    use_stt_output: bool = True           # feed the STT transcript to the LLM when True
    mock_input: Optional[str] = None      # fixed user input used instead of STT output

LLMEvalConfig(model: str, api_key: Optional[str] = None, system_prompt: Optional[str] = None, tools: List[Any] = None, use_stt_output: bool = True, mock_input: Optional[str] = None)

Instance variables

var api_key : str | None
var mock_input : str | None
var model : str
var system_prompt : str | None
var tools : List[Any]
var use_stt_output : bool
class STTComponent
Expand source code
class STTComponent:
    """Builds (provider_name, config_dict) tuples for STT providers.

    The provider's environment API key takes precedence over the key supplied
    in the config object.
    """

    @staticmethod
    def _as_dict(env_var: str, config: STTEvalConfig) -> dict:
        # Shared config-dict construction; env var wins over config.api_key.
        return {
            "model": config.model,
            "api_key": os.getenv(env_var) or config.api_key,
            "file_path": config.file_path,
            "chunk_size": config.chunk_size,
            "sample_rate": config.sample_rate
        }

    @staticmethod
    def deepgram(config: STTEvalConfig):
        """STT spec for Deepgram."""
        return ("deepgram", STTComponent._as_dict("DEEPGRAM_API_KEY", config))

    # Fix: the original deepgramv2 was missing @staticmethod, so calling it on
    # an instance would have passed the instance as `config`.
    @staticmethod
    def deepgramv2(config: STTEvalConfig):
        """STT spec for Deepgram v2."""
        return ("deepgramv2", STTComponent._as_dict("DEEPGRAM_API_KEY", config))

    @staticmethod
    def openai(config: STTEvalConfig):
        """STT spec for OpenAI."""
        return ("openai", STTComponent._as_dict("OPENAI_API_KEY", config))

    @staticmethod
    def google(config: STTEvalConfig):
        """STT spec for Google."""
        return ("google", STTComponent._as_dict("GOOGLE_API_KEY", config))

Static methods

def deepgram(config: STTEvalConfig)
Expand source code
@staticmethod
def deepgram(config: STTEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("DEEPGRAM_API_KEY") or config.api_key,
        "file_path": config.file_path,
        "chunk_size": config.chunk_size,
        "sample_rate": config.sample_rate
    }
    return ("deepgram", config)
def google(config: STTEvalConfig)
Expand source code
@staticmethod
def google(config: STTEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("GOOGLE_API_KEY") or config.api_key,
        "file_path": config.file_path,
        "chunk_size": config.chunk_size,
        "sample_rate": config.sample_rate
    }
    return ("google", config)
def openai(config: STTEvalConfig)
Expand source code
@staticmethod
def openai(config: STTEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("OPENAI_API_KEY") or config.api_key,
        "file_path": config.file_path,
        "chunk_size": config.chunk_size,
        "sample_rate": config.sample_rate
    }
    return ("openai", config)

Methods

def deepgramv2(config: STTEvalConfig)
Expand source code
def deepgramv2(config: STTEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("DEEPGRAM_API_KEY") or config.api_key,
        "file_path": config.file_path,
        "chunk_size": config.chunk_size,
        "sample_rate": config.sample_rate
    }
    return ("deepgramv2", config)
class STTEvalConfig (file_path: str | None = None,
model: str | None = None,
api_key: str | None = None,
chunk_size: int = 96000,
sample_rate: int = 48000)
Expand source code
@dataclass
class STTEvalConfig:
    """Configuration for a speech-to-text evaluation run."""
    # Path to the audio file fed into the STT provider.
    file_path: Optional[str] = None
    # Provider-specific model identifier; annotation corrected to Optional
    # since the default is None.
    model: Optional[str] = None
    # Provider API key; environment variables take precedence in the factories.
    api_key: Optional[str] = None
    # Audio is streamed to the provider in chunks of this many bytes.
    chunk_size: int = 96000
    # Sample rate of the input audio in Hz.
    sample_rate: int = 48000

STTEvalConfig(file_path: Optional[str] = None, model: Optional[str] = None, api_key: Optional[str] = None, chunk_size: int = 96000, sample_rate: int = 48000)

Instance variables

var api_key : str | None
var chunk_size : int
var file_path : str | None
var model : str
var sample_rate : int
class TTSComponent
Expand source code
class TTSComponent:
    """Factories that pair a TTS provider name with its evaluation config dict.

    Every factory reads the provider API key from the environment when
    available, otherwise it uses the key on the supplied config object.
    """

    @staticmethod
    def _provider_config(env_var, config):
        # Single place that converts a TTSEvalConfig into the dict shape the
        # evaluation pipeline expects; the env key wins over config.api_key.
        return {
            "model": config.model,
            "api_key": os.getenv(env_var) or config.api_key,
            "use_llm_output": config.use_llm_output,
            "mock_input": config.mock_input,
        }

    @staticmethod
    def deepgram(config: TTSEvalConfig):
        """Evaluation config tuple for the Deepgram TTS provider."""
        return ("deepgram", TTSComponent._provider_config("DEEPGRAM_API_KEY", config))

    @staticmethod
    def openai(config: TTSEvalConfig):
        """Evaluation config tuple for the OpenAI TTS provider."""
        return ("openai", TTSComponent._provider_config("OPENAI_API_KEY", config))

    @staticmethod
    def google(config: TTSEvalConfig):
        """Evaluation config tuple for the Google TTS provider."""
        return ("google", TTSComponent._provider_config("GOOGLE_API_KEY", config))

    @staticmethod
    def cartesia(config: TTSEvalConfig):
        """Evaluation config tuple for the Cartesia TTS provider."""
        return ("cartesia", TTSComponent._provider_config("CARTESIA_API_KEY", config))

    @staticmethod
    def elevenlabs(config: TTSEvalConfig):
        """Evaluation config tuple for the ElevenLabs TTS provider."""
        return ("elevenlabs", TTSComponent._provider_config("ELEVENLABS_API_KEY", config))

Static methods

def cartesia(config: TTSEvalConfig)
Expand source code
@staticmethod
def cartesia(config: TTSEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("CARTESIA_API_KEY") or config.api_key,
        "use_llm_output": config.use_llm_output,
        "mock_input": config.mock_input
    }
    return ("cartesia", config)
def deepgram(config: TTSEvalConfig)
Expand source code
@staticmethod
def deepgram(config: TTSEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("DEEPGRAM_API_KEY") or config.api_key,
        "use_llm_output": config.use_llm_output,
        "mock_input": config.mock_input
    }   
    return ("deepgram", config)
def elevenlabs(config: TTSEvalConfig)
Expand source code
@staticmethod
def elevenlabs(config: TTSEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("ELEVENLABS_API_KEY") or config.api_key,
        "use_llm_output": config.use_llm_output,
        "mock_input": config.mock_input
    }
    return ("elevenlabs", config)
def google(config: TTSEvalConfig)
Expand source code
@staticmethod
def google(config: TTSEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("GOOGLE_API_KEY") or config.api_key,
        "use_llm_output": config.use_llm_output,
        "mock_input": config.mock_input
    }
    return ("google", config)
def openai(config: TTSEvalConfig)
Expand source code
@staticmethod
def openai(config: TTSEvalConfig):
    config = {
        "model": config.model,
        "api_key": os.getenv("OPENAI_API_KEY") or config.api_key,
        "use_llm_output": config.use_llm_output,
        "mock_input": config.mock_input
    }
    return ("openai", config)
class TTSEvalConfig (model: str,
api_key: str | None = None,
use_llm_output: bool = True,
mock_input: str | None = None)
Expand source code
@dataclass
class TTSEvalConfig:
    """Configuration for a text-to-speech evaluation run."""
    # Provider-specific voice/model identifier (required).
    model: str
    # Provider API key; environment variables take precedence in the factories.
    api_key: Optional[str] = None
    # When True, synthesize the upstream LLM's output; when False, use mock_input.
    use_llm_output: bool = True
    # Fixed text to synthesize when not consuming LLM output.
    mock_input: Optional[str] = None

TTSEvalConfig(model: str, api_key: Optional[str] = None, use_llm_output: bool = True, mock_input: Optional[str] = None)

Instance variables

var api_key : str | None
var mock_input : str | None
var model : str
var use_llm_output : bool