Package videosdk.plugins.turn_detector

Sub-modules

videosdk.plugins.turn_detector.download_model
videosdk.plugins.turn_detector.model
videosdk.plugins.turn_detector.turn_detector
videosdk.plugins.turn_detector.turn_detector_v2
videosdk.plugins.turn_detector.turn_detector_v3

Functions

def pre_download_model()
Expand source code
def pre_download_model():
    from transformers import AutoTokenizer
    AutoTokenizer.from_pretrained(HG_MODEL)
    _download_from_hf_hub(
        repo_id=HG_MODEL,
        filename=ONNX_FILENAME,
    )
def pre_download_namo_turn_v1_model(overwrite_existing: bool = False, language: str | None = None)
Expand source code
def pre_download_namo_turn_v1_model(overwrite_existing: bool = False, language: Optional[str] = None):
    hf_repo = _get_hf_model_repo(language)
    
    if language is None:
        AutoTokenizer.from_pretrained(hf_repo)
    else:
        DistilBertTokenizer.from_pretrained(hf_repo)
def pre_download_videosdk_model(overwrite_existing: bool = False)
Expand source code
def pre_download_videosdk_model(overwrite_existing: bool = False):
    from .download_model import download_model_files_to_directory
    download_model_files_to_directory(
        base_cdn_url=VIDEOSDK_MODEL_URL,
        file_names=VIDEOSDK_MODEL_FILES,
        local_save_directory=MODEL_DIR,
        overwrite_existing=overwrite_existing,
    )
    BertTokenizer.from_pretrained(MODEL_DIR)
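
These helpers can be called ahead of time (for example in a container build step or a service warm-up hook) so that model weights are already cached before a detector is first constructed. A minimal sketch, assuming the package-level imports shown in this listing:

from videosdk.plugins.turn_detector import (
    pre_download_model,
    pre_download_namo_turn_v1_model,
    pre_download_videosdk_model,
)

# Fetch the TurnSense tokenizer and ONNX weights from the Hugging Face Hub.
pre_download_model()

# Fetch the Namo Turn Detection v1 tokenizer; pass `language` only if you use a
# language-specific variant (which codes are valid is not documented here).
pre_download_namo_turn_v1_model()

# Fetch the VideoSDK BERT model files into the local model directory,
# re-downloading them even if they are already present.
pre_download_videosdk_model(overwrite_existing=True)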

Classes

class NamoTurnDetectorV1 (threshold: float = 0.7, language: str | None = None, **kwargs)
Expand source code
class NamoTurnDetectorV1(EOU):
    """
    A lightweight end-of-utterance detector based on VideoSDK's Namo Turn Detection v1 model.
    """
    
    def __init__(self, threshold: float = 0.7, language: Optional[str] = None, **kwargs):

        super().__init__(threshold=threshold, **kwargs)
        self.language = language
        self.session = None
        self.tokenizer = None
        self._initialize_model()
    
    def _initialize_model(self):
        """Initialize the ONNX model and tokenizer"""
        try:
            import onnxruntime as ort
            
            hf_repo = _get_hf_model_repo(self.language)
            
            if self.language is None:
                self.tokenizer = AutoTokenizer.from_pretrained(hf_repo)
                self.max_length = 8192
            else:
                self.tokenizer = DistilBertTokenizer.from_pretrained(hf_repo)
                self.max_length = 512
                        

            model_path = hf_hub_download(repo_id=hf_repo, filename="model_quant.onnx")
            
            self.session = ort.InferenceSession(model_path)
            print(f"Model loaded successfully from {hf_repo}.")
            
        except Exception as e:
            print(f"Error loading model: {e}")
            logger.error(f"Failed to initialize TurnDetection model: {e}")
            self.emit("error", f"Failed to initialize TurnDetection model: {str(e)}")
            raise
    
    def _get_last_user_message(self, chat_context: ChatContext) -> str:
        """
        Extract the last user message from chat context.
        This is what we want to analyze for EOU detection.
        """
        user_messages = [
            item for item in chat_context.items 
            if isinstance(item, ChatMessage) and item.role == ChatRole.USER
        ]
        
        if not user_messages:
            return ""
        
        last_message = user_messages[-1]
        content = last_message.content
        
        if isinstance(content, list):
            text_content = " ".join([c.text if hasattr(c, 'text') else str(c) for c in content])
        else:
            text_content = str(content)
        
        return text_content.strip()
    
    def _chat_context_to_text(self, chat_context: ChatContext) -> str:
        """
        Transform ChatContext to model-compatible format.
        Focus on the last user message for EOU detection.
        """
        last_user_text = self._get_last_user_message(chat_context)
        
        if not last_user_text:
            return ""
        
        return last_user_text

    def detect_turn(self, sentence: str) -> float:
        """
        Detect turn probability for the given sentence.
        """
        try:
            inputs = self.tokenizer(sentence.strip(), truncation=True, max_length=self.max_length, return_tensors="np")
            
            input_dict = {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"]
            }
            
            if "token_type_ids" in inputs:
                input_dict["token_type_ids"] = inputs["token_type_ids"]
            
            outputs = self.session.run(None, input_dict)
            
            logits = outputs[0][0]
            
            exp_logits = np.exp(logits - np.max(logits))
            probabilities = exp_logits / np.sum(exp_logits)
            
            eou_probability = float(probabilities[1])
            
            return eou_probability
            
        except Exception as e:
            print(e)
            logger.error(f"Error detecting turn: {e}")
            self.emit("error", f"Error detecting turn: {str(e)}")
            return 0.0

    def get_eou_probability(self, chat_context: ChatContext) -> float:
        """
        Get the probability score for end of utterance detection.
        """
        try:
            sentence = self._chat_context_to_text(chat_context)
            if not sentence:
                return 0.0
            return self.detect_turn(sentence)
        except Exception as e:
            logger.error(f"Error getting EOU probability: {e}")
            self.emit("error", f"Error getting EOU probability: {str(e)}")
            return 0.0

    def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
        """
        Detect if the given chat context represents an end of utterance.
        """
        try:
            effective_threshold = threshold if threshold is not None else self.threshold
            
            probability = self.get_eou_probability(chat_context)
            return probability >= effective_threshold
            
        except Exception as e:
            logger.error(f"Error in EOU detection: {e}")
            self.emit("error", f"Error in EOU detection: {str(e)}")
            return False
    
    async def aclose(self) -> None:
        """Cleanup ONNX model and tokenizer from memory"""
        logger.info("Cleaning up NamoTurnDetectorV1 model resources")
        
        if hasattr(self, 'session') and self.session is not None:
            try:
                del self.session
                self.session = None
                logger.info("Namo ONNX session cleaned up")
            except Exception as e:
                logger.error(f"Error cleaning up Namo ONNX session: {e}")
        
        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
            try:
                del self.tokenizer
                self.tokenizer = None
                logger.info("Namo tokenizer cleaned up")
            except Exception as e:
                logger.error(f"Error cleaning up Namo tokenizer: {e}")
        self.language = None
        
        try:
            import gc
            gc.collect()
            logger.info("Garbage collection completed")
        except Exception as e:
            logger.error(f"Error during garbage collection: {e}")
        
        logger.info("NamoTurnDetectorV1 cleanup completed")
        await super().aclose()

A lightweight end-of-utterance detector based on VideoSDK's Namo Turn Detection v1 model.

Ancestors

  • videosdk.agents.eou.EOU
  • videosdk.agents.event_emitter.EventEmitter
  • typing.Generic
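
A minimal usage sketch, assuming the class is imported from this package as documented on this page; detect_turn scores a raw sentence directly, while the ChatContext-based methods below plug into an agent pipeline:

import asyncio
from videosdk.plugins.turn_detector import NamoTurnDetectorV1

async def main():
    # Loads the tokenizer and the quantized ONNX model from the Hugging Face Hub.
    detector = NamoTurnDetectorV1(threshold=0.7)

    # Scores close to 1.0 mean the speaker has likely finished their turn.
    probability = detector.detect_turn("That's all I wanted to ask, thanks.")
    print(f"EOU probability: {probability:.2f}, finished: {probability >= 0.7}")

    # Release the ONNX session and tokenizer when the agent shuts down.
    await detector.aclose()

asyncio.run(main())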

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """Cleanup ONNX model and tokenizer from memory"""
    logger.info("Cleaning up NamoTurnDetectorV1 model resources")
    
    if hasattr(self, 'session') and self.session is not None:
        try:
            del self.session
            self.session = None
            logger.info("Namo ONNX session cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up Namo ONNX session: {e}")
    
    if hasattr(self, 'tokenizer') and self.tokenizer is not None:
        try:
            del self.tokenizer
            self.tokenizer = None
            logger.info("Namo tokenizer cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up Namo tokenizer: {e}")
    self.language = None
    
    try:
        import gc
        gc.collect()
        logger.info("Garbage collection completed")
    except Exception as e:
        logger.error(f"Error during garbage collection: {e}")
    
    logger.info("NamoTurnDetectorV1 cleanup completed")
    await super().aclose()

Cleanup ONNX model and tokenizer from memory

def detect_end_of_utterance(self,
chat_context: videosdk.agents.llm.chat_context.ChatContext,
threshold: float | None = None) ‑> bool
Expand source code
def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
    """
    Detect if the given chat context represents an end of utterance.
    """
    try:
        effective_threshold = threshold if threshold is not None else self.threshold
        
        probability = self.get_eou_probability(chat_context)
        return probability >= effective_threshold
        
    except Exception as e:
        logger.error(f"Error in EOU detection: {e}")
        self.emit("error", f"Error in EOU detection: {str(e)}")
        return False

Detect if the given chat context represents an end of utterance.
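
Continuing the sketch above, a stricter per-call threshold can override the instance default; here chat_ctx is assumed to be the ChatContext your agent session already maintains:

# `chat_ctx` is assumed to be an existing ChatContext from the agent session.
if detector.detect_end_of_utterance(chat_ctx, threshold=0.85):
    ...  # the last user message very likely completes the turn; respond now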

def detect_turn(self, sentence: str) ‑> float
Expand source code
def detect_turn(self, sentence: str) -> float:
    """
    Detect turn probability for the given sentence.
    """
    try:
        inputs = self.tokenizer(sentence.strip(), truncation=True, max_length=self.max_length, return_tensors="np")
        
        input_dict = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"]
        }
        
        if "token_type_ids" in inputs:
            input_dict["token_type_ids"] = inputs["token_type_ids"]
        
        outputs = self.session.run(None, input_dict)
        
        logits = outputs[0][0]
        
        exp_logits = np.exp(logits - np.max(logits))
        probabilities = exp_logits / np.sum(exp_logits)
        
        eou_probability = float(probabilities[1])
        
        return eou_probability
        
    except Exception as e:
        print(e)
        logger.error(f"Error detecting turn: {e}")
        self.emit("error", f"Error detecting turn: {str(e)}")
        return 0.0

Detect turn probability for the given sentence.
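
The score is the softmax of the classifier's two output logits, with index 1 taken as the end-of-utterance class. A standalone sketch of that step with made-up logits:

import numpy as np

logits = np.array([0.4, 1.9])                 # hypothetical [not-EOU, EOU] logits
exp_logits = np.exp(logits - np.max(logits))  # subtract the max for numerical stability
probabilities = exp_logits / exp_logits.sum()
eou_probability = float(probabilities[1])     # ~0.82 for these logits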

def get_eou_probability(self, chat_context: videosdk.agents.llm.chat_context.ChatContext) ‑> float
Expand source code
def get_eou_probability(self, chat_context: ChatContext) -> float:
    """
    Get the probability score for end of utterance detection.
    """
    try:
        sentence = self._chat_context_to_text(chat_context)
        if not sentence:
            return 0.0
        return self.detect_turn(sentence)
    except Exception as e:
        logger.error(f"Error getting EOU probability: {e}")
        self.emit("error", f"Error getting EOU probability: {str(e)}")
        return 0.0

Get the probability score for end of utterance detection.

class TurnDetector (threshold: float = 0.7, **kwargs)
Expand source code
class TurnDetector(EOU):
    """
    A lightweight end-of-utterance detector built on TurnSense.
    Based on SmolLM2-135M and optimized for edge devices.
    """

    def __init__(self, threshold: float = 0.7, **kwargs):
        """Initialize the TurnDetector plugin.

        Args:
            threshold (float): The threshold for end-of-utterance detection. Defaults to 0.7.
            **kwargs: Additional keyword arguments to pass to the parent class.
        """
        super().__init__(threshold=threshold, **kwargs)
        self.session = None
        self.tokenizer = None
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the ONNX model and tokenizer"""
        try:
            import onnxruntime as ort
            from transformers import AutoTokenizer

            self.tokenizer = AutoTokenizer.from_pretrained(HG_MODEL)

            model_path = _download_from_hf_hub(
                repo_id=HG_MODEL,
                filename=ONNX_FILENAME,
            )

            self.session = ort.InferenceSession(
                model_path,
                providers=["CPUExecutionProvider"]
            )

        except Exception as e:
            logger.error(f"Failed to initialize TurnSense model: {e}")
            self.emit(
                "error", f"Failed to initialize TurnSense model: {str(e)}")
            raise

    def _get_last_user_message(self, chat_context: ChatContext) -> str:
        """
        Extract the last user message from chat context.
        This is what we want to analyze for EOU detection.

        Args:
            chat_context: The chat context to analyze

        Returns:
            str: The last user message content
        """
        user_messages = [
            item for item in chat_context.items
            if isinstance(item, ChatMessage) and item.role == ChatRole.USER
        ]

        if not user_messages:
            return ""

        last_message = user_messages[-1]
        content = last_message.content

        if isinstance(content, list):
            text_content = " ".join(
                [c.text if hasattr(c, 'text') else str(c) for c in content])
        else:
            text_content = str(content)

        return text_content.strip()

    def _chat_context_to_text(self, chat_context: ChatContext) -> str:
        """
        Transform ChatContext to model-compatible format.
        Focus on the last user message for EOU detection.

        Args:
            chat_context: The chat context to transform

        Returns:
            str: Formatted text for the model
        """
        last_user_text = self._get_last_user_message(chat_context)

        if not last_user_text:
            return "<|user|>  <|im_end|>"

        formatted_text = f"<|user|> {last_user_text} <|im_end|>"

        return formatted_text

    def get_eou_probability(self, chat_context: ChatContext) -> float:
        """
        Get the probability score for end of utterance detection.

        Args:
            chat_context: Chat context to analyze

        Returns:
            float: Probability score (0.0 to 1.0)
        """
        if not self.session or not self.tokenizer:
            self.emit("error", "TurnSense model not initialized")
            raise RuntimeError("Model not initialized")

        try:
            formatted_text = self._chat_context_to_text(chat_context)

            inputs = self.tokenizer(
                formatted_text,
                padding="max_length",
                max_length=256,
                truncation=True,
                return_tensors="np"
            )

            ort_inputs = {
                'input_ids': inputs['input_ids'].astype(np.int64),
                'attention_mask': inputs['attention_mask'].astype(np.int64)
            }

            outputs = self.session.run(None, ort_inputs)

            probabilities = outputs[0]

            eou_prob = float(probabilities[0][1])

            return eou_prob

        except Exception as e:
            logger.error(f"Error getting EOU probability: {e}")
            self.emit("error", f"Error getting EOU probability: {str(e)}")
            return 0.0

    def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
        """
        Detect if the given chat context represents an end of utterance.

        Args:
            chat_context: Chat context to analyze
            threshold: Probability threshold for EOU detection (uses instance threshold if None)

        Returns:
            bool: True if end of utterance is detected, False otherwise
        """
        if threshold is None:
            threshold = self.threshold

        try:
            probability = self.get_eou_probability(chat_context)
            is_eou = probability >= threshold

            if not is_eou:
                self.emit(
                    "error", f"Turn detection failed: probability {probability} below threshold {threshold}")

            return is_eou

        except Exception as e:
            logger.error(f"Error during EOU detection: {e}")
            self.emit("error", f"Error during EOU detection: {str(e)}")
            return False
    
    async def aclose(self) -> None:
        """Cleanup ONNX model and tokenizer from memory"""
        logger.info("Cleaning up TurnDetector model resources")
        if hasattr(self, 'session') and self.session is not None:
            try:
                del self.session
                self.session = None
                logger.info("ONNX session cleaned up")
            except Exception as e:
                logger.error(f"Error cleaning up ONNX session: {e}")

        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
            try:
                del self.tokenizer
                self.tokenizer = None
                logger.info("Tokenizer cleaned up")
            except Exception as e:
                logger.error(f"Error cleaning up tokenizer: {e}")        
        try:
            import gc
            gc.collect()
            logger.info("Garbage collection completed")
        except Exception as e:
            logger.error(f"Error during garbage collection: {e}")
        
        logger.info("TurnDetector cleanup completed")
        await super().aclose()

A lightweight end-of-utterance detector built on TurnSense. Based on SmolLM2-135M and optimized for edge devices.

Initialize the TurnDetector plugin.

Args

threshold : float
The threshold for end-of-utterance detection. Defaults to 0.7.
**kwargs
Additional keyword arguments to pass to the parent class.

Ancestors

  • videosdk.agents.eou.EOU
  • videosdk.agents.event_emitter.EventEmitter
  • typing.Generic
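
A minimal sketch of using the detector inside an agent loop, assuming chat_ctx is the ChatContext already tracked by your session:

from videosdk.plugins.turn_detector import TurnDetector

# Downloads the TurnSense tokenizer and ONNX model on first construction.
detector = TurnDetector(threshold=0.7)

# `chat_ctx` is assumed to be the ChatContext tracked by the agent session.
if detector.detect_end_of_utterance(chat_ctx):
    print("User finished speaking; hand the turn to the LLM.")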

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """Cleanup ONNX model and tokenizer from memory"""
    logger.info("Cleaning up TurnDetector model resources")
    if hasattr(self, 'session') and self.session is not None:
        try:
            del self.session
            self.session = None
            logger.info("ONNX session cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up ONNX session: {e}")

    if hasattr(self, 'tokenizer') and self.tokenizer is not None:
        try:
            del self.tokenizer
            self.tokenizer = None
            logger.info("Tokenizer cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up tokenizer: {e}")        
    try:
        import gc
        gc.collect()
        logger.info("Garbage collection completed")
    except Exception as e:
        logger.error(f"Error during garbage collection: {e}")
    
    logger.info("TurnDetector cleanup completed")
    await super().aclose()

Cleanup ONNX model and tokenizer from memory

def detect_end_of_utterance(self,
chat_context: videosdk.agents.llm.chat_context.ChatContext,
threshold: float | None = None) ‑> bool
Expand source code
def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
    """
    Detect if the given chat context represents an end of utterance.

    Args:
        chat_context: Chat context to analyze
        threshold: Probability threshold for EOU detection (uses instance threshold if None)

    Returns:
        bool: True if end of utterance is detected, False otherwise
    """
    if threshold is None:
        threshold = self.threshold

    try:
        probability = self.get_eou_probability(chat_context)
        is_eou = probability >= threshold

        if not is_eou:
            self.emit(
                "error", f"Turn detection failed: probability {probability} below threshold {threshold}")

        return is_eou

    except Exception as e:
        logger.error(f"Error during EOU detection: {e}")
        self.emit("error", f"Error during EOU detection: {str(e)}")
        return False

Detect if the given chat context represents an end of utterance.

Args

chat_context
Chat context to analyze
threshold
Probability threshold for EOU detection (uses instance threshold if None)

Returns

bool
True if end of utterance is detected, False otherwise
def get_eou_probability(self, chat_context: videosdk.agents.llm.chat_context.ChatContext) ‑> float
Expand source code
def get_eou_probability(self, chat_context: ChatContext) -> float:
    """
    Get the probability score for end of utterance detection.

    Args:
        chat_context: Chat context to analyze

    Returns:
        float: Probability score (0.0 to 1.0)
    """
    if not self.session or not self.tokenizer:
        self.emit("error", "TurnSense model not initialized")
        raise RuntimeError("Model not initialized")

    try:
        formatted_text = self._chat_context_to_text(chat_context)

        inputs = self.tokenizer(
            formatted_text,
            padding="max_length",
            max_length=256,
            truncation=True,
            return_tensors="np"
        )

        ort_inputs = {
            'input_ids': inputs['input_ids'].astype(np.int64),
            'attention_mask': inputs['attention_mask'].astype(np.int64)
        }

        outputs = self.session.run(None, ort_inputs)

        probabilities = outputs[0]

        eou_prob = float(probabilities[0][1])

        return eou_prob

    except Exception as e:
        logger.error(f"Error getting EOU probability: {e}")
        self.emit("error", f"Error getting EOU probability: {str(e)}")
        return 0.0

Get the probability score for end of utterance detection.

Args

chat_context
Chat context to analyze

Returns

float
Probability score (0.0 to 1.0)
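
Internally the last user message is wrapped in TurnSense's chat markers before tokenization; the formatting is equivalent to:

last_user_text = "could you also send me the invoice"
formatted_text = f"<|user|> {last_user_text} <|im_end|>"
# -> "<|user|> could you also send me the invoice <|im_end|>"
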
class VideoSDKTurnDetector (threshold: float = 0.7, **kwargs)
Expand source code
class VideoSDKTurnDetector(EOU):
    """
    A lightweight end-of-utterance detector based on VideoSDK's Turn Detection model.
    It uses a BERT backbone and performs binary classification for turn detection.
    """
    
    def __init__(self, threshold: float = 0.7, **kwargs):
        """Initialize the VideoSDKTurnDetector plugin.
        """
        super().__init__(threshold=threshold, **kwargs)
        self.session = None
        self.tokenizer = None
        self._initialize_model()
    
    def _initialize_model(self):
        """Initialize the ONNX model and tokenizer"""
        try:
            import onnxruntime as ort
            
            if not os.path.exists(MODEL_DIR):
                logger.warning(f"Model directory {MODEL_DIR} does not exist. Running pre_download_model()...")
                pre_download_videosdk_model(overwrite_existing=True)
            
            pre_download_videosdk_model(overwrite_existing=False)
            
            self.tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
            
            model_path = os.path.join(MODEL_DIR, "model.onnx")
            
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model file not found at {model_path}")
            
            self.session = ort.InferenceSession(model_path)
            print(f"Model loaded successfully.")
            
        except Exception as e:
            print(f"Error loading model: {e}")
            logger.error(f"Failed to initialize TurnDetection model: {e}")
            self.emit("error", f"Failed to initialize TurnDetection model: {str(e)}")
            raise
    
    def _get_last_user_message(self, chat_context: ChatContext) -> str:
        """
        Extract the last user message from chat context.
        This is what we want to analyze for EOU detection.
        
        Args:
            chat_context: The chat context to analyze
            
        Returns:
            str: The last user message content
        """
        user_messages = [
            item for item in chat_context.items 
            if isinstance(item, ChatMessage) and item.role == ChatRole.USER
        ]
        
        if not user_messages:
            return ""
        
        last_message = user_messages[-1]
        content = last_message.content
        
        if isinstance(content, list):
            text_content = " ".join([c.text if hasattr(c, 'text') else str(c) for c in content])
        else:
            text_content = str(content)
        
        return text_content.strip()
    
    def _chat_context_to_text(self, chat_context: ChatContext) -> str:
        """
        Transform ChatContext to model-compatible format.
        Focus on the last user message for EOU detection.
        
        Args:
            chat_context: The chat context to transform
            
        Returns:
            str: Raw text for the model (without special formatting)
        """
        last_user_text = self._get_last_user_message(chat_context)
        
        if not last_user_text:
            return ""
        
        return last_user_text

    def detect_turn(self, sentence: str):
        """
        Args:
            sentence: Input sentence to analyze
            
        Returns:
            str: "True" if turn detected, "False" otherwise
        """
        try:
            inputs = self.tokenizer(sentence.strip(), truncation=True, max_length=512, return_tensors="np")
            outputs = self.session.run(None, {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"],
                "token_type_ids": inputs["token_type_ids"],
            })
            pred = np.argmax(outputs)
            if pred == 0:
                pred = "False"
            else:
                pred = "True"
            if pred == "False":
                self.emit("error", f"Turn detection failed: result was {pred}")
            return pred
        except Exception as e:
            print(e)
            self.emit("error", f"Error detecting turn: {str(e)}")
            return "False"

    def get_eou_probability(self, chat_context: ChatContext) -> float:
        """
        Get the probability score for end of utterance detection.
        For binary classifier, returns 1.0 or 0.0 based on classification.
        
        Args:
            chat_context: Chat context to analyze
            
        Returns:
            float: Probability score (0.0 or 1.0)
        """
        try:
            sentence = self._chat_context_to_text(chat_context)
            result = self.detect_turn(sentence)
            return 1.0 if result == "True" else 0.0
        except Exception as e:
            logger.error(f"Error getting EOU probability: {e}")
            self.emit("error", f"Error getting EOU probability: {str(e)}")
            return 0.0

    def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
        """
        Detect if the given chat context represents an end of utterance.
        Uses direct binary classification, ignoring threshold.
        
        Args:
            chat_context: Chat context to analyze
            threshold: Not used in binary classification
            
        Returns:
            bool: True if end of utterance is detected, False otherwise
        """
        try:
            sentence = self._chat_context_to_text(chat_context)
            result = self.detect_turn(sentence)
            return result == "True"
        except Exception as e:
            logger.error(f"Error in EOU detection: {e}")
            self.emit("error", f"Error in EOU detection: {str(e)}")
            return False
    
    async def aclose(self) -> None:
        """Cleanup ONNX model and tokenizer from memory"""
        logger.info("Cleaning up VideoSDKTurnDetector model resources")
        if hasattr(self, 'session') and self.session is not None:
            try:
                del self.session
                self.session = None
                logger.info("VideoSDK ONNX session cleaned up")
            except Exception as e:
                logger.error(f"Error cleaning up VideoSDK ONNX session: {e}")

        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
            try:
                del self.tokenizer
                self.tokenizer = None
                logger.info("VideoSDK tokenizer cleaned up")
            except Exception as e:
                logger.error(f"Error cleaning up VideoSDK tokenizer: {e}")

        try:
            import gc
            gc.collect()
            logger.info("Garbage collection completed")
        except Exception as e:
            logger.error(f"Error during garbage collection: {e}")
        
        logger.info("VideoSDKTurnDetector cleanup completed")        
        await super().aclose()

A lightweight end-of-utterance detector based on VideoSDK's Turn Detection model. It uses a BERT backbone and performs binary classification for turn detection.

Initialize the VideoSDKTurnDetector plugin.

Ancestors

  • videosdk.agents.eou.EOU
  • videosdk.agents.event_emitter.EventEmitter
  • typing.Generic
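
A minimal sketch, assuming the class and the pre-download helper are imported from this package as documented on this page; note that detect_turn returns the strings "True" and "False", while detect_end_of_utterance returns a bool:

from videosdk.plugins.turn_detector import (
    VideoSDKTurnDetector,
    pre_download_videosdk_model,
)

# Optional: fetch the BERT model files ahead of time; otherwise the constructor
# downloads any missing files on first use.
pre_download_videosdk_model()

detector = VideoSDKTurnDetector()

# Binary classification on a raw sentence; the result is the string "True" or "False".
result = detector.detect_turn("That's everything, thank you.")
print(result == "True")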

Methods

async def aclose(self) ‑> None
Expand source code
async def aclose(self) -> None:
    """Cleanup ONNX model and tokenizer from memory"""
    logger.info("Cleaning up VideoSDKTurnDetector model resources")
    if hasattr(self, 'session') and self.session is not None:
        try:
            del self.session
            self.session = None
            logger.info("VideoSDK ONNX session cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up VideoSDK ONNX session: {e}")

    if hasattr(self, 'tokenizer') and self.tokenizer is not None:
        try:
            del self.tokenizer
            self.tokenizer = None
            logger.info("VideoSDK tokenizer cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up VideoSDK tokenizer: {e}")

    try:
        import gc
        gc.collect()
        logger.info("Garbage collection completed")
    except Exception as e:
        logger.error(f"Error during garbage collection: {e}")
    
    logger.info("VideoSDKTurnDetector cleanup completed")        
    await super().aclose()

Cleanup ONNX model and tokenizer from memory

def detect_end_of_utterance(self,
chat_context: videosdk.agents.llm.chat_context.ChatContext,
threshold: float | None = None) ‑> bool
Expand source code
def detect_end_of_utterance(self, chat_context: ChatContext, threshold: Optional[float] = None) -> bool:
    """
    Detect if the given chat context represents an end of utterance.
    Uses direct binary classification, ignoring threshold.
    
    Args:
        chat_context: Chat context to analyze
        threshold: Not used in binary classification
        
    Returns:
        bool: True if end of utterance is detected, False otherwise
    """
    try:
        sentence = self._chat_context_to_text(chat_context)
        result = self.detect_turn(sentence)
        return result == "True"
    except Exception as e:
        logger.error(f"Error in EOU detection: {e}")
        self.emit("error", f"Error in EOU detection: {str(e)}")
        return False

Detect if the given chat context represents an end of utterance. Uses direct binary classification, ignoring threshold.

Args

chat_context
Chat context to analyze
threshold
Not used in binary classification

Returns

bool
True if end of utterance is detected, False otherwise
def detect_turn(self, sentence: str)
Expand source code
def detect_turn(self, sentence: str):
    """
    Args:
        sentence: Input sentence to analyze
        
    Returns:
        str: "True" if turn detected, "False" otherwise
    """
    try:
        inputs = self.tokenizer(sentence.strip(), truncation=True, max_length=512, return_tensors="np")
        outputs = self.session.run(None, {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "token_type_ids": inputs["token_type_ids"],
        })
        pred = np.argmax(outputs)
        if pred == 0:
            pred = "False"
        else:
            pred = "True"
        if pred == "False":
            self.emit("error", f"Turn detection failed: result was {pred}")
        return pred
    except Exception as e:
        print(e)
        self.emit("error", f"Error detecting turn: {str(e)}")
        return "False"

Args

sentence
Input sentence to analyze

Returns

str
"True" if turn detected, "False" otherwise
def get_eou_probability(self, chat_context: videosdk.agents.llm.chat_context.ChatContext) ‑> float
Expand source code
def get_eou_probability(self, chat_context: ChatContext) -> float:
    """
    Get the probability score for end of utterance detection.
    For binary classifier, returns 1.0 or 0.0 based on classification.
    
    Args:
        chat_context: Chat context to analyze
        
    Returns:
        float: Probability score (0.0 or 1.0)
    """
    try:
        sentence = self._chat_context_to_text(chat_context)
        result = self.detect_turn(sentence)
        return 1.0 if result == "True" else 0.0
    except Exception as e:
        logger.error(f"Error getting EOU probability: {e}")
        self.emit("error", f"Error getting EOU probability: {str(e)}")
        return 0.0

Get the probability score for end of utterance detection. For binary classifier, returns 1.0 or 0.0 based on classification.

Args

chat_context
Chat context to analyze

Returns

float
Probability score (0.0 or 1.0)