Package videosdk.plugins.silero

Sub-modules

videosdk.plugins.silero.model

Used by importlib.resources and setuptools

videosdk.plugins.silero.onnx_runtime
videosdk.plugins.silero.vad

Classes

class SileroVAD (input_sample_rate: int = 48000,
model_sample_rate: Literal[8000, 16000] = 16000,
threshold: float = 0.3,
min_speech_duration: float = 0.1,
min_silence_duration: float = 0.75,
force_cpu: bool = True)
class SileroVAD(BaseVAD):
    """Silero Voice Activity Detection implementation using ONNX runtime"""

    def __init__(
        self,
        input_sample_rate: int = 48000,
        model_sample_rate: Literal[8000, 16000] = 16000,
        threshold: float = 0.30,
        min_speech_duration: float = 0.1,
        min_silence_duration: float = 0.75,
        force_cpu: bool = True,
    ) -> None:
        """Initialize the Silero VAD plugin.

        Args:
            input_sample_rate (int): Sample rate of the incoming audio in Hz. Defaults to 48000.
            model_sample_rate (Literal[8000, 16000]): Sample rate the Silero model runs at. Must be one of: 8000, 16000. Defaults to 16000.
            threshold (float): Speech probability threshold; smoothed probabilities at or above this value count as speech. Defaults to 0.30.
            min_speech_duration (float): Minimum sustained speech, in seconds, before START_OF_SPEECH is emitted. Defaults to 0.1.
            min_silence_duration (float): Minimum sustained silence, in seconds, before END_OF_SPEECH is emitted. Defaults to 0.75.
            force_cpu (bool): Whether to force the ONNX runtime to run on CPU. Defaults to True.
        """

        if model_sample_rate not in SAMPLE_RATES:
            self.emit(
                "error", f"Invalid model sample rate {model_sample_rate}: must be one of {SAMPLE_RATES}")
            raise ValueError(
                f"Model sample rate {model_sample_rate} not supported. Must be one of {SAMPLE_RATES}")

        super().__init__(
            sample_rate=model_sample_rate,
            threshold=threshold,
            min_speech_duration=min_speech_duration,
            min_silence_duration=min_silence_duration
        )

        self._input_sample_rate = input_sample_rate
        self._model_sample_rate = model_sample_rate
        self._needs_resampling = input_sample_rate != model_sample_rate

        try:
            self._session = VadModelWrapper.create_inference_session(force_cpu)
            self._model = VadModelWrapper(
                session=self._session, rate=model_sample_rate)
        except Exception as e:
            self.emit("error", f"Failed to initialize VAD model: {str(e)}")
            raise

        # Exponentially smoothed speech probability.
        self._exp_filter = 0.0

        # Time the smoothed probability has stayed above / below threshold.
        self._speech_threshold_duration = 0.0
        self._silence_threshold_duration = 0.0

        # Published state reported in VAD events.
        self._pub_speaking = False
        self._pub_speech_duration = 0.0
        self._pub_silence_duration = 0.0
        self._pub_timestamp = 0.0

        # Fractional input samples carried between inference windows.
        self._remaining_input_fraction = 0.0

        # Raw input samples (int16) and model-rate samples (float32).
        self._input_accumulator = np.array([], dtype=np.int16)
        self._inference_accumulator = np.array([], dtype=np.float32)

        self._frame_count = 0
        self._inference_count = 0

        self._consecutive_low_confidence_count = 0
        self._error_emission_threshold = 10

    async def process_audio(self, audio_frames: bytes, **kwargs: Any) -> None:
        try:
            # Interpret the raw bytes as 16-bit signed PCM samples.
            input_frame_data = np.frombuffer(audio_frames, dtype=np.int16)

            self._input_accumulator = np.concatenate(
                [self._input_accumulator, input_frame_data])

            # Normalize to float32 in [-1.0, 1.0) and, if the input rate
            # differs from the model rate, resample before buffering.
            input_float = input_frame_data.astype(np.float32) / 32768.0
            if self._needs_resampling:
                target_length = int(
                    len(input_float) * self._model_sample_rate / self._input_sample_rate)
                if target_length > 0:
                    resampled_float = signal.resample(
                        input_float, target_length)
                    self._inference_accumulator = np.concatenate([
                        self._inference_accumulator,
                        resampled_float.astype(np.float32)
                    ])
            else:
                self._inference_accumulator = np.concatenate(
                    [self._inference_accumulator, input_float])

            # Run inference on fixed-size windows as they accumulate.
            while len(self._inference_accumulator) >= self._model.frame_size:
                inference_window = self._inference_accumulator[:self._model.frame_size]

                try:
                    raw_prob = self._model.process(inference_window)
                except Exception as e:
                    self.emit("error", f"VAD inference error: {e}")
                    raw_prob = 0.0

                # Exponentially smooth the raw probability to suppress
                # single-window spikes.
                alpha = 0.40
                self._exp_filter = alpha * raw_prob + \
                    (1 - alpha) * self._exp_filter

                window_duration = self._model.frame_size / self._model_sample_rate
                self._pub_timestamp += window_duration

                # Drop the input samples corresponding to this window,
                # carrying the fractional remainder so rounding does not
                # drift over time.
                resampling_ratio = self._input_sample_rate / self._model_sample_rate
                consumed = self._model.frame_size * \
                    resampling_ratio + self._remaining_input_fraction
                consumed_int = int(consumed)
                self._remaining_input_fraction = consumed - consumed_int

                if len(self._input_accumulator) >= consumed_int:
                    self._input_accumulator = self._input_accumulator[consumed_int:]

                if self._pub_speaking:
                    self._pub_speech_duration += window_duration
                else:
                    self._pub_silence_duration += window_duration

                if self._exp_filter >= self._threshold:
                    self._speech_threshold_duration += window_duration
                    self._silence_threshold_duration = 0.0

                    # Emit START_OF_SPEECH once speech has been sustained
                    # for at least min_speech_duration.
                    if not self._pub_speaking and self._speech_threshold_duration >= self._min_speech_duration:
                        self._pub_speaking = True
                        self._pub_silence_duration = 0.0
                        self._pub_speech_duration = self._speech_threshold_duration

                        self._send_speech_event(VADEventType.START_OF_SPEECH)
                else:
                    self._silence_threshold_duration += window_duration
                    self._speech_threshold_duration = 0.0

                    # Emit END_OF_SPEECH once silence has been sustained
                    # for at least min_silence_duration, then reset the
                    # model state for the next utterance.
                    if self._pub_speaking and self._silence_threshold_duration >= self._min_silence_duration:
                        self._pub_speaking = False
                        self._pub_speech_duration = 0.0
                        self._pub_silence_duration = self._silence_threshold_duration

                        self._send_speech_event(VADEventType.END_OF_SPEECH)
                        self._reset_model_state()

                self._inference_accumulator = self._inference_accumulator[self._model.frame_size:]

        except Exception as e:
            self.emit("error", f"VAD audio processing failed: {str(e)}")

    def _send_speech_event(self, event_type: VADEventType) -> None:
        response = VADResponse(
            event_type=event_type,
            data=VADData(
                is_speech=event_type == VADEventType.START_OF_SPEECH,
                confidence=self._exp_filter,
                timestamp=self._pub_timestamp,
                speech_duration=self._pub_speech_duration,
                silence_duration=self._pub_silence_duration
            )
        )
        # Deliver the event asynchronously via the registered callback.
        if self._vad_callback:
            asyncio.create_task(self._vad_callback(response))

    def _reset_model_state(self) -> None:
        """Reset model internal state when errors occur"""
        try:
            self._model._hidden_state = np.zeros((2, 1, 128), dtype=np.float32)
            self._model._prev_context = np.zeros(
                (1, self._model.history_len), dtype=np.float32)

            self._exp_filter = 0.0
            self._speech_threshold_duration = 0.0
            self._silence_threshold_duration = 0.0
        except Exception as e:
            self.emit("error", f"Failed to reset VAD model state: {e}")

    async def aclose(self) -> None:
        """Cleanup resources"""
        try:
            self._input_accumulator = np.array([], dtype=np.int16)
            self._inference_accumulator = np.array([], dtype=np.float32)
        except Exception as e:
            self.emit("error", f"Error during VAD cleanup: {str(e)}")

Silero Voice Activity Detection implementation using ONNX runtime

Initialize the Silero VAD plugin.

Args

input_sample_rate : int
Sample rate of the incoming audio in Hz. Defaults to 48000.
model_sample_rate : Literal[8000, 16000]
Sample rate the Silero model runs at. Must be one of: 8000, 16000. Defaults to 16000.
threshold : float
Speech probability threshold; smoothed probabilities at or above this value count as speech. Defaults to 0.30.
min_speech_duration : float
Minimum sustained speech, in seconds, before START_OF_SPEECH is emitted. Defaults to 0.1.
min_silence_duration : float
Minimum sustained silence, in seconds, before END_OF_SPEECH is emitted. Defaults to 0.75.
force_cpu : bool
Whether to force the ONNX runtime to run on CPU. Defaults to True.
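A minimal construction sketch using the documented defaults (this assumes the package re-exports SileroVAD at its top level; the class is defined in videosdk.plugins.silero.vad):

from videosdk.plugins.silero import SileroVAD

vad = SileroVAD(
    input_sample_rate=48000,    # rate of the PCM you will feed in
    model_sample_rate=16000,    # Silero runs at 8000 or 16000 Hz
    threshold=0.3,              # smoothed speech-probability cutoff
    min_speech_duration=0.1,    # seconds of speech before START_OF_SPEECH
    min_silence_duration=0.75,  # seconds of silence before END_OF_SPEECH
    force_cpu=True,             # keep ONNX inference on the CPU
)

Lowering threshold or min_speech_duration makes detection more eager; raising min_silence_duration tolerates longer pauses inside an utterance.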

Ancestors

  • videosdk.agents.vad.VAD
  • videosdk.agents.event_emitter.EventEmitter
  • typing.Generic

Methods

async def aclose(self) -> None

Release buffered audio and clean up resources.

async def process_audio(self, audio_frames: bytes, **kwargs: Any) -> None

Process audio frames and detect voice activity

Args

audio_frames : bytes
Raw 16-bit signed PCM audio at the configured input sample rate.
**kwargs
Additional provider-specific arguments.

Returns

None. Detected speech events are delivered as VADResponse objects through the registered VAD callback.
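A minimal end-to-end sketch. The public way to register the callback lives in the BaseVAD ancestor and is not shown in this module; assigning _vad_callback directly here mirrors what _send_speech_event invokes and is for illustration only:

import asyncio

import numpy as np

from videosdk.plugins.silero import SileroVAD


async def main() -> None:
    vad = SileroVAD(input_sample_rate=48000)

    async def on_event(response) -> None:
        data = response.data
        print(response.event_type,
              f"confidence={data.confidence:.2f}",
              f"t={data.timestamp:.2f}s")

    # Illustrative only: prefer the registration API provided by BaseVAD.
    vad._vad_callback = on_event

    # Feed 16-bit PCM chunks; events (if any) arrive via the callback.
    rate = 48000
    silence = np.zeros(rate // 10, dtype=np.int16)   # 100 ms of silence
    noise = np.clip(np.random.randn(rate // 10) * 8000,
                    -32768, 32767).astype(np.int16)  # 100 ms of noise
    for chunk in (silence, noise):
        await vad.process_audio(chunk.tobytes())

    await asyncio.sleep(0.1)  # let any pending event callbacks run
    await vad.aclose()


asyncio.run(main())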