Module `agents.tokenize.indic`

Functions

def pre_warm_tokenizer() ‑> None

Expand source code

def pre_warm_tokenizer() -> None:
    """Pay the ``indic-nlp-library`` import cost up front.

    Importing ``indicnlp.tokenize.sentence_tokenize`` for the first time runs
    expensive initialisation (~6s on a cold Python process). Invoke this at
    worker start, next to ``TurnDetector.pre_download_model()``, so that the
    first ``.tokenize()`` call, and therefore the first conversational turn,
    does not have to absorb that delay.
    """
    # Only the import side effect matters here; the returned splitter is dropped.
    _load_sentence_split()

Eagerly import indic-nlp-library so the first .tokenize() call is cheap.

The underlying indicnlp.tokenize.sentence_tokenize module performs its expensive initialisation on first import (~6s on a cold Python process). Calling this at worker start — alongside TurnDetector.pre_download_model() — moves that cost out of the first conversational turn.

Classes

class IndicScriptTransliterator (*, source: str, target: str)

Expand source code

class IndicScriptTransliterator:
    """Convert text between Indic scripts via ``UnicodeIndicTransliterator``.

    Handy as a ``TextFilter``-adjacent utility for cross-Indic setups, e.g.
    when the LLM emits Hindi but the TTS speaks Telugu.

    Example::

        from videosdk.agents.tokenize import IndicScriptTransliterator

        trans = IndicScriptTransliterator(source="hi", target="te")
        out = trans.convert("नमस्ते दुनिया")
        # out == "నమస్తే దునియా" (approximate phonetic conversion)
    """

    def __init__(self, *, source: str, target: str) -> None:
        """Remember the source and target language codes (keyword-only)."""
        self._source, self._target = source, target

    def convert(self, text: str) -> str:
        """Transliterate ``text`` from the source script to the target script.

        Raises:
            ImportError: If ``indic-nlp-library`` cannot be imported.
        """
        backend = self._load_backend()
        return backend.transliterate(text, self._source, self._target)

    @staticmethod
    def _load_backend():
        """Import the upstream transliterator lazily, on first use."""
        try:
            from indicnlp.transliterate.unicode_transliterate import (
                UnicodeIndicTransliterator,
            )
        except ImportError as exc:
            raise ImportError(
                "indic-nlp-library is missing — it ships as a dependency of "
                "videosdk-agents; reinstall with: uv pip install -U videosdk-agents"
            ) from exc
        return UnicodeIndicTransliterator

Thin wrapper around UnicodeIndicTransliterator.

Useful as a TextFilter-adjacent utility when your LLM emits Hindi but the TTS speaks Telugu (or similar cross-Indic scenarios).

Example:

from videosdk.agents.tokenize import IndicScriptTransliterator

trans = IndicScriptTransliterator(source="hi", target="te")
out = trans.convert("नमस्ते दुनिया")
# out == "నమస్తే దునియా" (approximate phonetic conversion)

Methods

def convert(self, text: str) ‑> str

Expand source code

def convert(self, text: str) -> str:
    """Transliterate ``text`` from ``self._source`` to ``self._target``.

    The upstream transliterator is imported lazily inside the call.

    Raises:
        ImportError: If ``indic-nlp-library`` cannot be imported; the
            original import failure is chained as the cause.
    """
    try:
        from indicnlp.transliterate.unicode_transliterate import (
            UnicodeIndicTransliterator,
        )
    except ImportError as exc:
        message = (
            "indic-nlp-library is missing — it ships as a dependency of "
            "videosdk-agents; reinstall with: uv pip install -U videosdk-agents"
        )
        raise ImportError(message) from exc
    else:
        src, dst = self._source, self._target
        return UnicodeIndicTransliterator.transliterate(text, src, dst)

class IndicSentenceChunker (*, language: str = 'hi', min_sentence_len: int = 1, idle_flush_ms: int = 400)

Expand source code

class IndicSentenceChunker(SentenceChunker):
    """Sentence chunker for Indic scripts using indic-nlp-library.

    Falls back gracefully for unsupported ``language`` values by returning the
    input text as a single segment — the ``BufferedSentenceChunkStream`` then
    relies on idle-flush for phrasing.
    """

    def __init__(
        self,
        *,
        language: str = "hi",
        min_sentence_len: int = 1,
        idle_flush_ms: int = 400,
    ) -> None:
        """Initialise the chunker.

        Args:
            language: Default ISO 639-1 code. Override per-turn via
                ``tokenize(..., language=...)`` or ``stream(language=...)``.
            min_sentence_len: Passed through to the ``BufferedSentenceChunkStream``
                idle-flush heuristic. Default 1 (Indic sentences can be very short).
            idle_flush_ms: Idle timeout before a word-boundary cut fires.
        """
        if language not in INDIC_LANGS:
            # Warn only: construction still succeeds for unsupported codes.
            logger.warning(
                "IndicSentenceChunker language %r is not in the supported set "
                "%s; sentence splitting may degrade. Consider using "
                "BasicSentenceChunker for non-Indic languages.",
                language,
                sorted(INDIC_LANGS),
            )
        self._default_language = language
        self._min_sentence_len = int(min_sentence_len)
        self._idle_flush_ms = int(idle_flush_ms)
        # Populated lazily by ``_get_split_fn`` on first use.
        self._split_fn: "_SentenceSplit | None" = None

    def tokenize(self, text: str, *, language: str | None = None) -> list[str]:
        """Split ``text`` into stripped, non-empty sentences.

        Args:
            text: Text to split. Empty or whitespace-only input yields ``[]``.
            language: Optional per-call language code; resolved through
                ``_resolve_language``.

        Returns:
            The stripped sentences. If the upstream splitter raises, a
            warning is logged and the stripped input is returned as one
            segment.
        """
        if not text or not text.strip():
            return []
        lang = self._resolve_language(language)
        split_fn = self._get_split_fn()
        try:
            sentences = split_fn(text, lang=lang)
        except Exception:  # pragma: no cover - defensive
            logger.warning(
                "indic-nlp-library sentence_split raised for lang=%r; returning "
                "input as single segment",
                lang,
                exc_info=True,
            )
            return [text.strip()]
        return [s.strip() for s in sentences if s and s.strip()]

    def tokenize_raw(self, text: str, *, language: str | None = None) -> list[str]:
        """Return segments with original whitespace preserved.

        Used by the stream adapter: the ``BufferedSentenceChunkStream`` re-uses
        the last segment as the continuation buffer, so losing edge whitespace
        would concatenate unrelated words (e.g. ``व्यापार`` + ``से`` →
        ``व्यापारसे``) when the next chunk arrives.

        indic-nlp-library's ``sentence_split`` returns stripped strings, so we
        map each stripped segment back to its position in the original text
        and slice raw ranges, preserving the whitespace that originally sat
        between sentences.

        The returned segments always concatenate back to ``text`` exactly.

        Args:
            text: Text to split. Empty input yields ``[]``.
            language: Optional per-call language code; resolved through
                ``_resolve_language``.

        Returns:
            Raw slices of ``text``. If the upstream splitter raises, ``[text]``
            is returned unchanged.
        """
        if not text:
            return []
        lang = self._resolve_language(language)
        split_fn = self._get_split_fn()
        try:
            stripped_sentences = split_fn(text, lang=lang)
        except Exception:  # pragma: no cover - defensive
            # Best-effort: hand the text back unsplit.
            return [text]

        raw_segments: list[str] = []
        cursor = 0
        for sentence in stripped_sentences:
            core = sentence.strip() if sentence else ""
            if not core:
                continue
            idx = text.find(core, cursor)
            if idx < 0:
                # The splitter returned text we cannot locate verbatim. Do not
                # emit it: the corresponding raw span is still ahead of
                # ``cursor`` and will be swept into the next segment (or the
                # trailing remainder), so appending ``core`` here would
                # duplicate content.
                continue
            end = idx + len(core)
            # Slice from the cursor, not from idx, so the whitespace that
            # preceded this sentence stays attached to it.
            raw_segments.append(text[cursor:end])
            cursor = end

        if cursor < len(text):
            # Keep whatever follows the last located sentence.
            if raw_segments:
                raw_segments[-1] += text[cursor:]
            else:
                raw_segments.append(text[cursor:])

        return raw_segments

    def stream(self, *, language: str | None = None) -> SentenceChunkStream:
        """Create a buffered chunk stream bound to the resolved language.

        The language is resolved once here and bound into ``tokenize_raw``.
        """
        lang = self._resolve_language(language)
        tokenize_fn: Callable[[str], list[str]] = partial(
            self.tokenize_raw, language=lang
        )
        return BufferedSentenceChunkStream(
            tokenize_fn=tokenize_fn,
            strong_terminators="।॥.!?",  # Devanagari danda + Latin terminators
            min_sentence_len=self._min_sentence_len,
            idle_flush_ms=self._idle_flush_ms,
        )

    def _resolve_language(self, language: str | None) -> str:
        """Pick the language code to pass to the upstream library.

        Precedence: an explicit ``language`` (unless it is ``"auto"``), then
        the instance default (unless it is ``"auto"``), then ``"hi"``. The
        result is always lower-cased and is never ``"auto"`` or empty.
        """
        requested = (language or "").lower()
        if requested and requested != "auto":
            return requested
        default = (self._default_language or "").lower()
        if default and default != "auto":
            return default
        return "hi"

    def _get_split_fn(self) -> "_SentenceSplit":
        """Lazy-load the upstream function on first use."""
        if self._split_fn is None:
            self._split_fn = _load_sentence_split()
        return self._split_fn

Sentence chunker for Indic scripts using indic-nlp-library.

Falls back gracefully for unsupported language values by returning the input text as a single segment — the BufferedSentenceChunkStream then relies on idle-flush for phrasing.

Initialise the chunker.

Args

language: Default ISO 639-1 code. Override per-turn via tokenize(..., language=...) or stream(language=...).
min_sentence_len: Passed through to the BufferedSentenceChunkStream idle-flush heuristic. Default 1 (Indic sentences can be very short).
idle_flush_ms: Idle timeout before a word-boundary cut fires.

Ancestors

SentenceChunker
abc.ABC

Methods

def tokenize_raw(self, text: str, *, language: str | None = None) ‑> list[str]

Expand source code

def tokenize_raw(self, text: str, *, language: str | None = None) -> list[str]:
    """Return segments with original whitespace preserved.

    Used by the stream adapter: the ``BufferedSentenceChunkStream`` re-uses
    the last segment as the continuation buffer, so losing edge whitespace
    would concatenate unrelated words (e.g. ``व्यापार`` + ``से`` →
    ``व्यापारसे``) when the next chunk arrives.

    indic-nlp-library's ``sentence_split`` returns stripped strings, so we
    map each stripped segment back to its position in the original text
    and slice raw ranges, preserving the whitespace that originally sat
    between sentences.
    """
    if not text:
        return []
    lang = self._resolve_language(language)
    split_fn = self._get_split_fn()
    try:
        stripped_sentences = split_fn(text, lang=lang)
    except Exception:  # pragma: no cover - defensive
        return [text]

    raw_segments: list[str] = []
    cursor = 0
    for s in stripped_sentences:
        core = s.strip() if s else ""
        if not core:
            continue
        idx = text.find(core, cursor)
        if idx < 0:
            # Shouldn't happen — defensive fallback keeps the stripped text.
            raw_segments.append(core)
            continue
        end = idx + len(core)
        raw_segments.append(text[cursor:end])
        cursor = end

    if cursor < len(text):
        if raw_segments:
            raw_segments[-1] = raw_segments[-1] + text[cursor:]
        else:
            raw_segments.append(text[cursor:])

    return raw_segments

Return segments with original whitespace preserved.

Used by the stream adapter: the BufferedSentenceChunkStream re-uses the last segment as the continuation buffer, so losing edge whitespace would concatenate unrelated words (e.g. व्यापार + से → व्यापारसे) when the next chunk arrives.

indic-nlp-library's sentence_split returns stripped strings, so we map each stripped segment back to its position in the original text and slice raw ranges, preserving the whitespace that originally sat between sentences.

Inherited members

SentenceChunker:
- stream
- tokenize