# fn_registry/python/functions/datascience/translate_es_to_en.py

"""Traduce texto espanol a ingles usando MarianMT, frase a frase."""

from __future__ import annotations

import re
from typing import Any

# Sentence-boundary pattern: a run of whitespace that follows a period,
# exclamation mark, or question mark. The punctuation is matched by a
# lookbehind, so it stays attached to the sentence it terminates.
# NOTE: any "." followed by whitespace matches, so abbreviations such as
# "Sr. Garcia" are split into two pieces as well.
_SENTENCE_RE = re.compile(r"(?<=[.!?])\s+")


def translate_es_to_en(
    text: str,
    tokenizer: Any,
    model: Any,
    max_length: int = 512,
    num_beams: int = 4,
) -> str:
    """Translate Spanish text to English, one sentence at a time.

    The input is split on whitespace that follows ``.``, ``!`` or ``?``;
    every resulting sentence is tokenized, passed through ``model.generate``
    and decoded on its own, and the non-empty translations are joined with a
    single space. Sentence-level calls are intended to preserve proper nouns
    (names, companies, locations) better than translating a whole paragraph
    in one call.

    Args:
        text: Spanish text to translate. Can be a single sentence or a
            multi-sentence paragraph.
        tokenizer: MarianMT tokenizer (expected to come from
            ``marianmt_es_en_load_model``). Must be callable with
            ``return_tensors``, ``max_length`` and ``truncation`` keyword
            arguments and expose ``decode``.
        model: MarianMT model (expected to come from
            ``marianmt_es_en_load_model``). Must expose ``generate``. If it
            has a ``device`` attribute, the tokenized inputs are moved to
            that device before generation.
        max_length: Maximum token length passed to both the tokenizer (with
            truncation enabled) and ``generate``. Longer sentences are
            truncated.
        num_beams: Number of beams for beam search. Higher values trade
            speed for quality.

    Returns:
        The translated English text, sentences joined with a single space.
        Returns an empty string if ``text`` is empty or whitespace-only.
        Sentences whose translation decodes to an empty string are omitted
        so the result never contains doubled or dangling spaces.

    Raises:
        RuntimeError: if ``model.generate`` fails (propagated from
            transformers). Nothing is caught here, so any other exception
            from the tokenizer or the model propagates unchanged as well.
    """
    if not text or not text.strip():
        return ""

    sentences = [
        piece.strip()
        for piece in _SENTENCE_RE.split(text.strip())
        if piece.strip()
    ]

    # Tokenizers return CPU tensors; a model living on an accelerator would
    # reject them. Look the target device up once, outside the loop.
    device = getattr(model, "device", None)

    translated_parts: list[str] = []
    for sentence in sentences:
        inputs = tokenizer(
            sentence,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
        )
        # Only move when both sides support it, so plain-dict encodings and
        # device-less model wrappers keep working as before.
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        generated = model.generate(
            **inputs,
            num_beams=num_beams,
            max_length=max_length,
        )
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True).strip()
        # Skip empty outputs: joining them would produce double spaces.
        if decoded:
            translated_parts.append(decoded)

    return " ".join(translated_parts)