"""Traduce texto espanol a ingles usando MarianMT, frase a frase.""" from __future__ import annotations import re from typing import Any # Patron de split por oraciones: punto, exclamacion, interrogacion seguidos de espacio. _SENTENCE_RE = re.compile(r"(?<=[.!?])\s+") def translate_es_to_en( text: str, tokenizer: Any, model: Any, max_length: int = 512, num_beams: int = 4, ) -> str: """Translate Spanish text to English, sentence by sentence. Splits the input on sentence boundaries (after ``.``, ``!``, ``?``), translates each sentence independently, and rejoins with a single space. Processing sentence by sentence preserves proper nouns (names, companies, locations) better than passing the full paragraph in a single call, because the translation model can focus on shorter context windows. Args: text: Spanish text to translate. Can be a single sentence or a multi-sentence paragraph. tokenizer: MarianMT tokenizer loaded with ``marianmt_es_en_load_model``. model: MarianMT model loaded with ``marianmt_es_en_load_model``. max_length: Maximum token length for each sentence during tokenization and generation. Sentences longer than this are truncated. num_beams: Number of beams for beam search. Higher = better quality, slower. Default 4 is a good tradeoff. Returns: Translated English text. Sentences joined with a single space. Returns an empty string if ``text`` is empty or whitespace-only. Raises: RuntimeError: if model.generate fails (propagated from transformers). """ if not text or not text.strip(): return "" sentences = _SENTENCE_RE.split(text.strip()) sentences = [s.strip() for s in sentences if s.strip()] if not sentences: return "" translated_parts: list[str] = [] for sentence in sentences: inputs = tokenizer( sentence, return_tensors="pt", max_length=max_length, truncation=True, ) generated = model.generate( **inputs, num_beams=num_beams, max_length=max_length, ) decoded = tokenizer.decode(generated[0], skip_special_tokens=True) translated_parts.append(decoded.strip()) return " ".join(translated_parts)