# fn_registry/python/functions/datascience/words_to_dataset.py

"""Extrae palabras y sus ocurrencias de textos en bruto."""

from __future__ import annotations

import re
from collections import Counter
from typing import Iterable


# Common Spanish function words used by the optional stopword filter.
#
# Entries are upper-case because tokens are upper-cased before they are
# compared against this set. Upper-casing keeps diacritics, so a word whose
# standard spelling carries an accent must be listed in its accented form as
# well; otherwise correctly written input ("también", "más", "está") would
# never match. Accents are listed explicitly instead of being folded away so
# that unrelated words such as "UÑA" are not confused with "UNA".
_STOPWORDS_ES: frozenset[str] = frozenset({
    "DE", "LA", "EL", "EN", "Y", "A", "LOS", "DEL", "SE", "LAS",
    "UN", "POR", "CON", "NO", "UNA", "SU", "PARA", "ES", "AL", "LO",
    "COMO", "MAS", "O", "PERO", "SUS", "LE", "YA", "ESTE",
    "SI", "PORQUE", "ESTA", "ENTRE", "CUANDO", "MUY", "SIN", "SOBRE",
    "TAMBIEN", "ME", "HASTA", "HAY", "DONDE", "QUIEN", "DESDE", "TODO",
    "NOS", "DURANTE", "TODOS", "UNO", "LES", "NI", "CONTRA", "OTROS",
    # Accented spellings of entries above.
    "ÉL", "MÁS", "SÍ", "ESTÁ", "TAMBIÉN",
    "CÓMO", "CUÁNDO", "DÓNDE", "QUIÉN",
})


def words_to_dataset(
    texts: Iterable[str | None],
    min_ocurrencias: int = 1,
    eliminar_stopwords: bool = False,
) -> list[dict]:
    """Count word occurrences across a collection of texts.

    Uses only the standard library. Each text is upper-cased and split into
    maximal runs of regex word characters (letters, digits and underscore),
    so numbers such as "2024" are counted as words too.

    Args:
        texts: Iterable of strings. ``None`` items are skipped; any other
            value is converted with ``str()`` before tokenizing.
        min_ocurrencias: Minimum number of occurrences a word needs in order
            to be included in the result. Defaults to 1.
        eliminar_stopwords: When True, words present in ``_STOPWORDS_ES``
            are dropped before counting.

    Returns:
        A list of ``{"palabra": str, "ocurrencias": int}`` dicts sorted by
        descending count. Words with the same count keep the order in which
        they were first seen.
    """
    # Compile once and bind the method so the loop does no repeated lookups.
    find_words = re.compile(r"\b\w+\b").findall

    # Count text by text instead of first collecting every token of the whole
    # corpus in a list: memory stays proportional to the vocabulary size.
    # Insertion order (and therefore tie order) is the same either way.
    counter: Counter[str] = Counter()
    for text in texts:
        if text is None:
            continue
        words = find_words(str(text).upper())
        if eliminar_stopwords:
            counter.update(w for w in words if w not in _STOPWORDS_ES)
        else:
            counter.update(words)

    return [
        {"palabra": word, "ocurrencias": count}
        for word, count in counter.most_common()
        if count >= min_ocurrencias
    ]