"""Extrae palabras y sus ocurrencias de textos en bruto.""" from __future__ import annotations import re from collections import Counter from typing import Iterable _STOPWORDS_ES: frozenset[str] = frozenset({ "DE", "LA", "EL", "EN", "Y", "A", "LOS", "DEL", "SE", "LAS", "UN", "POR", "CON", "NO", "UNA", "SU", "PARA", "ES", "AL", "LO", "COMO", "MAS", "O", "PERO", "SUS", "LE", "YA", "ESTE", "SI", "PORQUE", "ESTA", "ENTRE", "CUANDO", "MUY", "SIN", "SOBRE", "TAMBIEN", "ME", "HASTA", "HAY", "DONDE", "QUIEN", "DESDE", "TODO", "NOS", "DURANTE", "TODOS", "UNO", "LES", "NI", "CONTRA", "OTROS", }) def words_to_dataset( texts: Iterable[str | None], min_ocurrencias: int = 1, eliminar_stopwords: bool = False, ) -> list[dict]: """Extrae palabras y ocurrencias de una coleccion de textos. Sin dependencias externas. Tokeniza cada texto con regex \\b\\w+\\b, convierte a mayusculas, cuenta ocurrencias y filtra por minimo. Args: texts: Iterable de strings (o None). Los None se ignoran. min_ocurrencias: Numero minimo de ocurrencias para incluir una palabra. Default 1. eliminar_stopwords: Si True, filtra palabras comunes en espanol. Returns: Lista de dicts {"palabra": str, "ocurrencias": int} ordenada por ocurrencias descendente. """ all_words: list[str] = [] for text in texts: if text is None: continue words = re.findall(r"\b\w+\b", str(text).upper()) if eliminar_stopwords: words = [w for w in words if w not in _STOPWORDS_ES] all_words.extend(words) counter = Counter(all_words) return [ {"palabra": word, "ocurrencias": count} for word, count in counter.most_common() if count >= min_ocurrencias ]