"""Top n-gramas de palabras más frecuentes de un corpus de texto. Función pura, autocontenida (solo stdlib: re + collections.Counter). No depende de scikit-learn ni de ninguna otra librería externa. Estilo dict-no-throw del grupo `eda`: ante cualquier entrada degenerada o excepción interna devuelve ``{"n": n, "top": []}`` en vez de lanzar. """ import re from collections import Counter # Lista inline de stopwords ES + EN (~80 términos de altísima frecuencia). # Se eliminan ANTES de formar los n-gramas: los n-gramas se construyen sobre la # secuencia de tokens de contenido, no sobre el texto original. _STOPWORDS = frozenset({ # Español "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", "más", "mas", "pero", "sus", "le", "ya", "o", "este", "sí", "si", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre", "también", "tambien", "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo", "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes", "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas", "algo", "nosotros", # Inglés "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as", "are", "was", "be", "this", "that", "by", "an", "or", "at", "from", "but", "not", "have", "has", "had", "they", "you", "we", "he", "she", "his", "her", "their", "its", "i", "my", "me", "our", "us", "do", "does", "did", "will", "would", "can", "could", "should", "there", "which", "who", "what", "when", "where", "how", "all", "if", "so", "than", "then", "out", "up", }) def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict: """Calcula los n-gramas de palabras más frecuentes de un corpus. Args: texts: lista de cadenas. Los elementos ``None`` o que no sean ``str`` se descartan silenciosamente. n: tamaño del n-grama (1 = unigramas, 2 = bigramas, 3 = trigramas...). Valores < 1 o no enteros producen ``top`` vacío. top_k: número máximo de n-gramas a devolver, ordenados por frecuencia descendente (con desempate alfabético determinista). remove_stopwords: si ``True`` elimina las stopwords ES+EN ANTES de formar los n-gramas, de modo que los n-gramas se construyen sobre la secuencia de tokens de contenido (no cruzando documentos). Returns: ``{"n": n, "top": [{"ngram": "w1 w2", "count": int}, ...]}``. Corpus vacío, sin tokens suficientes o cualquier excepción interna degrada a ``{"n": n, "top": []}``. Nunca lanza. """ try: if not isinstance(n, int) or n < 1: return {"n": n, "top": []} try: limit = int(top_k) except (TypeError, ValueError): limit = 0 if limit < 0: limit = 0 if not isinstance(texts, (list, tuple)): return {"n": n, "top": []} counter = Counter() for doc in texts: if not isinstance(doc, str): continue tokens = [ tok for tok in re.findall(r"\w+", doc.lower(), re.UNICODE) if not tok.isdigit() ] if remove_stopwords: tokens = [tok for tok in tokens if tok not in _STOPWORDS] if len(tokens) < n: continue for i in range(len(tokens) - n + 1): ngram = " ".join(tokens[i:i + n]) counter[ngram] += 1 if not counter: return {"n": n, "top": []} ordered = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0])) top = [{"ngram": ngram, "count": count} for ngram, count in ordered[:limit]] return {"n": n, "top": top} except Exception: return {"n": n, "top": []}