"""Tests para compute_top_ngrams.""" import sys import os # sys.path estándar: añade `python/functions/` para importar por paquete raíz. sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) from datascience.compute_top_ngrams import compute_top_ngrams def test_bigramas(): # "machine learning" se repite en cada documento -> bigrama más frecuente. texts = [ "machine learning rocks", "machine learning is fun", "we love machine learning", ] result = compute_top_ngrams(texts, n=2, top_k=5) assert result["n"] == 2 assert result["top"], "esperaba al menos un bigrama" assert result["top"][0]["ngram"] == "machine learning" assert result["top"][0]["count"] == 3 # Cada entrada respeta el contrato {"ngram": str, "count": int}. for item in result["top"]: assert isinstance(item["ngram"], str) assert isinstance(item["count"], int) def test_trigramas(): texts = [ "alpha beta gamma delta", "alpha beta gamma omega", ] # Con stopwords desactivadas para no descartar tokens de contenido. result = compute_top_ngrams(texts, n=3, top_k=5, remove_stopwords=False) assert result["n"] == 3 ngrams = {item["ngram"]: item["count"] for item in result["top"]} # "alpha beta gamma" aparece en ambos documentos. assert ngrams.get("alpha beta gamma") == 2 # Trigramas únicos de cada documento. assert ngrams.get("beta gamma delta") == 1 assert ngrams.get("beta gamma omega") == 1 def test_vacio(): assert compute_top_ngrams([], n=2) == {"n": 2, "top": []} # Documentos no-str / None se descartan -> corpus efectivamente vacío. assert compute_top_ngrams([None, 123, {"a": 1}], n=2) == {"n": 2, "top": []} def test_stopwords(): # "the cat" debería desaparecer al quitar stopwords ("the" es stopword EN). texts = ["the cat the cat the cat"] con = compute_top_ngrams(texts, n=2, top_k=10, remove_stopwords=True) sin = compute_top_ngrams(texts, n=2, top_k=10, remove_stopwords=False) con_ngrams = {item["ngram"] for item in con["top"]} sin_ngrams = {item["ngram"] for item in sin["top"]} # Sin filtrar, el bigrama dominante es "the cat". assert "the cat" in sin_ngrams # Al filtrar stopwords, ya no aparece "the cat" (queda solo "cat cat"). assert "the cat" not in con_ngrams assert con_ngrams != sin_ngrams