"""Tests para compute_vocabulary_stats.""" import os import sys sys.path.insert( 0, os.path.join(os.path.dirname(__file__), "..", "..", "functions") ) from datascience.compute_vocabulary_stats import compute_vocabulary_stats def test_basico(): # Corpus con repeticiones y hapax. Stopwords desactivadas para controlar # exactamente que tokens entran. texts = ["gato gato perro", "perro perro raton", "elefante"] r = compute_vocabulary_stats(texts, top_k=10, remove_stopwords=False) # n_types < n_tokens cuando hay repeticiones. assert r["n_types"] < r["n_tokens"] assert r["n_tokens"] == 7 assert r["n_types"] == 4 # gato, perro, raton, elefante # ttr en (0, 1]. assert 0 < r["ttr"] <= 1 assert r["ttr"] == round(4 / 7, 4) # top_terms ordenado por count descendente. counts = [t["count"] for t in r["top_terms"]] assert counts == sorted(counts, reverse=True) assert r["top_terms"][0]["term"] == "perro" assert r["top_terms"][0]["count"] == 3 # hapax: raton y elefante aparecen exactamente una vez. assert r["n_hapax"] == 2 assert r["hapax_pct"] == round(2 / 4 * 100, 2) # pct coherente con count/n_tokens. assert r["top_terms"][0]["pct"] == round(3 / 7 * 100, 2) def test_vacio(): # Sin documentos validos -> ceros / None / []. for arg in ([], None, [None, 123, ""], ["123 456"]): r = compute_vocabulary_stats(arg) assert r["n_tokens"] == 0 assert r["n_types"] == 0 assert r["ttr"] is None assert r["n_hapax"] == 0 assert r["hapax_pct"] is None assert r["top_terms"] == [] def test_stopwords_quitadas(): texts = ["the gato the perro", "de la casa azul"] r = compute_vocabulary_stats(texts, remove_stopwords=True) terms = {t["term"] for t in r["top_terms"]} # Stopwords ES+EN no deben aparecer. assert "the" not in terms assert "de" not in terms assert "la" not in terms # Palabras de contenido si. assert "gato" in terms assert "casa" in terms def test_stopwords_conservadas(): texts = ["the gato the perro", "de la casa azul"] r = compute_vocabulary_stats(texts, remove_stopwords=False) terms = {t["term"] for t in r["top_terms"]} # Con el filtro desactivado, las stopwords se conservan. assert "the" in terms assert "de" in terms assert "la" in terms