"""Tests for compute_text_duplicates.

The leaf module is imported directly (`datascience.compute_text_duplicates`)
so these tests do not depend on the package re-exporting the function from
its __init__.

datasketch is normally NOT installed in the venv, so near_dup degrades to
available=False; the tests do not require the library.
"""

from datascience.compute_text_duplicates import compute_text_duplicates

EXPECTED_KEYS = {"n_docs", "n_exact_dup", "exact_dup_pct", "n_unique", "near_dup"}


def test_duplicados_exactos():
    """Three copies of one text plus two unique ones: n_exact_dup=2, pct>0."""
    repeated = "El gato come pescado"
    corpus = [
        repeated,
        repeated,
        "el GATO come pescado",  # same after normalization (whitespace + case)
        "Un perro ladra",
        "La luna brilla",
    ]

    stats = compute_text_duplicates(corpus)

    assert set(stats.keys()) == EXPECTED_KEYS
    assert stats["n_docs"] == 5
    # 3 copies of the first text (2 of them are repeats) + 2 unique texts.
    assert stats["n_exact_dup"] == 2
    assert stats["n_unique"] == 3

    pct = stats["exact_dup_pct"]
    assert pct is not None
    assert pct > 0
    # 2 / 5 * 100 = 40.0
    assert abs(pct - 40.0) < 1e-9


def test_sin_duplicados():
    """Corpus without repetitions: n_exact_dup=0 and n_unique == n_docs."""
    corpus = [
        f"{ordinal} documento distinto"
        for ordinal in ("primero", "segundo", "tercero")
    ]

    stats = compute_text_duplicates(corpus)

    assert stats["n_docs"] == 3
    assert stats["n_exact_dup"] == 0
    assert stats["n_unique"] == 3
    assert abs(stats["exact_dup_pct"] - 0.0) < 1e-9


def test_vacio():
    """Empty corpus: n_docs is 0, exact_dup_pct is None, and nothing raises."""
    stats = compute_text_duplicates([])

    assert set(stats.keys()) == EXPECTED_KEYS
    assert stats["n_docs"] == 0
    assert stats["n_exact_dup"] == 0
    assert stats["exact_dup_pct"] is None
    assert stats["n_unique"] == 0
    assert stats["near_dup"]["n_near_dup_docs"] == 0


def test_near_dup_degrada():
    """near_dup exposes 'available' (bool) and does not raise without datasketch."""
    corpus = ["uno dos tres cuatro", "uno dos tres cuatro cinco", "algo distinto"]

    near_info = compute_text_duplicates(corpus)["near_dup"]

    assert "available" in near_info
    assert isinstance(near_info["available"], bool)
    assert "n_near_dup_docs" in near_info
    assert isinstance(near_info["n_near_dup_docs"], int)

    # None and non-str entries are also tolerated without raising.
    mixed_stats = compute_text_duplicates(["hola", None, 123, "hola"])
    assert mixed_stats["n_docs"] == 2
    assert mixed_stats["n_exact_dup"] == 1