105e56cf05
Añade el capítulo `text_distr` al motor AutomaticEDA: perfila columnas de texto libre largo (reseñas, descripciones, comentarios) que la distribución categórica no resume bien. Sigue el patrón de cat_distr/num_distr (build_text_distr(profile, ctx) -> Chapter | None) y se registra en CHAPTER_ORDER tras cat_distr. Activación en dos fases: gate barato desde el perfil (columna no numérica con len_mean >= 50 chars) + confirmación con muestra cruda (mediana de palabras >= 20). Un dataset sin texto largo (p.ej. titanic) devuelve None sin tocar el informe. Bloques por columna (Group con page_break): resumen (longitudes, vocabulario con TTR y % hapax, idioma dominante, % duplicados, legibilidad), histograma de longitudes, top términos (tabla + barras), bigramas/trigramas, idiomas detectados y nube de palabras opcional. Términos ttr/hapax enganchados al glosario clicable. Lógica delegada a 7 funciones nuevas del registry (datascience, tag eda), estilo dict-no-throw: - extract_text_sample (impura, push-down SQL DuckDB/Postgres) - compute_text_length_stats, compute_vocabulary_stats, compute_top_ngrams (puras, stdlib) - detect_corpus_language (langdetect opcional), compute_text_readability (textstat opcional), compute_text_duplicates (hash + datasketch opcional) Versión barata sin modelos pesados: las piezas que dependen de una librería opcional (langdetect, textstat, wordcloud, datasketch) degradan a omitidas sin lanzar. Añade langdetect y textstat (ligeras) al pyproject + uv.lock. Verificado: golden sobre dataset de reviews multi-idioma (capítulo presente en PDF+PPTX+MD con métricas reales), titanic sin capítulo (None), degradación sin libs, suite automatic_eda + pipeline verde (128 passed), fn index OK. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
75 lines
2.6 KiB
Python
75 lines
2.6 KiB
Python
"""Tests para compute_text_readability."""
|
|
|
|
import sys
|
|
import os
|
|
import builtins
|
|
|
|
# Make the package root (two directories above this test file) importable
# before pulling in the function under test.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir),
)

from datascience.compute_text_readability import compute_text_readability

# Top-level keys every result dict is expected to expose.
EXPECTED_KEYS = {
    "available",
    "n_scored",
    "flesch",
}
# Aggregate statistics expected inside the "flesch" sub-dict.
FLESCH_KEYS = {
    "mean",
    "p50",
    "min",
    "max",
}
|
|
|
|
|
|
def test_prosa_ingles():
    """English prose corpus: result is available, scored, and every stat is set."""
    corpus = [
        "The cat sat on the mat. It was a warm and sunny day in the park.",
        "She sells sea shells by the sea shore. The shells she sells are surely sea shells.",
        "Reading is a wonderful habit. Books open doors to new worlds and ideas.",
        "He ran quickly to the store to buy some fresh bread and a bottle of milk.",
    ]
    result = compute_text_readability(corpus)

    # Top-level contract of the returned dict.
    assert set(result.keys()) == EXPECTED_KEYS
    assert result["available"] is True
    assert result["n_scored"] > 0

    flesch = result["flesch"]
    assert set(flesch.keys()) == FLESCH_KEYS

    # Every aggregate must be populated when at least one text was scored.
    unset = [stat for stat in ("mean", "p50", "min", "max") if flesch[stat] is None]
    assert not unset

    # Range sanity check: the minimum score cannot exceed the maximum.
    assert flesch["min"] <= flesch["max"]
|
|
|
|
|
|
def test_vacio():
    """Empty corpus with textstat present: available, nothing scored, stats None."""
    empty_result = compute_text_readability([])

    assert set(empty_result.keys()) == EXPECTED_KEYS
    assert empty_result["available"] is True
    assert empty_result["n_scored"] == 0

    # With nothing scored, none of the Flesch aggregates should carry a value.
    flesch = empty_result["flesch"]
    populated = [stat for stat in ("mean", "p50", "min", "max") if flesch[stat] is not None]
    assert not populated

    # Non-string and empty entries are discarded too, so nothing gets scored.
    unusable_result = compute_text_readability([None, "", " ", 123])
    assert unusable_result["available"] is True
    assert unusable_result["n_scored"] == 0
|
|
|
|
|
|
def test_degradacion(monkeypatch):
    """Without textstat (forced ImportError): degrades to available False, no raise."""
    import datascience.compute_text_readability as module_under_test

    original_import = builtins.__import__

    def import_without_textstat(name, *args, **kwargs):
        # Block `textstat` and any of its submodules; delegate everything else
        # to the real import machinery.
        if name.partition(".")[0] != "textstat":
            return original_import(name, *args, **kwargs)
        raise ImportError("simulado")

    monkeypatch.setattr(builtins, "__import__", import_without_textstat)
    result = module_under_test.compute_text_readability(
        ["The cat sat on the mat. It was happy and warm."]
    )

    assert result["available"] is False
    assert result["n_scored"] == 0

    # The degraded result still exposes the Flesch stats, all unset.
    flesch = result["flesch"]
    populated = [stat for stat in ("mean", "p50", "min", "max") if flesch[stat] is not None]
    assert not populated
|