feat(eda): capítulo AutomaticEDA CAT DISTR + funciones cardinalidad/pie
Capítulo cat_distr del motor AutomaticEDA: distribuciones categóricas con explicación de entropía de Shannon, métricas de cardinalidad por columna (valores distintos, % distintos, total de filas, valores únicos, entropía y su máximo log2(k) + normalizada), tabla top-k y un donut de las categorías más comunes (top-k + «Otros»). Marca columnas id-like y dominadas. Delegadas a fn-constructor (grupo eda): - categorical_cardinality_block: deriva métricas de cardinalidad/entropía. - categorical_top_pie_figure: figura donut top-k + «Otros», leyenda lateral. Defensivo (dict-no-throw): None si no hay columnas categóricas; normaliza mode_pct a escala 0-100 (summarize_categorical lo emite como fracción). Tablas vía DataTable y figura perezosa: el paginador del núcleo garantiza no-corte en PDF y PPTX. Tests: golden + edge (sin categóricas) + anti-corte (label largo / muchas columnas) en ambos renderers. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,402 @@
|
||||
"""Categorical distributions chapter (CAT DISTR).
|
||||
|
||||
Third reference chapter for AutomaticEDA. For every categorical column it shows,
|
||||
fulfilling the user's request:
|
||||
|
||||
1. A short opening explanation of **Shannon entropy** (what it measures, its 0
|
||||
and log2(k) bounds, the normalized 0–1 version) and the dataset row total used
|
||||
as a comparison baseline.
|
||||
2. Per column, a cardinality key/value table: distinct values, ``% distinct``
|
||||
(distinct / total rows), total dataset rows, singleton values (frequency 1),
|
||||
entropy with its theoretical maximum and the normalized ratio, mode, imbalance
|
||||
and string-length stats.
|
||||
3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
single dominating category).
|
||||
4. A ``top-k`` table (value / count / %).
|
||||
5. A **donut pie chart** of the most common categories (top-k + an "Otros"
|
||||
bucket), drawn lazily so the renderers scale it to fit entirely.
|
||||
|
||||
Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the
|
||||
output of ``summarize_categorical`` (``top[{value,count,pct}]``, ``mode``,
|
||||
``n_distinct``, ``entropy``, ``imbalance``, ``len_min/mean/max``). The derived
|
||||
cardinality metrics and the pie figure are delegated to two registry functions
|
||||
(``categorical_cardinality_block`` and ``categorical_top_pie_figure``); both are
|
||||
imported lazily and degrade to a minimal inline fallback so this chapter never
|
||||
raises even if they are unavailable.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
|
||||
from .. import model
|
||||
|
||||
# Chapter contract metadata: version ("x.y.z"), id and title handed to
# model.Chapter by build_cat_distr.
CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "cat_distr"
CHAPTER_TITLE = "Distribuciones categóricas"

# Cap the number of categorical columns rendered to keep the document bounded;
# the rest are summarized in a closing note (no silent truncation).
MAX_COLS = 40
# Rows shown in each top-k table and explicit slices in the pie.
TOP_TABLE_ROWS = 15
PIE_TOP_K = 6
# Default character limit of _truncate. Every call in this module passes an
# explicit limit (pie legend, pie title, figure caption); table cells are NOT
# truncated here, they only go through model._safe_str.
LABEL_MAX = 48
|
||||
|
||||
|
||||
def _fmt_int(value) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{int(value):,}".replace(",", ".")
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
if isinstance(value, bool):
|
||||
return str(value)
|
||||
if isinstance(value, int):
|
||||
return f"{value:,}".replace(",", ".")
|
||||
if isinstance(value, float):
|
||||
if value != value: # NaN
|
||||
return "NaN"
|
||||
if value in (float("inf"), float("-inf")):
|
||||
return str(value)
|
||||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_pct_value(value, decimals: int = 1) -> str:
|
||||
"""Format an already-in-percent value (0–100). None -> placeholder."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value):.{decimals}f}%"
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _pct_from_maybe_fraction(value, decimals: int = 1) -> str:
|
||||
"""Format a percentage that may arrive as a 0–1 fraction or a 0–100 number."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
v = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if v <= 1.0:
|
||||
v *= 100.0
|
||||
return f"{v:.{decimals}f}%"
|
||||
|
||||
|
||||
def _truncate(text: str, limit: int = LABEL_MAX) -> str:
    """Shorten *text* (after ``model._safe_str``) to about *limit* characters.

    Strings within the limit are returned unchanged; longer ones are cut to
    ``limit - 1`` characters (at least one), right-stripped, and suffixed
    with an ellipsis.
    """
    label = model._safe_str(text)
    if len(label) > limit:
        keep = max(1, limit - 1)
        return label[:keep].rstrip() + "…"
    return label
|
||||
|
||||
|
||||
def _is_categorical(col: dict) -> bool:
|
||||
"""A column is treated as categorical when it carries a non-empty top list
|
||||
and is not a pure numeric column (numeric columns may still expose a top)."""
|
||||
if not isinstance(col, dict):
|
||||
return False
|
||||
cat = col.get("categorical")
|
||||
if not (isinstance(cat, dict) and cat.get("top")):
|
||||
return False
|
||||
if col.get("inferred_type") == "numeric":
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _cardinality(cat: dict, n_rows) -> dict:
    """Derive the cardinality metrics of one column.

    Tries the lazily imported registry function
    ``categorical_cardinality_block``; if the import or the call fails, or the
    result is not a dict, the inline ``_fallback_cardinality`` is used instead.
    """
    try:
        from datascience.categorical_cardinality_block import (
            categorical_cardinality_block,
        )

        derived = categorical_cardinality_block(cat=cat, n_rows=n_rows)
    except Exception:  # noqa: BLE001 — any failure selects the inline fallback.
        derived = None
    if isinstance(derived, dict):
        return derived
    return _fallback_cardinality(cat, n_rows)
|
||||
|
||||
|
||||
def _fallback_cardinality(cat: dict, n_rows) -> dict:
|
||||
cat = cat or {}
|
||||
top = cat.get("top") or []
|
||||
n_distinct = cat.get("n_distinct")
|
||||
entropy = cat.get("entropy")
|
||||
try:
|
||||
nr = int(n_rows) if n_rows is not None else None
|
||||
except (TypeError, ValueError):
|
||||
nr = None
|
||||
pct_distinct = None
|
||||
if isinstance(n_distinct, (int, float)) and nr:
|
||||
pct_distinct = float(n_distinct) / nr * 100.0
|
||||
entropy_max = None
|
||||
if isinstance(n_distinct, (int, float)):
|
||||
entropy_max = math.log2(n_distinct) if n_distinct > 1 else 0.0
|
||||
entropy_norm = None
|
||||
if isinstance(entropy, (int, float)) and entropy_max:
|
||||
entropy_norm = max(0.0, min(1.0, float(entropy) / entropy_max))
|
||||
mode_pct = cat.get("mode_pct")
|
||||
if mode_pct is None and top and isinstance(top[0], dict):
|
||||
mode_pct = top[0].get("pct")
|
||||
# Normalize to a 0–100 scale: summarize_categorical emits a 0–1 fraction.
|
||||
if isinstance(mode_pct, (int, float)) and not isinstance(mode_pct, bool):
|
||||
mode_pct = float(mode_pct) * 100.0 if mode_pct <= 1.0 else float(mode_pct)
|
||||
else:
|
||||
mode_pct = None
|
||||
n_singletons = None
|
||||
if top:
|
||||
n_singletons = sum(
|
||||
1 for t in top if isinstance(t, dict) and t.get("count") == 1)
|
||||
return {
|
||||
"n_distinct": n_distinct,
|
||||
"n_rows": nr,
|
||||
"pct_distinct": pct_distinct,
|
||||
"entropy": entropy,
|
||||
"entropy_max": entropy_max,
|
||||
"entropy_norm": entropy_norm,
|
||||
"mode": cat.get("mode"),
|
||||
"mode_pct": mode_pct,
|
||||
"imbalance": cat.get("imbalance"),
|
||||
"n_singletons": n_singletons,
|
||||
"n_singletons_partial": (
|
||||
isinstance(n_distinct, (int, float)) and n_distinct > len(top)),
|
||||
"len_min": cat.get("len_min"),
|
||||
"len_mean": cat.get("len_mean"),
|
||||
"len_max": cat.get("len_max"),
|
||||
"id_like": pct_distinct is not None and pct_distinct >= 99.0,
|
||||
"dominated": mode_pct is not None and mode_pct >= 90.0,
|
||||
}
|
||||
|
||||
|
||||
def _pie_make(top, n_distinct, title, n_rows):
|
||||
"""Return a zero-arg callable that builds the donut figure lazily."""
|
||||
|
||||
def make():
|
||||
try:
|
||||
from datascience.categorical_top_pie_figure import (
|
||||
categorical_top_pie_figure,
|
||||
)
|
||||
|
||||
return categorical_top_pie_figure(
|
||||
top=top, n_distinct=n_distinct or 0, title=title,
|
||||
top_k=PIE_TOP_K, n_rows=n_rows)
|
||||
except Exception: # noqa: BLE001 — minimal local fallback figure.
|
||||
return _fallback_pie(top, title)
|
||||
|
||||
return make
|
||||
|
||||
|
||||
def _fallback_pie(top, title):
|
||||
"""Minimal donut figure used only if the registry function is unavailable."""
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
|
||||
fig = Figure(figsize=(5.0, 3.2))
|
||||
ax = fig.add_subplot(111)
|
||||
items = [t for t in (top or [])
|
||||
if isinstance(t, dict) and isinstance(t.get("count"), (int, float))]
|
||||
items = sorted(items, key=lambda t: t.get("count") or 0, reverse=True)
|
||||
head = items[:PIE_TOP_K]
|
||||
rest = items[PIE_TOP_K:]
|
||||
labels = [_truncate(t.get("value"), 20) for t in head]
|
||||
sizes = [float(t.get("count") or 0) for t in head]
|
||||
if rest:
|
||||
labels.append(f"Otros ({len(rest)})")
|
||||
sizes.append(sum(float(t.get("count") or 0) for t in rest))
|
||||
if not sizes or sum(sizes) <= 0:
|
||||
ax.text(0.5, 0.5, "sin datos categóricos", ha="center", va="center")
|
||||
ax.axis("off")
|
||||
return fig
|
||||
ax.pie(sizes, labels=None, wedgeprops={"width": 0.42},
|
||||
autopct=lambda p: f"{p:.0f}%" if p >= 4 else "")
|
||||
ax.legend(labels, loc="center left", bbox_to_anchor=(1.0, 0.5),
|
||||
fontsize=7, frameon=False)
|
||||
ax.set_title(_truncate(title, 40))
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
|
||||
|
||||
def _normalize_card(card: dict) -> dict:
|
||||
"""Make the cardinality dict robust regardless of the upstream scale.
|
||||
|
||||
``summarize_categorical`` emits ``mode_pct`` as a 0–1 fraction; bring it to a
|
||||
0–100 scale and recompute the ``dominated`` flag here so the chapter is
|
||||
correct whether it consumed the registry function or the inline fallback.
|
||||
"""
|
||||
card = dict(card or {})
|
||||
mp = card.get("mode_pct")
|
||||
if isinstance(mp, (int, float)) and not isinstance(mp, bool):
|
||||
mp = float(mp) * 100.0 if mp <= 1.0 else float(mp)
|
||||
else:
|
||||
mp = None
|
||||
card["mode_pct"] = mp
|
||||
card["dominated"] = mp is not None and mp >= 90.0
|
||||
pd = card.get("pct_distinct")
|
||||
card["id_like"] = isinstance(pd, (int, float)) and pd >= 99.0
|
||||
return card
|
||||
|
||||
|
||||
def _cardinality_block(card: dict):
    """Build the "Cardinalidad" KVTable for one column from its metrics dict.

    Always emits the distinct / % distinct / total rows / singleton / entropy /
    normalized entropy / mode rows; the imbalance and string-length rows are
    appended only when the corresponding values are present.
    """
    singleton_count = card.get("n_singletons")
    if singleton_count is None:
        singletons = "—"
    elif card.get("n_singletons_partial"):
        # Only the displayed top was inspected, so this is a lower bound.
        singletons = f"≥{_fmt_int(singleton_count)} (en top mostrado)"
    else:
        singletons = _fmt_int(singleton_count)

    entropy_cell = _fmt_num(card.get("entropy"))
    entropy_max = card.get("entropy_max")
    if entropy_max is not None:
        entropy_cell = f"{entropy_cell} (máx {_fmt_num(entropy_max)})"

    mode = card.get("mode")
    if mode is None:
        mode_cell = "—"
    else:
        mode_cell = model._safe_str(mode)
        mode_pct = card.get("mode_pct")
        if mode_pct is not None:
            mode_cell = f"{mode_cell} ({_fmt_pct_value(mode_pct)})"

    rows = [
        ("Valores distintos", _fmt_int(card.get("n_distinct"))),
        ("% distintos", _fmt_pct_value(card.get("pct_distinct"))),
        ("Total filas (dataset)", _fmt_int(card.get("n_rows"))),
        ("Valores únicos (frecuencia 1)", singletons),
        ("Entropía (bits)", entropy_cell),
        ("Entropía normalizada (0–1)", _fmt_num(card.get("entropy_norm"))),
        ("Moda", mode_cell),
    ]

    imbalance = card.get("imbalance")
    if imbalance is not None:
        rows.append(("Desbalance", _fmt_num(imbalance)))

    lengths = (card.get("len_min"), card.get("len_mean"), card.get("len_max"))
    if not all(v is None for v in lengths):
        shortest, average, longest = lengths
        rows.append((
            "Longitud (mín/media/máx)",
            f"{_fmt_num(shortest)} / {_fmt_num(average)} / {_fmt_num(longest)}"))

    return model.KVTable(rows=rows, title="Cardinalidad")
|
||||
|
||||
|
||||
def _flag_note(card: dict):
    """Return a Note for problematic cardinality, or ``None`` if there is none.

    The ``id_like`` flag takes precedence over ``dominated``; at most one note
    is produced per column.
    """
    if card.get("id_like"):
        return model.Note(
            "Casi todos los valores son distintos (≈100% distintos): la columna "
            "se comporta como un identificador y aporta poco para agrupar o "
            "comparar categorías.")
    if not card.get("dominated"):
        return None
    mode_pct = card.get("mode_pct")
    share = "muy alta" if mode_pct is None else _fmt_pct_value(mode_pct)
    return model.Note(
        f"Una sola categoría domina la columna (moda {share}): la "
        "distribución está muy desbalanceada.")
|
||||
|
||||
|
||||
def _topk_table(cat: dict):
    """Build the value / count / % DataTable for the most frequent categories.

    Shows at most ``TOP_TABLE_ROWS`` dict entries of ``cat["top"]``; returns
    ``None`` when there is no usable entry. The table note states how many
    categories are shown out of ``n_distinct`` when the list is partial.

    The percent scale is decided once for the whole column: if any ``pct`` in
    ``top`` exceeds 1.0 the list is already on a 0–100 scale and is formatted
    as-is; otherwise values are treated as 0–1 fractions. Deciding per cell
    would render a 0.5 % entry of a percent-scaled list as "50.0%".
    """
    top = cat.get("top") or []
    n_distinct = cat.get("n_distinct")
    entries = [t for t in top[:TOP_TABLE_ROWS] if isinstance(t, dict)]
    if not entries:
        return None

    def _is_over_one(pct) -> bool:
        return (isinstance(pct, (int, float))
                and not isinstance(pct, bool)
                and pct > 1.0)

    percent_scaled = any(
        _is_over_one(t.get("pct")) for t in top if isinstance(t, dict))
    fmt_pct = _fmt_pct_value if percent_scaled else _pct_from_maybe_fraction

    rows = [
        [
            model._safe_str(t.get("value")),
            _fmt_int(t.get("count")),
            fmt_pct(t.get("pct")),
        ]
        for t in entries
    ]

    shown = len(rows)
    if isinstance(n_distinct, (int, float)) and n_distinct > shown:
        note = f"top {shown} de {_fmt_int(n_distinct)} categorías distintas"
    else:
        note = f"{shown} categorías"
    return model.DataTable(header=["Valor", "Conteo", "%"], rows=rows,
                           title="Top categorías", note=note)
|
||||
|
||||
|
||||
def _intro_blocks(n_rows):
    """Return the chapter's opening blocks: a level-2 heading plus a Markdown
    paragraph explaining Shannon entropy and the per-column metrics.

    When *n_rows* is not ``None`` a closing sentence with the dataset row total
    is appended to the paragraph.
    """
    explanation = (
        "La **entropía de Shannon** mide cómo de repartidos están los valores de "
        "una columna categórica, en bits. Vale 0 cuando una sola categoría "
        "concentra todas las filas (máxima previsibilidad) y alcanza su máximo, "
        "log2(k) para k categorías distintas, cuando todas aparecen por igual "
        "(máxima diversidad). La **entropía normalizada** (entropía dividida por "
        "su máximo) la lleva al rango 0–1 para comparar columnas con distinto "
        "número de categorías. Para cada columna se muestran los valores "
        "distintos, el porcentaje que representan sobre el total de filas, los "
        "valores únicos (que aparecen una sola vez), la tabla de las categorías "
        "más frecuentes y un gráfico de tarta (donut) de las más comunes."
    )
    if n_rows is not None:
        row_total = _fmt_int(n_rows)
        explanation = (
            explanation
            + f" El dataset tiene {row_total} filas en total como referencia.")
    heading = model.Heading(text="Entropía y cardinalidad", level=2)
    paragraph = model.Markdown(text=explanation)
    return [heading, paragraph]
|
||||
|
||||
|
||||
def build_cat_distr(profile: dict, ctx: dict):
    """Build the categorical-distributions Chapter.

    Returns ``None`` when *profile* has no categorical columns. Otherwise the
    chapter holds the intro blocks followed, for each of the first ``MAX_COLS``
    categorical columns, by: a heading, the cardinality KVTable, an optional
    flag note, the top-k table (when it has rows) and a lazily built donut
    figure. A closing note reports how many columns were left out, if any.
    """
    profile = profile or {}
    ctx = ctx or {}  # Part of the chapter contract; not read by this builder.

    cat_cols = [
        col for col in (profile.get("columns") or []) if _is_categorical(col)]
    if not cat_cols:
        return None

    n_rows = profile.get("n_rows")
    blocks = list(_intro_blocks(n_rows))
    rendered = cat_cols[:MAX_COLS]

    for col in rendered:
        name = col.get("name") or "(columna)"
        title = str(name)
        cat = col.get("categorical") or {}
        card = _normalize_card(_cardinality(cat, n_rows))

        blocks.append(model.Heading(text=title, level=2))
        blocks.append(_cardinality_block(card))

        for optional in (_flag_note(card), _topk_table(cat)):
            if optional is not None:
                blocks.append(optional)

        make_pie = _pie_make(
            cat.get("top") or [], card.get("n_distinct"), title, n_rows)
        caption = (f"Categorías más comunes de «{_truncate(name, 32)}» "
                   "(donut: top-k + «Otros»)")
        blocks.append(model.Figure(make=make_pie, caption=caption))

    omitted = len(cat_cols) - len(rendered)
    if omitted > 0:
        blocks.append(model.Note(
            f"Se muestran las primeras {len(rendered)} columnas categóricas; "
            f"quedan {omitted} sin mostrar para mantener acotado el informe."))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -0,0 +1,186 @@
|
||||
"""Tests for the CAT DISTR chapter — DoD: golden + edges + anti-cut.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user
|
||||
asked for (entropy intro, distinct/total/%-distinct/unique metrics, top-k table
|
||||
and a donut figure), that the chapter renders inside the full document to both
|
||||
PDF and PPTX showing that content, that a profile with no categorical columns
|
||||
yields ``None`` without raising, and that long labels / many columns are never
|
||||
cut in either output.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import (
|
||||
DataTable, Figure, Heading, KVTable, Note,
|
||||
)
|
||||
from datascience.automatic_eda.chapters.cat_distr import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_cat_distr,
|
||||
)
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _profile() -> dict:
|
||||
return {
|
||||
"table": "productos",
|
||||
"source": "/data/productos.csv",
|
||||
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||
"n_rows": 1000,
|
||||
"n_cols": 3,
|
||||
"quality_score": 90.0,
|
||||
"columns": [
|
||||
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"null_count": 0,
|
||||
"numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
|
||||
"max": 100.0, "std": 12.3}},
|
||||
{"name": "categoria", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "null_count": 0, "distinct_count": 8,
|
||||
"categorical": {
|
||||
"top": [
|
||||
{"value": "neumaticos", "count": 500, "pct": 0.5},
|
||||
{"value": "aceite", "count": 300, "pct": 0.3},
|
||||
{"value": "filtros", "count": 120, "pct": 0.12},
|
||||
{"value": "frenos", "count": 80, "pct": 0.08},
|
||||
],
|
||||
"mode": "neumaticos", "n_distinct": 8, "entropy": 1.6,
|
||||
"imbalance": 6.25, "len_min": 6, "len_mean": 7.5,
|
||||
"len_max": 10}},
|
||||
{"name": "uuid", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "null_count": 0, "distinct_count": 1000,
|
||||
"categorical": {
|
||||
"top": [{"value": f"id-{i}", "count": 1} for i in range(5)],
|
||||
"mode": "id-0", "n_distinct": 1000, "entropy": 9.97,
|
||||
"imbalance": 1.0}},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Return the text extracted from every page of the PDF at *path*, with
    each whitespace run collapsed to a single space."""
    reader = PdfReader(path)
    chunks = []
    for page in reader.pages:
        chunks.append(page.extract_text() or "")
    return re.sub(r"\s+", " ", "".join(chunks))
|
||||
|
||||
|
||||
def _pptx_text(path: str) -> str:
    """Return the text of every text frame and table cell found in the
    top-level shapes of the PPTX at *path*, joined by spaces with whitespace
    runs collapsed."""
    presentation = Presentation(path)
    parts = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                parts.append(shape.text_frame.text)
            if not shape.has_table:
                continue
            table = shape.table
            n_rows = len(table.rows)
            n_cols = len(table.columns)
            parts.extend(
                table.cell(r, c).text
                for r in range(n_rows)
                for c in range(n_cols))
    return re.sub(r"\s+", " ", " ".join(parts))
|
||||
|
||||
|
||||
def _kinds(chapter):
|
||||
return [b.kind for b in chapter.blocks]
|
||||
|
||||
|
||||
def test_golden_build_cat_distr_emite_bloques_pedidos():
    """Golden path: the built chapter carries the entropy intro, the
    cardinality metrics, a top-k table, a figure and the id-like note."""
    ch = build_cat_distr(_profile(), {})
    assert ch is not None
    assert ch.id == CHAPTER_ID
    assert ch.version == CHAPTER_VERSION
    # Result unused; the call itself fails if any block lacks ``.kind``.
    kinds = _kinds(ch)
    blocks = ch.blocks

    # Entropy intro present.
    heading_texts = [b.text for b in blocks if isinstance(b, Heading)]
    assert any("Entrop" in text for text in heading_texts)
    md = next(b for b in blocks if b.kind == "markdown")
    assert "entropía" in md.text.lower() and "log2" in md.text

    # Cardinality metrics: distinct, total rows, %-distinct, unique values.
    kv = next(b for b in blocks if isinstance(b, KVTable))
    labels = [row[0] for row in kv.rows]
    for expected in ("Valores distintos", "% distintos",
                     "Total filas (dataset)",
                     "Valores únicos (frecuencia 1)"):
        assert expected in labels
    assert any("Entropía" in label for label in labels)

    # Top-k table + pie figure.
    dt = next(b for b in blocks if isinstance(b, DataTable))
    assert dt.header == ["Valor", "Conteo", "%"]
    cells = [str(cell) for row in dt.rows for cell in row]
    assert any("neumaticos" in cell for cell in cells)
    assert any(isinstance(b, Figure) for b in blocks)

    # id-like column flagged with a Note.
    assert any(isinstance(b, Note) and "identificador" in b.text
               for b in blocks)
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_categoricas():
    """The PDF render lists this chapter and its text shows the entropy intro,
    the categorical content, the figure caption and the id-like note."""
    with tempfile.TemporaryDirectory() as workdir:
        out = os.path.join(workdir, "eda.pdf")
        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
        assert res["path"] == out and os.path.exists(out)
        chapter_ids = [c["id"] for c in res["chapters"]]
        assert CHAPTER_ID in chapter_ids
        txt = _pdf_text(out)
        # "donut" comes from the figure caption, "identificador" from the
        # id-like note.
        for fragment in ("Entrop", "distintos", "categoria", "neumaticos",
                         "donut", "identificador"):
            assert fragment in txt
|
||||
|
||||
|
||||
def test_golden_render_pptx_muestra_categoricas():
    """The PPTX render lists this chapter and its text shows the entropy intro
    and the categorical content."""
    with tempfile.TemporaryDirectory() as workdir:
        out = os.path.join(workdir, "eda.pptx")
        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
        assert res["path"] == out and os.path.exists(out)
        chapter_ids = [c["id"] for c in res["chapters"]]
        assert CHAPTER_ID in chapter_ids
        txt = _pptx_text(out)
        for fragment in ("Entrop", "categoria", "neumaticos", "distintos"):
            assert fragment in txt
|
||||
|
||||
|
||||
def test_edge_sin_categoricas_devuelve_none():
    """Profiles without categorical columns yield ``None`` and never raise."""
    numeric_col = {"name": "x", "inferred_type": "numeric",
                   "numeric": {"mean": 1.0}}
    only_numeric = {"n_rows": 10, "columns": [numeric_col]}
    assert build_cat_distr(only_numeric, {}) is None
    # None / empty / no-columns never raise and yield None.
    for profile, ctx in ((None, None), ({}, {}), ({"columns": []}, {})):
        assert build_cat_distr(profile, ctx) is None
|
||||
|
||||
|
||||
def test_anti_corte_label_largo_y_muchas_columnas():
    """Anti-cut: 30 categorical columns with a very long label render to a
    multi-page PDF that keeps every word, and the PPTX render succeeds."""
    long_label = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed "
                  "do eiusmod tempor incididunt ut labore reprehenderit voluptate")
    cols = [
        {
            "name": f"cat_{i}", "inferred_type": "categorical",
            "distinct_count": 3,
            "categorical": {
                "top": [{"value": long_label, "count": 60},
                        {"value": "b", "count": 30},
                        {"value": "c", "count": 10}],
                "mode": long_label, "n_distinct": 3, "entropy": 1.2,
            },
        }
        for i in range(30)
    ]
    profile = {"table": "t", "source": "t.csv", "n_rows": 100,
               "n_cols": len(cols), "columns": cols}

    ch = build_cat_distr(profile, {})
    assert ch is not None

    options = {"write_manifest": False}
    with tempfile.TemporaryDirectory() as workdir:
        pdf = os.path.join(workdir, "anti.pdf")
        res = render_automatic_eda_pdf(profile, pdf, options)
        assert res["path"] == pdf
        assert res["n_pages"] > 1  # many columns spilled across pages, OK.
        txt = _pdf_text(pdf)
        # Long label wrapped (not truncated): every word survives.
        surviving = ("Lorem", "incididunt", "reprehenderit", "voluptate")
        for word in surviving:
            assert word in txt
        # PPTX path must not raise either.
        pptx = os.path.join(workdir, "anti.pptx")
        res2 = render_automatic_eda_pptx(profile, pptx,
                                         {"write_manifest": False})
        assert res2["path"] == pptx and os.path.exists(pptx)
|
||||
Reference in New Issue
Block a user