Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 26569c7015 | |||
| 44622339fa | |||
| c0d44a6352 | |||
| cab0fbf0a3 | |||
| 7f304adc9c | |||
| 44be1d6b58 | |||
| 64306f3b1c |
File diff suppressed because one or more lines are too long
@@ -5,28 +5,32 @@ page (PDF) / slide (PPTX)**: every column is wrapped in a keep-together
|
||||
``model.Group`` with ``page_break_before=True`` (except the first, which may share
|
||||
the intro's page), so its chart sits next to its tables and no column is split.
|
||||
|
||||
A short intro names the clickable **[[term:entropia]]entropía[[/term]]** term —
|
||||
the full definition lives in the GLOSARIO chapter, so it is NOT repeated inline
|
||||
here (one click jumps to the glossary entry). The intro also carries the dataset
|
||||
row total used as a comparison baseline.
|
||||
Per column the Group is laid out ``side_by_side`` (PPTX: cardinality table LEFT,
|
||||
chart RIGHT; PDF: stacked) and contains, in order:
|
||||
|
||||
Per column the Group contains, in order:
|
||||
|
||||
1. A cardinality key/value table: distinct values, ``% distinct`` (distinct /
|
||||
1. The column name plus, when the LLM layer ran, its business **description** and
|
||||
**unit** (read from ``profile['llm']['dictionary']``, matched by column name).
|
||||
2. A cardinality key/value table: distinct values, ``% distinct`` (distinct /
|
||||
total rows), total dataset rows, singleton values (frequency 1), entropy with
|
||||
its theoretical maximum and the normalized ratio, mode, imbalance and
|
||||
string-length stats.
|
||||
2. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
single dominating category).
|
||||
3. A ``top-k`` table (value / count / %).
|
||||
4. A **donut pie chart** of the most common categories (top-k + an "Otros"
|
||||
4. A ``top-k`` table (value / count / %).
|
||||
5. A **horizontal bar chart** of the most common categories (top-k + an "Otros"
|
||||
bucket), drawn lazily so the renderers scale it to fit entirely.
|
||||
|
||||
A short intro names the clickable **[[term:entropia]]entropía[[/term]]** and
|
||||
**[[term:pagina_categorica]]page-layout[[/term]]** terms — their full
|
||||
definitions live in the GLOSARIO chapter, so they are NOT repeated inline here
|
||||
(one click jumps to the glossary entry). The dataset row total is not repeated
|
||||
in the intro: each column's cardinality table already carries it.
|
||||
|
||||
Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the
|
||||
output of ``summarize_categorical`` (``top[{value,count,pct}]``, ``mode``,
|
||||
``n_distinct``, ``entropy``, ``imbalance``, ``len_min/mean/max``). The derived
|
||||
cardinality metrics and the pie figure are delegated to two registry functions
|
||||
(``categorical_cardinality_block`` and ``categorical_top_pie_figure``); both are
|
||||
cardinality metrics and the bar figure are delegated to two registry functions
|
||||
(``categorical_cardinality_block`` and ``categorical_top_bar_figure``); both are
|
||||
imported lazily and degrade to a minimal inline fallback so this chapter never
|
||||
raises even if they are unavailable.
|
||||
|
||||
@@ -39,10 +43,21 @@ import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_VERSION = "1.3.0"
|
||||
CHAPTER_ID = "cat_distr"
|
||||
CHAPTER_TITLE = "Distribuciones categóricas"
|
||||
|
||||
# Key under which eda_llm_insights stores its interpretive block in the profile.
|
||||
LLM_KEY = "llm"
|
||||
|
||||
# Second glossary term this chapter names: "how each categorical page is laid
|
||||
# out". The long paragraph that used to describe it inline in the intro now lives
|
||||
# in the GLOSARIO chapter (canonical definition in ``glosario._BASELINE_TERMS``);
|
||||
# the intro only names the clickable term, relocating the explanation, not losing
|
||||
# it. The chapter only needs to register key+label here.
|
||||
_TERM_PAGINA_KEY = "pagina_categorica"
|
||||
_TERM_PAGINA_LABEL = "Cómo se organiza cada página categórica"
|
||||
|
||||
# Glossary term this chapter explains. Registered in the shared collector and
|
||||
# marked clickable on its first appearance (end-to-end glossary example —
|
||||
# mejora 6). Other chapters hook their own terms the same way (see the contract).
|
||||
@@ -59,14 +74,14 @@ _TERM_ENTROPIA_DEF = (
|
||||
# Cap the number of categorical columns rendered to keep the document bounded;
|
||||
# the rest are summarized in a closing note (no silent truncation).
|
||||
MAX_COLS = 40
|
||||
# Rows shown in each top-k table and explicit slices in the pie. Kept moderate so
|
||||
# the whole column — cardinality table + top-k table + donut — fits on ONE
|
||||
# Rows shown in each top-k table and explicit bars in the chart. Kept moderate so
|
||||
# the whole column — cardinality table + top-k table + bar chart — fits on ONE
|
||||
# page/slide with the chart next to its tables; the table note still reports
|
||||
# "top N of M" so nothing is silently hidden. For id-like columns (≈100%
|
||||
# distinct) the top-k table is dropped entirely (it would be a list of unique
|
||||
# values — pure noise), which also frees the room the donut needs (see build).
|
||||
# values — pure noise), which also frees the room the chart needs (see build).
|
||||
TOP_TABLE_ROWS = 8
|
||||
PIE_TOP_K = 6
|
||||
CHART_TOP_K = 6
|
||||
# Truncate very long category labels in tables (the renderer also wraps). Kept
|
||||
# tight so a column with long id-like values (names, tickets) still fits its page.
|
||||
LABEL_MAX = 28
|
||||
@@ -208,26 +223,74 @@ def _fallback_cardinality(cat: dict, n_rows) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _pie_make(top, n_distinct, title, n_rows):
|
||||
"""Return a zero-arg callable that builds the donut figure lazily."""
|
||||
def _llm_index(profile: dict, ctx: dict) -> dict:
|
||||
"""Map column name -> its LLM dictionary entry (description/unit/...).
|
||||
|
||||
Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the
|
||||
profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty
|
||||
dict when ``run_llm`` did not run, so the caller degrades cleanly. Fully
|
||||
defensive: never raises on malformed input.
|
||||
"""
|
||||
llm = profile.get(LLM_KEY)
|
||||
if not isinstance(llm, dict):
|
||||
llm = ctx.get(LLM_KEY)
|
||||
if not isinstance(llm, dict):
|
||||
return {}
|
||||
entries = llm.get("dictionary")
|
||||
if not isinstance(entries, (list, tuple)):
|
||||
return {}
|
||||
index: dict = {}
|
||||
for e in entries:
|
||||
if not isinstance(e, dict):
|
||||
continue
|
||||
col = e.get("column")
|
||||
if col is None:
|
||||
continue
|
||||
index[model._safe_str(col)] = e
|
||||
return index
|
||||
|
||||
|
||||
def _llm_desc_unit_block(name: str, llm_index: dict):
    """Build the per-column "description · unit" markdown block.

    Args:
        name: Column name; normalized with ``model._safe_str`` before lookup.
        llm_index: Mapping of column name -> LLM dictionary entry.

    Returns:
        A ``model.Markdown`` block whose text joins the non-empty parts
        (``**Descripción:** …`` and/or ``**Unidad:** …``) with ``" · "``, or
        ``None`` when no entry matches the column or the entry has neither a
        description nor a unit (clean fallback without LLM).

    The description comes from ``entry["description"]`` and falls back to
    ``entry["business_meaning"]``. The fallback is decided on the *normalized*
    text, so a whitespace-only description no longer hides a usable
    ``business_meaning``.
    """
    entry = llm_index.get(model._safe_str(name))
    if not isinstance(entry, dict):
        return None

    def _clean(value) -> str:
        # Falsy values (None, "", 0) count as absent; inner whitespace runs
        # (including newlines) collapse to single spaces.
        return " ".join(model._safe_str(value).split()) if value else ""

    desc = (_clean(entry.get("description"))
            or _clean(entry.get("business_meaning")))
    unit = _clean(entry.get("unit"))

    parts = []
    if desc:
        parts.append(f"**Descripción:** {desc}")
    if unit:
        parts.append(f"**Unidad:** {unit}")
    if not parts:
        return None
    return model.Markdown(text=" · ".join(parts))
|
||||
|
||||
|
||||
def _bar_make(top, n_distinct, title, n_rows):
|
||||
"""Return a zero-arg callable that builds the bar figure lazily."""
|
||||
|
||||
def make():
|
||||
try:
|
||||
from datascience.categorical_top_pie_figure import (
|
||||
categorical_top_pie_figure,
|
||||
from datascience.categorical_top_bar_figure import (
|
||||
categorical_top_bar_figure,
|
||||
)
|
||||
|
||||
return categorical_top_pie_figure(
|
||||
return categorical_top_bar_figure(
|
||||
top=top, n_distinct=n_distinct or 0, title=title,
|
||||
top_k=PIE_TOP_K, n_rows=n_rows)
|
||||
top_k=CHART_TOP_K, n_rows=n_rows)
|
||||
except Exception: # noqa: BLE001 — minimal local fallback figure.
|
||||
return _fallback_pie(top, title)
|
||||
return _fallback_bar(top, title)
|
||||
|
||||
return make
|
||||
|
||||
|
||||
def _fallback_pie(top, title):
|
||||
"""Minimal donut figure used only if the registry function is unavailable."""
|
||||
def _fallback_bar(top, title):
|
||||
"""Minimal horizontal-bar figure used only if the registry function is
|
||||
unavailable. Largest category on top, the rest folded into "Otros"."""
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
@@ -238,8 +301,8 @@ def _fallback_pie(top, title):
|
||||
items = [t for t in (top or [])
|
||||
if isinstance(t, dict) and isinstance(t.get("count"), (int, float))]
|
||||
items = sorted(items, key=lambda t: t.get("count") or 0, reverse=True)
|
||||
head = items[:PIE_TOP_K]
|
||||
rest = items[PIE_TOP_K:]
|
||||
head = items[:CHART_TOP_K]
|
||||
rest = items[CHART_TOP_K:]
|
||||
labels = [_truncate(t.get("value"), 20) for t in head]
|
||||
sizes = [float(t.get("count") or 0) for t in head]
|
||||
if rest:
|
||||
@@ -249,10 +312,13 @@ def _fallback_pie(top, title):
|
||||
ax.text(0.5, 0.5, "sin datos categóricos", ha="center", va="center")
|
||||
ax.axis("off")
|
||||
return fig
|
||||
ax.pie(sizes, labels=None, wedgeprops={"width": 0.42},
|
||||
autopct=lambda p: f"{p:.0f}%" if p >= 4 else "")
|
||||
ax.legend(labels, loc="center left", bbox_to_anchor=(1.0, 0.5),
|
||||
fontsize=7, frameon=False)
|
||||
# barh draws bottom-up, so reverse to put the largest category on top.
|
||||
y_pos = range(len(labels))
|
||||
ax.barh(list(y_pos), list(reversed(sizes)), color="#4C72B0",
|
||||
edgecolor="white")
|
||||
ax.set_yticks(list(y_pos))
|
||||
ax.set_yticklabels(list(reversed(labels)), fontsize=7)
|
||||
ax.set_xlabel("conteo", fontsize=8)
|
||||
ax.set_title(_truncate(title, 40))
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
@@ -373,22 +439,17 @@ def _topk_table(cat: dict):
|
||||
note=note)
|
||||
|
||||
|
||||
def _intro_blocks(n_rows, mark_term: bool = False):
|
||||
total = _fmt_int(n_rows)
|
||||
# Mark the first appearance of the term as a clickable glossary jump when the
|
||||
# term was registered (mark_term). The full definition of entropy lives in the
|
||||
# GLOSARIO chapter, so the intro only names the clickable term here instead of
|
||||
# repeating the long explanation (avoids the redundancy with the glossary).
|
||||
def _intro_blocks(mark_term: bool = False):
|
||||
# The full explanation of entropy AND of how each categorical page is laid out
|
||||
# lives in the GLOSARIO chapter; the chapter body keeps only the minimal
|
||||
# clickable terms — no descriptive prose — to avoid duplicating the glossary.
|
||||
# The dataset row total is not repeated here: each column's cardinality table
|
||||
# already carries "Total filas (dataset)".
|
||||
entropia = ("[[term:entropia]]entropía[[/term]]" if mark_term
|
||||
else "entropía")
|
||||
text = (
|
||||
f"Cada columna categórica ocupa su propia página: sus métricas de "
|
||||
f"cardinalidad —incluida la {entropia}—, una nota que señala cardinalidad "
|
||||
"problemática, la tabla de las categorías más frecuentes y un gráfico de "
|
||||
"tarta (donut) de las más comunes, todo junto."
|
||||
)
|
||||
if n_rows is not None:
|
||||
text += f" El dataset tiene {total} filas en total como referencia."
|
||||
pagina = ("[[term:pagina_categorica]]cómo se organiza cada página[[/term]]"
|
||||
if mark_term else "cómo se organiza cada página")
|
||||
text = f"Términos: {entropia} · {pagina}."
|
||||
return [
|
||||
model.Heading(text="Entropía y cardinalidad", level=2),
|
||||
model.Markdown(text=text),
|
||||
@@ -406,15 +467,22 @@ def build_cat_distr(profile: dict, ctx: dict):
|
||||
return None
|
||||
|
||||
n_rows = profile.get("n_rows")
|
||||
# Register "entropía" in the shared glossary collector (if present) and mark
|
||||
# its first appearance clickable. End-to-end glossary example (mejora 6).
|
||||
# Register "entropía" and the "how each categorical page is laid out" term in
|
||||
# the shared glossary collector (if present) and mark their first appearance
|
||||
# clickable. End-to-end glossary example (mejora 6).
|
||||
glossary = ctx.get("glossary")
|
||||
mark_term = False
|
||||
if isinstance(glossary, model.GlossaryCollector):
|
||||
glossary.add(_TERM_ENTROPIA_KEY, _TERM_ENTROPIA_LABEL,
|
||||
_TERM_ENTROPIA_DEF)
|
||||
glossary.add(_TERM_PAGINA_KEY, _TERM_PAGINA_LABEL)
|
||||
mark_term = True
|
||||
blocks = list(_intro_blocks(n_rows, mark_term=mark_term))
|
||||
blocks = list(_intro_blocks(mark_term=mark_term))
|
||||
|
||||
# Business description + unit per column come from the LLM dictionary
|
||||
# (profile['llm']['dictionary'], matched by column name); absent without
|
||||
# run_llm, in which case the per-column description block is simply omitted.
|
||||
llm_index = _llm_index(profile, ctx)
|
||||
|
||||
rendered = cat_cols[:MAX_COLS]
|
||||
for idx, col in enumerate(rendered):
|
||||
@@ -422,31 +490,36 @@ def build_cat_distr(profile: dict, ctx: dict):
|
||||
cat = col.get("categorical") or {}
|
||||
card = _normalize_card(_cardinality(cat, n_rows))
|
||||
|
||||
# One Group per categorical column: heading + cardinality table + flag
|
||||
# note + top-k table + donut figure are kept together and the renderer
|
||||
# starts each on a fresh page/slide (page_break_before) so every column
|
||||
# gets its own page with its chart next to its tables. The first column
|
||||
# may share the intro's page (no forced break) to avoid a near-empty page.
|
||||
col_blocks = [
|
||||
model.Heading(text=str(name), level=2),
|
||||
_cardinality_block(card),
|
||||
]
|
||||
# One Group per categorical column: heading + (optional) LLM description +
|
||||
# cardinality table + flag note + top-k table + bar figure are kept
|
||||
# together and the renderer starts each on a fresh page/slide
|
||||
# (page_break_before) so every column gets its own page with its chart next
|
||||
# to its tables. The first column may share the intro's page (no forced
|
||||
# break) to avoid a near-empty page.
|
||||
col_blocks = [model.Heading(text=str(name), level=2)]
|
||||
desc_block = _llm_desc_unit_block(name, llm_index)
|
||||
if desc_block is not None:
|
||||
col_blocks.append(desc_block)
|
||||
col_blocks.append(_cardinality_block(card))
|
||||
note = _flag_note(card)
|
||||
if note is not None:
|
||||
col_blocks.append(note)
|
||||
# For id-like columns (≈100% distinct) the top-k is a list of unique
|
||||
# values — pure noise; skip it (the flag note already explains why) and
|
||||
# let the donut take that room so the whole column fits one page/slide.
|
||||
# let the bar chart take that room so the whole column fits one page/slide.
|
||||
if not card.get("id_like"):
|
||||
topk = _topk_table(cat)
|
||||
if topk is not None:
|
||||
col_blocks.append(topk)
|
||||
col_blocks.append(model.Figure(
|
||||
make=_pie_make(cat.get("top") or [], card.get("n_distinct"),
|
||||
make=_bar_make(cat.get("top") or [], card.get("n_distinct"),
|
||||
str(name), n_rows),
|
||||
caption=(f"Categorías más comunes de «{_truncate(name, 32)}» "
|
||||
"(donut: top-k + «Otros»)")))
|
||||
blocks.append(model.Group(blocks=col_blocks,
|
||||
"(barras: top-k + «Otros»)")))
|
||||
# layout="side_by_side": in PPTX the cardinality table goes to the LEFT and
|
||||
# the bar chart to the RIGHT of the same slide; the PDF renderer stacks it
|
||||
# (the A5 mobile page is too narrow for two readable columns).
|
||||
blocks.append(model.Group(blocks=col_blocks, layout="side_by_side",
|
||||
page_break_before=(idx > 0)))
|
||||
|
||||
if len(cat_cols) > len(rendered):
|
||||
|
||||
@@ -2,12 +2,14 @@
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user
|
||||
asked for (distinct/total/%-distinct/unique metrics, top-k table and a donut
|
||||
asked for (distinct/total/%-distinct/unique metrics, top-k table and a bar
|
||||
figure), that EACH categorical column is wrapped in its own keep-together
|
||||
``Group`` that starts on a fresh page/slide (one column per page, chart next to
|
||||
its tables), that the long entropy explanation is NOT repeated inline (it lives
|
||||
in the glossary — only the clickable term is kept), that the chapter renders
|
||||
inside the full document to both PDF and PPTX showing that content, that a
|
||||
``Group`` laid out ``side_by_side`` (PPTX: table left / bars right) that starts on
|
||||
a fresh page/slide (one column per page, chart next to its tables), that the LLM
|
||||
business description + unit are shown per column when the profile carries an LLM
|
||||
block, that the long entropy / page-layout explanations are NOT repeated inline
|
||||
(they live in the glossary — only the clickable terms are kept), that the chapter
|
||||
renders inside the full document to both PDF and PPTX showing that content, that a
|
||||
profile with no categorical columns yields ``None`` without raising, and that
|
||||
long labels / many columns are never cut in either output.
|
||||
"""
|
||||
@@ -116,6 +118,10 @@ def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
assert "log2" not in md.text # redundant explanation removed.
|
||||
assert "máxima diversidad" not in md.text
|
||||
|
||||
# The donut/pie is gone: the intro no longer mentions tarta/donut (the chart
|
||||
# is now a bar chart; the long page-layout explanation moved to the glossary).
|
||||
assert "donut" not in md.text and "tarta" not in md.text
|
||||
|
||||
# Per-column blocks are wrapped in keep-together Groups: flatten to inspect.
|
||||
flat = _flatten(ch.blocks)
|
||||
kv = next(b for b in flat if isinstance(b, KVTable))
|
||||
@@ -128,11 +134,13 @@ def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
assert any("Entropía" in lbl for lbl in labels)
|
||||
assert "únicos" in values and "%" in values
|
||||
assert "bits" in values and "norm" in values # entropy + max + normalized.
|
||||
# Top-k table + pie figure.
|
||||
# Top-k table + bar figure.
|
||||
dt = next(b for b in flat if isinstance(b, DataTable))
|
||||
assert dt.header == ["Valor", "Conteo", "%"]
|
||||
assert any("neumaticos" in str(cell) for row in dt.rows for cell in row)
|
||||
assert any(isinstance(b, Figure) for b in flat)
|
||||
# Each per-column Group is laid out side_by_side (table left / bars right).
|
||||
assert all(g.layout == "side_by_side" for g in _column_groups(ch))
|
||||
# id-like column flagged with a Note that also explains the top-k is dropped.
|
||||
idnote = next((b for b in flat
|
||||
if isinstance(b, Note) and "identificador" in b.text), None)
|
||||
@@ -140,9 +148,9 @@ def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
assert "No se lista el top" in idnote.text
|
||||
|
||||
|
||||
def test_golden_idlike_omite_topk_y_conserva_donut():
|
||||
def test_golden_idlike_omite_topk_y_conserva_grafico():
|
||||
# The id-like column (uuid, 100% distinct) must NOT carry a top-k DataTable
|
||||
# (it would be a list of unique values), but must still keep its donut Figure
|
||||
# (it would be a list of unique values), but must still keep its bar Figure
|
||||
# and its cardinality table so it stays a full per-column page.
|
||||
ch = build_cat_distr(_profile(), {})
|
||||
groups = _column_groups(ch)
|
||||
@@ -151,7 +159,7 @@ def test_golden_idlike_omite_topk_y_conserva_donut():
|
||||
kinds = [b.kind for b in uuid_group.blocks]
|
||||
assert "data_table" not in kinds # top-k of unique values dropped.
|
||||
assert "kv_table" in kinds # cardinality kept.
|
||||
assert "figure" in kinds # donut kept (chart per column).
|
||||
assert "figure" in kinds # bar chart kept (chart per column).
|
||||
# A non-id-like column keeps its top-k table.
|
||||
cat_group = next(g for g in groups
|
||||
if any(getattr(b, "text", "") == "categoria"
|
||||
@@ -205,7 +213,7 @@ def test_golden_render_pdf_una_pagina_por_columna():
|
||||
assert "Entrop" in txt
|
||||
assert "distintos" in txt
|
||||
assert "categoria" in txt and "neumaticos" in txt
|
||||
assert "donut" in txt # figure caption rendered as text.
|
||||
assert "barras" in txt # bar-chart caption rendered as text (PDF).
|
||||
assert "identificador" in txt # id-like note rendered.
|
||||
|
||||
|
||||
@@ -258,9 +266,11 @@ def _profile_high_card() -> dict:
|
||||
|
||||
|
||||
def test_golden_pptx_una_slide_por_columna_con_su_grafico():
|
||||
"""Each categorical column occupies EXACTLY ONE cat_distr slide that carries
|
||||
BOTH its cardinality table and its donut figure (picture) — i.e. the chart is
|
||||
never separated from its table, even for a high-cardinality column."""
|
||||
"""Cada columna categórica ocupa EXACTAMENTE UN slide cat_distr que lleva su
|
||||
gráfico (picture) en la misma slide — el chart nunca se separa de su columna,
|
||||
ni siquiera para una columna de alta cardinalidad. Con layout side_by_side la
|
||||
tabla se rasteriza a imagen, así que la comprobación se hace por presencia de
|
||||
picture (no por el texto de la tabla)."""
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||
|
||||
prof = _profile_high_card()
|
||||
@@ -272,7 +282,7 @@ def test_golden_pptx_una_slide_por_columna_con_su_grafico():
|
||||
prs = Presentation(out)
|
||||
|
||||
# Per column: the cat_distr slides whose text mentions it, and whether the
|
||||
# owning slide also has the donut caption + an actual picture shape.
|
||||
# owning slide also carries an actual picture shape (its chart).
|
||||
slides_with_col = {n: [] for n in cat_names}
|
||||
owner_has_chart = {n: False for n in cat_names}
|
||||
for i, sl in enumerate(prs.slides):
|
||||
@@ -288,15 +298,106 @@ def test_golden_pptx_una_slide_por_columna_con_su_grafico():
|
||||
for n in cat_names:
|
||||
if n in txt:
|
||||
slides_with_col[n].append(i)
|
||||
has_table = "Cardinalidad" in txt or "distintos" in txt
|
||||
if has_pic and "donut" in txt and has_table:
|
||||
if has_pic:
|
||||
owner_has_chart[n] = True
|
||||
|
||||
for n in cat_names:
|
||||
# Exactly one slide carries the column (not split across slides).
|
||||
assert len(slides_with_col[n]) == 1, (n, slides_with_col[n])
|
||||
# That single slide also holds its table AND its donut picture.
|
||||
assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide")
|
||||
# That single slide also holds its chart picture.
|
||||
assert owner_has_chart[n], (n, "el gráfico no está en el slide de la columna")
|
||||
|
||||
|
||||
def test_golden_pptx_columna_side_by_side_tabla_izq_barra_der():
    """With the ``side_by_side`` layout a categorical column places its
    cardinality table (as an image) in the left half and its bar chart (as an
    image) in the right half of the SAME slide. Checks that at least one
    chapter slide ends up with a picture on each side of the slide centre,
    which is the evidence of ``side_by_side`` in the PPTX output."""
    from pptx.enum.shapes import MSO_SHAPE_TYPE
    from pptx.util import Inches

    with tempfile.TemporaryDirectory() as tmp_dir:
        pptx_path = os.path.join(tmp_dir, "eda.pptx")
        render_automatic_eda_pptx(_profile(), pptx_path, {"title": "EDA"})
        prs = Presentation(pptx_path)

    centre = int(Inches(13.333 / 2.0))  # half of the 16:9 slide width.

    def _is_two_column(slide) -> bool:
        slide_texts = []
        picture_lefts = []
        for shape in slide.shapes:
            if shape.has_text_frame:
                slide_texts.append(shape.text_frame.text)
            is_picture = shape.shape_type == MSO_SHAPE_TYPE.PICTURE
            if is_picture and shape.left is not None:
                picture_lefts.append(shape.left)
        flat_text = re.sub(r"\s+", " ", " ".join(slide_texts))
        # Only slides belonging to the categorical chapter are considered.
        if "Distribuciones categ" not in flat_text:
            return False
        if len(picture_lefts) < 2:
            return False
        # One picture starts in the left half, another in the right half.
        return min(picture_lefts) < centre and max(picture_lefts) > centre

    two_col_slides = sum(1 for slide in prs.slides if _is_two_column(slide))
    assert two_col_slides >= 1, (
        "ninguna columna quedó con tabla-izq / barras-der (side_by_side)")
|
||||
|
||||
|
||||
def _profile_with_llm() -> dict:
    """Return the base profile extended with an ``llm`` block.

    The block holds a data ``dictionary`` with one entry per column
    (description / business meaning / unit), the shape the LLM layer is
    expected to store when it runs.
    """
    categoria_entry = {
        "column": "categoria",
        "description": "Familia de producto del recambio",
        "business_meaning": "Agrupa el catálogo por tipo de pieza",
        "unit": "categoría",
    }
    # The id-like column deliberately has an empty unit.
    uuid_entry = {
        "column": "uuid",
        "description": "Identificador único de registro",
        "unit": "",
    }
    profile = _profile()
    profile["llm"] = {"dictionary": [categoria_entry, uuid_entry]}
    return profile
|
||||
|
||||
|
||||
def test_llm_descripcion_y_unidad_por_columna():
    """A column matched by the LLM dictionary shows its business description
    and unit in a markdown block inside its own per-column Group."""
    chapter = build_cat_distr(_profile_with_llm(), {})

    def _is_categoria_group(group) -> bool:
        return any(getattr(block, "text", "") == "categoria"
                   for block in group.blocks)

    target = next(group for group in _column_groups(chapter)
                  if _is_categoria_group(group))
    markdown_texts = [block.text for block in target.blocks
                      if getattr(block, "kind", "") == "markdown"]
    md = " ".join(markdown_texts)

    assert "Descripción" in md
    assert "Familia de producto" in md
    assert "Unidad" in md
    assert "categoría" in md
|
||||
|
||||
|
||||
def test_edge_sin_llm_no_anade_descripcion():
    """Without an LLM block, no per-column markdown carries a description."""
    chapter = build_cat_distr(_profile(), {})
    for group in _column_groups(chapter):
        for block in group.blocks:
            if getattr(block, "kind", "") != "markdown":
                continue
            assert "Descripción" not in block.text
|
||||
|
||||
|
||||
def test_pagina_categorica_clicable_y_definicion_en_glosario():
    """The page-layout term is registered and marked clickable in the intro,
    and its full definition is supplied by the glossary chapter rather than
    inline."""
    from datascience.automatic_eda.chapters.glosario import build_glosario

    term_key = "pagina_categorica"
    collector = GlossaryCollector()
    chapter = build_cat_distr(_profile(), {"glossary": collector})

    # The first Markdown block of the chapter is the intro.
    intro = next(block for block in chapter.blocks
                 if isinstance(block, Markdown))
    assert "[[term:pagina_categorica]]" in intro.text
    assert collector.has(term_key)

    glossary_chapter = build_glosario(_profile(), {"glossary": collector})
    entry = next(
        block for block in glossary_chapter.blocks
        if getattr(block, "kind", "") == "glossary_entry"
        and block.key == term_key)
    for expected_word in ("barras", "identificador"):
        assert expected_word in entry.definition
|
||||
|
||||
|
||||
def test_edge_sin_categoricas_devuelve_none():
|
||||
|
||||
@@ -17,10 +17,63 @@ from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "glosario"
|
||||
CHAPTER_TITLE = "Glosario"
|
||||
|
||||
# Canonical definitions for cross-cutting terms — the "how to read it" entries
# that do not belong to a single chapter. A chapter only needs to *register* the
# term (``ctx['glossary'].add(key, label)``) and mark its in-text appearance with
# ``[[term:key]]…[[/term]]``; this chapter supplies the full definition here when
# the collector carries the term without one. Keeping the prose in a single place
# avoids repeating a long paragraph inline in every chapter that names the term
# (the explanation moved out of the NUM DISTR and CAT DISTR intros lives here).
#
# Shape: ``{term_key: {"label": <visible title>, "definition": <full text>}}``.
# The label and definition strings are user-facing report text.
_BASELINE_TERMS = {
    # Reading guide for the numeric-distribution figure.
    "histograma_boxplot": {
        "label": "Cómo leer el histograma y el boxplot",
        "definition": (
            "Para cada columna numérica se muestra su histograma con tres líneas "
            "de referencia: la media (línea roja discontinua), la mediana (línea "
            "verde continua) y la banda ±1σ (zona sombreada que cubre una "
            "desviación estándar a cada lado de la media). Debajo, alineado al "
            "mismo eje horizontal, un boxplot de Tukey: la caja abarca del primer "
            "al tercer cuartil (P25–P75), la línea interior es la mediana y los "
            "bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
            "valores más allá de las vallas (posibles atípicos). Comparar la media "
            "con la mediana revela la asimetría: si la media supera a la mediana la "
            "cola larga cae hacia los valores altos (asimetría a la derecha), y al "
            "revés hacia los bajos."),
    },
    # Description of what each per-column categorical page contains.
    "pagina_categorica": {
        "label": "Cómo se organiza cada página categórica",
        "definition": (
            "Cada columna categórica ocupa su propia página: muestra sus métricas "
            "de cardinalidad —incluida la entropía—, una nota que señala "
            "cardinalidad problemática (columnas que se comportan como "
            "identificador, con casi todos los valores distintos, o dominadas por "
            "una sola categoría), la tabla de las categorías más frecuentes (top-k, "
            "con su conteo y porcentaje) y un gráfico de barras de las categorías "
            "más comunes (top-k más una barra «Otros» que agrupa la cola). El total "
            "de filas del dataset se usa como referencia para interpretar los "
            "conteos."),
    },
}
|
||||
|
||||
|
||||
def _resolve_term(term: dict) -> tuple:
    """Resolve the visible ``(label, definition)`` of a collected term.

    The term's own values are used as-is unless its key exists in
    ``_BASELINE_TERMS``; in that case a blank definition is replaced by the
    canonical one, and a label that is blank or merely repeats the key is
    replaced by the canonical label (kept unchanged if that one is empty).
    """
    safe = model._safe_str
    key = safe(term.get("key"))
    label = safe(term.get("label"))
    definition = safe(term.get("definition"))

    canonical = _BASELINE_TERMS.get(key)
    if not canonical:
        # Unknown to the baseline catalog: nothing to complete.
        return label, definition

    if not definition.strip():
        definition = safe(canonical.get("definition"))

    label_is_placeholder = (not label.strip()) or label == key
    if label_is_placeholder:
        label = safe(canonical.get("label")) or label

    return label, definition
|
||||
|
||||
|
||||
def build_glosario(profile: dict, ctx: dict):
|
||||
"""Build the glossary Chapter from the shared collector, or None if empty."""
|
||||
@@ -36,12 +89,14 @@ def build_glosario(profile: dict, ctx: dict):
|
||||
"Cada término va resaltado en el texto y, al pulsarlo, salta a su "
|
||||
"definición en esta sección.")),
|
||||
]
|
||||
# One clickable destination per term, alphabetically by visible label.
|
||||
# One clickable destination per term, alphabetically by visible label. A term
|
||||
# registered without a definition is completed from the canonical baseline.
|
||||
for term in glossary.terms(by="label"):
|
||||
label, definition = _resolve_term(term)
|
||||
blocks.append(model.GlossaryEntry(
|
||||
key=model._safe_str(term.get("key")),
|
||||
label=model._safe_str(term.get("label")),
|
||||
definition=model._safe_str(term.get("definition"))))
|
||||
label=label,
|
||||
definition=definition))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -35,10 +35,21 @@ try:
|
||||
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
|
||||
build_boxplot_stats = None # type: ignore[assignment]
|
||||
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_VERSION = "1.3.0"
|
||||
CHAPTER_ID = "num_distr"
|
||||
CHAPTER_TITLE = "Distribuciones numéricas"
|
||||
|
||||
# Glossary term this chapter explains. The long "how to read the histogram and
|
||||
# the boxplot" paragraph used to live inline in the intro; it now lives in the
|
||||
# GLOSARIO chapter (canonical definition in ``glosario._BASELINE_TERMS``) and the
|
||||
# intro only names the clickable term — one click jumps to the full explanation,
|
||||
# so the information is relocated, not lost (mejora glosario).
|
||||
_TERM_HISTOBOX_KEY = "histograma_boxplot"
|
||||
_TERM_HISTOBOX_LABEL = "Cómo leer el histograma y el boxplot"
|
||||
|
||||
# Key under which eda_llm_insights stores its interpretive block in the profile.
|
||||
LLM_KEY = "llm"
|
||||
|
||||
# Plain-Spanish gloss for every label ``detect_distribution_type`` can emit, so a
|
||||
# non-expert reader understands the shape and the suggested next step (MUST-4.3).
|
||||
_DIST_GLOSS = {
|
||||
@@ -99,6 +110,53 @@ def _numeric_columns(profile: dict) -> list:
|
||||
return out
|
||||
|
||||
|
||||
def _llm_index(profile: dict, ctx: dict) -> dict:
    """Index the LLM dictionary entries by column name.

    The LLM block is looked up under ``LLM_KEY`` in ``profile`` first and, when
    that value is not a dict, under the same key in ``ctx``. Its
    ``"dictionary"`` value must be a list or tuple; every dict item carrying a
    non-``None`` ``"column"`` is stored under ``model._safe_str(column)``.

    Returns:
        ``{column name: entry dict}``. Empty when there is no usable LLM block
        or no usable ``"dictionary"`` sequence, so callers can simply skip the
        per-column description.
    """
    block = profile.get(LLM_KEY)
    if not isinstance(block, dict):
        # Fallback source: the LLM block may travel in the render context.
        block = ctx.get(LLM_KEY)
        if not isinstance(block, dict):
            return {}
    items = block.get("dictionary")
    if not isinstance(items, (list, tuple)):
        return {}
    # Malformed items (non-dict, or without a column) are ignored; a later
    # entry for the same column replaces an earlier one.
    return {
        model._safe_str(item["column"]): item
        for item in items
        if isinstance(item, dict) and item.get("column") is not None
    }
|
||||
|
||||
|
||||
def _llm_desc_unit_block(name: str, llm_index: dict):
    """Build the Markdown block with a column's LLM description and unit.

    Args:
        name: Column name; it is normalised with ``model._safe_str`` before the
            lookup so it matches the keys of ``llm_index``.
        llm_index: Mapping of column name -> LLM dictionary entry.

    Returns:
        A ``model.Markdown`` whose text joins ``**Descripción:** ...`` and/or
        ``**Unidad:** ...`` with `` · ``, or ``None`` when the column has no
        entry or the entry yields neither a description nor a unit.
    """
    entry = llm_index.get(model._safe_str(name))
    if not isinstance(entry, dict):
        return None

    def _clean(raw) -> str:
        # Collapse every whitespace run to one space; falsy values become "".
        return " ".join(model._safe_str(raw).split()) if raw else ""

    # ``description`` wins; ``business_meaning`` is the fallback. The fallback
    # is decided on the *cleaned* text: a whitespace-only description is truthy
    # and would otherwise hide a perfectly valid business_meaning.
    desc = _clean(entry.get("description")) or _clean(entry.get("business_meaning"))
    unit = _clean(entry.get("unit"))

    parts = []
    if desc:
        parts.append(f"**Descripción:** {desc}")
    if unit:
        parts.append(f"**Unidad:** {unit}")
    if not parts:
        return None
    return model.Markdown(text=" · ".join(parts))
|
||||
|
||||
|
||||
def _make_hist_box(name: str, numeric: dict, box: dict):
|
||||
"""Build the histogram (with mean/median/±σ lines) + boxplot figure.
|
||||
|
||||
@@ -271,15 +329,26 @@ def build_num_distr(profile: dict, ctx: dict):
|
||||
if not numerics:
|
||||
return None # chapter does not apply to a dataset with no numerics.
|
||||
|
||||
# Register the "how to read the histogram and boxplot" term in the shared
|
||||
# glossary collector (if present) and mark its first appearance clickable. The
|
||||
# full explanation (colour code, 1,5·IQR rule, asymmetry reading) lives in the
|
||||
# GLOSARIO chapter instead of inline here: the intro only names the term.
|
||||
glossary = ctx.get("glossary")
|
||||
mark_term = False
|
||||
if isinstance(glossary, model.GlossaryCollector):
|
||||
glossary.add(_TERM_HISTOBOX_KEY, _TERM_HISTOBOX_LABEL)
|
||||
mark_term = True
|
||||
como_leer = ("[[term:histograma_boxplot]]cómo leer estos gráficos[[/term]]"
|
||||
if mark_term else "cómo leer estos gráficos")
|
||||
intro = (
|
||||
"Para cada columna numérica se muestra su **histograma** con tres líneas "
|
||||
"de referencia: la **media** (línea roja discontinua), la **mediana** "
|
||||
"(línea verde continua) y la banda **±1σ** (zona sombreada). Debajo, "
|
||||
"alineado al mismo eje, un **boxplot de Tukey**: la caja abarca del "
|
||||
"primer al tercer cuartil (P25–P75), la línea interior es la mediana y "
|
||||
"los bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
|
||||
"valores más allá de las vallas. Comparar media y mediana revela la "
|
||||
"asimetría de la distribución.")
|
||||
"Cada columna numérica muestra su **histograma** (con la **media**, la "
|
||||
"**mediana** y la banda **±1σ**) y, debajo y al mismo eje, su **boxplot "
|
||||
f"de Tukey** — {como_leer}.")
|
||||
|
||||
# Business description + unit per column come from the LLM dictionary
|
||||
# (profile['llm']['dictionary'], matched by column name); absent without
|
||||
# run_llm, in which case the per-column description block is simply omitted.
|
||||
llm_index = _llm_index(profile, ctx)
|
||||
|
||||
blocks = [
|
||||
model.Heading(text=CHAPTER_TITLE, level=1),
|
||||
@@ -293,17 +362,20 @@ def build_num_distr(profile: dict, ctx: dict):
|
||||
box = build_boxplot_stats(numeric) or {}
|
||||
except Exception: # noqa: BLE001 — degrade, never raise.
|
||||
box = {}
|
||||
# Keep the column heading, its figure and its stats note together on the
|
||||
# same page/slide (mejora 3 — keep-together): the renderers measure the
|
||||
# whole Group and move it whole when it would not fit.
|
||||
blocks.append(model.Group(blocks=[
|
||||
model.Heading(text=str(name), level=2),
|
||||
model.Figure(
|
||||
make=_figure_maker(name, numeric, box),
|
||||
caption=f"Distribución de «{name}» — histograma "
|
||||
f"(media/mediana/±σ) y boxplot."),
|
||||
model.Markdown(text=_stats_note(name, numeric, box)),
|
||||
]))
|
||||
# Keep the column heading, its (optional) LLM description, its figure and
|
||||
# its stats note together on the same page/slide (mejora 3 —
|
||||
# keep-together): the renderers measure the whole Group and move it whole
|
||||
# when it would not fit.
|
||||
col_blocks = [model.Heading(text=str(name), level=2)]
|
||||
desc_block = _llm_desc_unit_block(name, llm_index)
|
||||
if desc_block is not None:
|
||||
col_blocks.append(desc_block)
|
||||
col_blocks.append(model.Figure(
|
||||
make=_figure_maker(name, numeric, box),
|
||||
caption=f"Distribución de «{name}» — histograma "
|
||||
f"(media/mediana/±σ) y boxplot."))
|
||||
col_blocks.append(model.Markdown(text=_stats_note(name, numeric, box)))
|
||||
blocks.append(model.Group(blocks=col_blocks))
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -101,7 +101,7 @@ def test_golden_chapter_estructura_y_bloques():
|
||||
|
||||
|
||||
def test_golden_media_mediana_sigma_y_boxplot_presentes():
|
||||
# The intro documents the three reference lines and the Tukey boxplot; the
|
||||
# The short intro names the three reference lines and the Tukey boxplot; the
|
||||
# per-column note carries the actual mean/median/σ numbers and the shape.
|
||||
ch = build_num_distr(_profile(n_numeric=1, extra_categorical=False), {})
|
||||
md_texts = " ".join(b.text for b in _flatten(ch.blocks)
|
||||
@@ -110,10 +110,58 @@ def test_golden_media_mediana_sigma_y_boxplot_presentes():
|
||||
assert "±1σ" in md_texts or "σ" in md_texts
|
||||
assert "boxplot" in md_texts.lower()
|
||||
assert "Tukey" in md_texts
|
||||
# The long "how to read it" explanation moved to the glossary: the colour-code
|
||||
# / 1,5·IQR walkthrough is no longer inline in the chapter body.
|
||||
assert "1,5·IQR" not in md_texts
|
||||
assert "línea roja" not in md_texts
|
||||
# distribution_type gloss surfaced for the column (right-skewed preset).
|
||||
assert _DIST_GLOSS["right-skewed"].split(";")[0][:20] in md_texts
|
||||
|
||||
|
||||
def test_glosario_histograma_boxplot_clicable_y_definicion():
    # Given a glossary collector, the chapter intro must carry the clickable
    # term marker, and the glossary chapter must hold the complete explanation
    # (the long paragraph that no longer lives in the chapter body).
    from datascience.automatic_eda.chapters.glosario import build_glosario

    collector = model.GlossaryCollector()
    profile = _profile(n_numeric=1, extra_categorical=False)

    chapter = build_num_distr(profile, {"glossary": collector})
    first_markdown = next(blk for blk in chapter.blocks if blk.kind == "markdown")
    assert "[[term:histograma_boxplot]]" in first_markdown.text
    assert collector.has("histograma_boxplot")

    glossary_chapter = build_glosario(profile, {"glossary": collector})
    entry = next(
        blk for blk in glossary_chapter.blocks
        if getattr(blk, "kind", "") == "glossary_entry"
        and blk.key == "histograma_boxplot"
    )
    definition = entry.definition
    assert "boxplot" in definition.lower()
    assert "1,5·IQR" in definition
|
||||
|
||||
|
||||
def test_llm_descripcion_y_unidad_por_columna():
    # When the profile carries an LLM dictionary, every numeric column with a
    # matching name exposes its business description and unit in a markdown
    # block of its own.
    precio_entry = {"column": "precio",
                    "description": "Precio de venta del producto",
                    "unit": "EUR"}
    alcohol_entry = {"column": "alcohol",
                     "business_meaning": "Grado alcohólico",
                     "unit": "% vol"}
    profile = _profile(n_numeric=2)
    profile["llm"] = {"dictionary": [precio_entry, alcohol_entry]}

    chapter = build_num_distr(profile, {})
    markdown_text = " ".join(
        blk.text for blk in _flatten(chapter.blocks) if blk.kind == "markdown")

    for expected in ("Precio de venta", "EUR", "Grado alcohólico", "% vol"):
        assert expected in markdown_text
|
||||
|
||||
|
||||
def test_edge_sin_llm_no_anade_descripcion():
    # No LLM block in the profile: the per-column description markdown must
    # not be emitted at all.
    chapter = build_num_distr(_profile(n_numeric=2), {})
    markdown_texts = [
        blk.text for blk in _flatten(chapter.blocks) if blk.kind == "markdown"
    ]
    assert "Descripción" not in " ".join(markdown_texts)
|
||||
|
||||
|
||||
def test_boxplot_stats_se_consumen_del_registry():
|
||||
# The chapter must feed build_boxplot_stats (group eda) and the resulting
|
||||
# box must carry the Tukey fences for the figure.
|
||||
|
||||
@@ -7,11 +7,21 @@ as needed, the renderers paginate):
|
||||
NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
|
||||
``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
|
||||
an honest placeholder documenting the missing key instead of inventing data.
|
||||
2. Column dictionary — name / type / nulls / non-null examples. Examples come
|
||||
2. Column dictionary — name / type / nulls / non-null examples plus, when the
|
||||
LLM layer ran, the business **description** and **unit** of each column so the
|
||||
reader knows at a glance what every column is and in which unit. Examples come
|
||||
from ``columns[i]['examples']`` when present; otherwise they are derived from
|
||||
real non-null profile values (categorical top values, numeric min/median/max)
|
||||
so the cell is never empty nor fabricated.
|
||||
3. ``df.describe`` — mean / median / min / max / std for every numeric column.
|
||||
3. ``df.describe`` — mean / median / min / max / std for every numeric column,
|
||||
plus its **unit** (same LLM source) so the stats read in context.
|
||||
|
||||
The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
|
||||
``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
|
||||
of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
|
||||
only **consumes** it, matching by column name; it never calls the LLM nor
|
||||
recomputes anything. When the block is absent (``run_llm`` did not run) those
|
||||
cells degrade to ``"—"`` and the tables still render.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
@@ -20,13 +30,59 @@ from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "overview"
|
||||
CHAPTER_TITLE = "Overview"
|
||||
|
||||
# Profile/ctx keys the calculation phase must add for a full head + examples.
|
||||
HEAD_KEY = "head_rows" # list[dict] — df.head(n)
|
||||
EXAMPLES_KEY = "examples" # per column: list of non-null sample values
|
||||
LLM_KEY = "llm" # interpretive block from eda_llm_insights
|
||||
|
||||
|
||||
def _llm_dict_index(profile: dict, ctx: dict) -> dict:
    """Build a lookup from column name to its LLM dictionary entry.

    The LLM block is taken from ``profile[LLM_KEY]`` when that value is a dict,
    otherwise from ``ctx[LLM_KEY]``. Its ``"dictionary"`` value has to be a list
    or tuple; each dict item with a non-``None`` ``"column"`` is registered
    under ``model._safe_str(column)``.

    Returns:
        ``{column name: entry dict}``; an empty dict when no usable LLM block or
        ``"dictionary"`` sequence is found, in which case callers render "—".
    """
    from_profile = profile.get(LLM_KEY)
    llm_block = from_profile if isinstance(from_profile, dict) else ctx.get(LLM_KEY)
    dictionary = llm_block.get("dictionary") if isinstance(llm_block, dict) else None
    if not isinstance(dictionary, (list, tuple)):
        return {}

    by_column: dict = {}
    for item in dictionary:
        # Non-dict items and items without a column name are skipped.
        column = item.get("column") if isinstance(item, dict) else None
        if column is not None:
            by_column[model._safe_str(column)] = item
    return by_column
|
||||
|
||||
|
||||
def _llm_desc(entry) -> str:
    """Return the business description of a column from its LLM entry, or "—".

    ``description`` is preferred. ``business_meaning`` is used when the
    description is missing, falsy, or collapses to an empty string once its
    whitespace is normalised. Values are stringified with ``model._safe_str``
    and every whitespace run is collapsed to a single space.

    Args:
        entry: The LLM dictionary entry of the column; anything that is not a
            dict yields "—".

    Returns:
        The cleaned description text, or "—" when none is available.
    """
    if not isinstance(entry, dict):
        return "—"
    primary = entry.get("description")
    text = " ".join(model._safe_str(primary).split()) if primary else ""
    if not text:
        # A whitespace-only description is truthy, so a plain
        # ``description or business_meaning`` would never reach the fallback
        # and the cell would degrade to "—" despite usable text being present.
        fallback = entry.get("business_meaning")
        if fallback is not None:
            text = " ".join(model._safe_str(fallback).split())
    return text or "—"
|
||||
|
||||
|
||||
def _llm_unit(entry) -> str:
    """Return the unit stored in a column's LLM entry, or "—" when unavailable.

    The ``"unit"`` value is stringified with ``model._safe_str`` and its
    whitespace runs are collapsed to single spaces. A non-dict entry, a
    ``None`` unit or a unit that ends up empty all produce "—".
    """
    if isinstance(entry, dict):
        unit = entry.get("unit")
        if unit is not None:
            cleaned = " ".join(model._safe_str(unit).split())
            if cleaned:
                return cleaned
    return "—"
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
@@ -104,9 +160,12 @@ def _head_block(profile: dict, ctx: dict):
|
||||
"pasarlo en ctx['head_rows'] para mostrar las primeras filas.")
|
||||
|
||||
|
||||
def _columns_block(profile: dict):
|
||||
def _columns_block(profile: dict, llm_index: dict):
|
||||
cols = profile.get("columns") or []
|
||||
header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"]
|
||||
# Descripción / Unidad come from the LLM dictionary (matched by column name);
|
||||
# they read "—" when run_llm did not run, so the table always renders.
|
||||
header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
|
||||
"Descripción", "Unidad"]
|
||||
rows = []
|
||||
for c in cols:
|
||||
if not isinstance(c, dict):
|
||||
@@ -126,15 +185,18 @@ def _columns_block(profile: dict):
|
||||
nulls = str(null_count)
|
||||
else:
|
||||
nulls = "—"
|
||||
rows.append([name, ctype, nulls, _examples_for(c)])
|
||||
entry = llm_index.get(model._safe_str(name))
|
||||
rows.append([name, ctype, nulls, _examples_for(c),
|
||||
_llm_desc(entry), _llm_unit(entry)])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(header=header, rows=rows, title="Columnas")
|
||||
|
||||
|
||||
def _describe_block(profile: dict):
|
||||
def _describe_block(profile: dict, llm_index: dict):
|
||||
cols = profile.get("columns") or []
|
||||
header = ["Columna", "mean", "median", "min", "max", "std"]
|
||||
# "Unidad" (LLM source) lets the reader know in which unit each stat is.
|
||||
header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
|
||||
rows = []
|
||||
for c in cols:
|
||||
if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
|
||||
@@ -142,13 +204,16 @@ def _describe_block(profile: dict):
|
||||
num = c.get("numeric") or {}
|
||||
if not num:
|
||||
continue
|
||||
name = c.get("name") or "(col)"
|
||||
entry = llm_index.get(model._safe_str(name))
|
||||
rows.append([
|
||||
c.get("name") or "(col)",
|
||||
name,
|
||||
_fmt_num(num.get("mean")),
|
||||
_fmt_num(num.get("median")),
|
||||
_fmt_num(num.get("min")),
|
||||
_fmt_num(num.get("max")),
|
||||
_fmt_num(num.get("std")),
|
||||
_llm_unit(entry),
|
||||
])
|
||||
if not rows:
|
||||
return None
|
||||
@@ -163,16 +228,18 @@ def build_overview(profile: dict, ctx: dict):
|
||||
if not cols and not (ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
|
||||
return None
|
||||
|
||||
llm_index = _llm_dict_index(profile, ctx)
|
||||
|
||||
blocks = [
|
||||
model.Heading(text="Primeras filas (df.head)", level=2),
|
||||
_head_block(profile, ctx),
|
||||
]
|
||||
cols_block = _columns_block(profile)
|
||||
cols_block = _columns_block(profile, llm_index)
|
||||
if cols_block is not None:
|
||||
blocks.append(model.Heading(
|
||||
text="Diccionario de columnas", level=2))
|
||||
blocks.append(cols_block)
|
||||
desc_block = _describe_block(profile)
|
||||
desc_block = _describe_block(profile, llm_index)
|
||||
if desc_block is not None:
|
||||
blocks.append(model.Heading(
|
||||
text="Resumen estadístico numérico", level=2))
|
||||
|
||||
@@ -56,7 +56,21 @@ def _head_rows() -> list:
|
||||
]
|
||||
|
||||
|
||||
def _profile(with_head: bool = True) -> dict:
|
||||
def _llm() -> dict:
|
||||
"""Interpretive block as eda_llm_insights stores it under profile['llm']."""
|
||||
return {
|
||||
"summary": "Pasajeros del Titanic.",
|
||||
"dictionary": [
|
||||
{"column": "PassengerId", "description": "Identificador del pasajero",
|
||||
"business_meaning": "Clave única de cada pasajero", "unit": "id"},
|
||||
{"column": "Pclass", "description": "Clase del billete",
|
||||
"business_meaning": "Clase socioeconómica", "unit": "clase (1-3)"},
|
||||
# No entry for Survived/Name/Sex on purpose -> they degrade to "—".
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _profile(with_head: bool = True, with_llm: bool = False) -> dict:
|
||||
prof = {
|
||||
"table": "titanic",
|
||||
"source": "/data/titanic.csv",
|
||||
@@ -68,6 +82,8 @@ def _profile(with_head: bool = True) -> dict:
|
||||
}
|
||||
if with_head:
|
||||
prof["head_rows"] = _head_rows()
|
||||
if with_llm:
|
||||
prof["llm"] = _llm()
|
||||
return prof
|
||||
|
||||
|
||||
@@ -185,3 +201,70 @@ def test_edge_none_y_vacio_no_rompen():
|
||||
assert ch is not None
|
||||
tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
|
||||
assert tables and len(tables[0].rows) == 3
|
||||
|
||||
|
||||
def _table_by_header(blocks, marker: str):
    """Find the first ``DataTable`` (in ``_flatten`` order) whose header lists ``marker``.

    Returns ``None`` when no table matches.
    """
    candidates = (
        blk for blk in _flatten(blocks)
        if isinstance(blk, DataTable) and marker in blk.header
    )
    return next(candidates, None)
|
||||
|
||||
|
||||
def test_golden_diccionario_lleva_descripcion_y_unidad_del_llm():
    # With the LLM block present, the column dictionary table exposes the
    # "Descripción" and "Unidad" columns, filled from
    # profile['llm']['dictionary'] and matched by column name.
    chapter = build_overview(_profile(with_llm=True), {})
    assert chapter is not None

    table = _table_by_header(chapter.blocks, "Descripción")
    assert table is not None
    expected_header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
                       "Descripción", "Unidad"]
    assert table.header == expected_header

    rows = {r[0]: r for r in table.rows}
    # PassengerId has an LLM entry, so both cells are populated.
    passenger = rows["PassengerId"]
    assert passenger[4] == "Identificador del pasajero"
    assert passenger[5] == "id"
    assert rows["Pclass"][5] == "clase (1-3)"
    # A column without an LLM entry falls back to "—" in both cells.
    survived = rows["Survived"]
    assert survived[4] == "—" and survived[5] == "—"
|
||||
|
||||
|
||||
def test_golden_describe_lleva_unidad_del_llm():
    # The numeric summary table ends with a "Unidad" column fed by the LLM
    # dictionary; a numeric column without an LLM unit still renders with "—".
    chapter = build_overview(_profile(with_llm=True), {})
    stats = _table_by_header(chapter.blocks, "std")
    assert stats is not None
    assert stats.header[-1] == "Unidad"

    rows = {r[0]: r for r in stats.rows}
    expected_units = (
        ("PassengerId", "id"),
        ("Pclass", "clase (1-3)"),
        ("Survived", "—"),
    )
    for column, unit in expected_units:
        assert rows[column][-1] == unit
|
||||
|
||||
|
||||
def test_edge_sin_llm_descripcion_unidad_son_guion():
    # Profile without any 'llm' key: the description/unit cells fall back to
    # "—" in both tables and the chapter still builds.
    chapter = build_overview(_profile(), {})
    assert chapter is not None

    dictionary = _table_by_header(chapter.blocks, "Unidad")
    assert dictionary is not None
    assert all(r[4] == "—" and r[5] == "—" for r in dictionary.rows)

    stats = _table_by_header(chapter.blocks, "std")
    for r in stats.rows:
        assert r[-1] == "—"
|
||||
|
||||
|
||||
def test_golden_llm_via_ctx_tambien_funciona():
    # Fallback path: the LLM block is supplied through ctx['llm'] instead of
    # the profile and must be consumed just the same.
    context = {"llm": _llm()}
    chapter = build_overview(_profile(with_llm=False), context)
    table = _table_by_header(chapter.blocks, "Descripción")
    rows = {r[0]: r for r in table.rows}
    passenger = rows["PassengerId"]
    assert passenger[5] == "id"
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_descripcion_y_unidad():
    # Render the PDF into a throwaway directory and check that the new column
    # titles and one LLM description show up in the extracted text.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "eda.pdf")
        render_automatic_eda_pdf(_profile(with_llm=True), pdf_path,
                                 {"title": "EDA"})
        # The text must be extracted while the temporary directory still exists.
        extracted = _pdf_text(pdf_path)
    for needle in ("Descripción", "Unidad", "Identificador del pasajero"):
        assert needle in extracted
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
---
|
||||
id: categorical_top_bar_figure_py_datascience
|
||||
name: categorical_top_bar_figure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def categorical_top_bar_figure(top: list, n_distinct: int = 0, title: str = \"\", top_k: int = 6, n_rows=None) -> \"matplotlib.figure.Figure\""
|
||||
description: "Construye una figura matplotlib de barras horizontales de las top_k categorías más frecuentes de una columna categórica, con la mayor arriba y agregando el resto en una barra gris \"Otros (N categorías)\". Contrato de entrada idéntico a categorical_top_pie_figure (swap directo donut↔barras): consume el bloque `top` de summarize_categorical y devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA. Backend Agg sin pyplot global; defensivo total ante top vacío/None, nunca lanza."
|
||||
tags: [eda, categorical, bar, barh, matplotlib, figure, visualization, datascience, impure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [matplotlib]
|
||||
example: |
|
||||
from categorical_top_bar_figure import categorical_top_bar_figure
|
||||
top = [
|
||||
{"value": "rojo", "count": 40, "pct": 0.4},
|
||||
{"value": "azul", "count": 30, "pct": 0.3},
|
||||
{"value": "verde", "count": 20, "pct": 0.2},
|
||||
]
|
||||
fig = categorical_top_bar_figure(top, n_distinct=12, title="color", top_k=6, n_rows=100)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_returns_figure"
|
||||
- "test_ten_items_topk_six_yields_seven_bars"
|
||||
- "test_empty_top_does_not_raise_and_returns_figure"
|
||||
- "test_long_value_truncated"
|
||||
- "test_none_value_and_none_count_are_handled"
|
||||
- "test_n_rows_adds_exact_others_bar"
|
||||
test_file_path: "python/functions/datascience/categorical_top_bar_figure_test.py"
|
||||
file_path: "python/functions/datascience/categorical_top_bar_figure.py"
|
||||
params:
|
||||
- name: top
|
||||
desc: "Lista de dicts {value, count, pct} ordenada de mayor a menor por count (salida del bloque `top` de summarize_categorical). Puede venir vacía o con dicts incompletos: items no-dict, sin count, con count None o count <= 0 se descartan. value None se admite (etiqueta vacía)."
|
||||
- name: n_distinct
|
||||
desc: "Nº total de categorías distintas de la columna. Etiqueta la barra agregada como \"Otros (n_distinct - top_k)\" (mínimo 0). Si no supera el nº de barras mostradas, se usa el overflow real de `top` como nº de categorías agregadas. Default 0."
|
||||
- name: title
|
||||
desc: "Título de la figura (nombre de la columna). Se trunca a ~48 chars con elipsis si es muy largo. Default \"\" (sin título)."
|
||||
- name: top_k
|
||||
desc: "Nº máximo de barras explícitas. Default 6. La barra \"Otros\" no cuenta contra este límite. Con top_k <= 0 se muestra al menos la categoría mayor."
|
||||
- name: n_rows
|
||||
desc: "Opcional. Total de filas del dataset. Si se da y la suma de counts mostrados < n_rows, la barra \"Otros\" usa (n_rows - suma_mostrada) como count para que sea exacta respecto al total real. Si se omite, \"Otros\" usa la suma de counts fuera del top_k mostrado (solo cuando top trae más de top_k items). Default None."
|
||||
output: "Un matplotlib.figure.Figure (figsize 6.4 x altura escalada con el nº de barras, dpi 150) con un Axes de barras horizontales: la categoría más frecuente arriba, la barra gris \"Otros (N categorías)\" abajo, cada barra anotada con su conteo y porcentaje al final y etiquetas de categoría (yticklabels) truncadas a ~22 chars. Si no hay counts válidos devuelve igualmente una Figure con un texto centrado \"sin datos categóricos\" (nunca lanza); cualquier error inesperado cae a una Figure con el texto del error. El caller rasteriza/cierra la figura; la función no la muestra ni la guarda."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from categorical_top_bar_figure import categorical_top_bar_figure
|
||||
|
||||
# `top` es la salida del bloque "top" de summarize_categorical (ya ordenado desc).
|
||||
top = [
|
||||
{"value": "rojo", "count": 40, "pct": 0.40},
|
||||
{"value": "azul", "count": 30, "pct": 0.30},
|
||||
{"value": "verde", "count": 20, "pct": 0.20},
|
||||
{"value": "amarillo", "count": 5, "pct": 0.05},
|
||||
]
|
||||
|
||||
fig = categorical_top_bar_figure(
|
||||
top,
|
||||
n_distinct=12, # 12 categorías distintas en total
|
||||
title="color_producto",
|
||||
top_k=6, # hasta 6 barras explícitas
|
||||
n_rows=100, # "Otros" = 100 - 95 = 5, sobre 8 categorías agregadas
|
||||
)
|
||||
|
||||
# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
|
||||
fig.savefig("/tmp/barras_color.png")
|
||||
```
|
||||
|
||||
## Cuándo usarla
|
||||
|
||||
Úsala dentro de un informe EDA cuando quieras comparar **magnitudes** de las
|
||||
categorías dominantes de una columna categórica: qué categoría manda y por
|
||||
cuánto frente a las siguientes. Pásale directamente el bloque `top` de
|
||||
`summarize_categorical` (ya ordenado de mayor a menor) más `n_distinct` para que
|
||||
la barra "Otros" indique cuántas categorías quedan agrupadas. Es el clon "de
|
||||
barras" del donut `categorical_top_pie_figure` con **contrato de entrada
|
||||
idéntico**: puedes intercambiar una por otra sin tocar el caller. Elige barras
|
||||
cuando importe comparar tamaños exactos; el donut cuando importe la proporción
|
||||
del total.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
|
||||
y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
|
||||
para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
|
||||
es thread-safe; esta función evita ese riesgo construyendo el `Figure`
|
||||
directamente, así que es segura de llamar en bucle desde el renderer.
|
||||
- **El caller cierra la figura.** La función devuelve el `Figure` pero no lo
|
||||
muestra ni lo guarda. Quien la consume debe rasterizarla y luego liberarla
|
||||
(`fig.clf()` / `matplotlib.pyplot.close(fig)` si se usó pyplot en el caller)
|
||||
para no acumular memoria en lotes grandes de columnas.
|
||||
- **`barh` dibuja de abajo arriba.** La categoría más frecuente va arriba porque
|
||||
el orden de display se invierte antes de plotear; la barra "Otros" queda
|
||||
siempre al fondo. No reordenes `top` esperando otro layout: la función asume
|
||||
que ya viene ordenado desc por count.
|
||||
- **Magnitud exacta de "Otros" solo con `n_rows`.** Sin `n_rows`, la barra
|
||||
"Otros" se calcula con el overflow presente en `top`; si `top` ya viene
|
||||
recortado a `top_k` por el productor, no habrá "Otros" aunque existan más
|
||||
categorías. Pasa `n_rows` (total de filas del dataset) para una barra correcta
|
||||
respecto al total real.
|
||||
- **Defensiva, nunca lanza.** `top=[]`, `value=None`, `count=None` o counts no
|
||||
numéricos se manejan sin error: en el peor caso devuelve una `Figure` con
|
||||
"sin datos categóricos", y cualquier excepción inesperada cae a una `Figure`
|
||||
con el texto del error. No envuelvas la llamada en try/except por miedo a un
|
||||
raise — no lo hay.
|
||||
@@ -0,0 +1,233 @@
|
||||
"""Impure EDA helper: horizontal bar figure of the most common categories (`eda` group).
|
||||
|
||||
Builds a horizontal bar chart of the ``top_k`` most frequent categories of a
|
||||
categorical column, folding everything else into a single gray
|
||||
"Otros (N categorías)" bar. The most frequent category sits at the top, each bar
|
||||
labelled with its count (and percentage) at the end. Returns a ready-to-rasterize
|
||||
``matplotlib.figure.Figure``; it never shows nor saves it.
|
||||
|
||||
This is the "magnitude" twin of ``categorical_top_pie_figure``: identical input
|
||||
contract (same ``top``/``n_distinct``/``title``/``top_k``/``n_rows`` signature) so
|
||||
it can be swapped in directly, but it communicates comparable magnitudes via bars
|
||||
instead of proportions via wedges.
|
||||
|
||||
Impure because it touches matplotlib's rendering machinery. It uses the headless
|
||||
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
|
||||
global state and is safe to call repeatedly from a report renderer.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
|
||||
# Gray reserved for the aggregated "Otros" bar.
_OTHER_COLOR = "#9e9e9e"
# Muted gray for secondary text (title fallback, no-data message).
_MUTED_TEXT = "#5f6b7a"
# Soft red for the error fallback message.
_ERROR_TEXT = "#b00020"
# Pleasant, colour-blind-friendly qualitative palette for the explicit bars.
# Ten entries, listed in the order they are declared here.
# NOTE(review): the eighth entry ("#8C8C8C") is a gray very close to
# _OTHER_COLOR ("#9e9e9e"); if the palette is indexed by bar position and
# top_k >= 8, an explicit bar could be hard to tell apart from the "Otros"
# bar — confirm against the plotting code.
_PALETTE = [
    "#4C72B0",
    "#DD8452",
    "#55A868",
    "#C44E52",
    "#8172B3",
    "#937860",
    "#DA8BC3",
    "#8C8C8C",
    "#CCB974",
    "#64B5CD",
]
|
||||
|
||||
|
||||
def _truncate(text, width: int = 22) -> str:
|
||||
"""Truncate ``text`` to ``width`` chars, appending an ellipsis if cut."""
|
||||
s = "" if text is None else str(text)
|
||||
if len(s) <= width:
|
||||
return s
|
||||
if width <= 1:
|
||||
return s[:width]
|
||||
return s[: width - 1] + "…"
|
||||
|
||||
|
||||
def _message_figure(message: str, color: str = _MUTED_TEXT, title: str = "") -> "Figure":
    """Build the fallback ``Figure``: one centered message on a hidden axes.

    Args:
        message: Text drawn at the center of the axes (wrapping enabled).
        color: Text colour; defaults to the muted gray.
        title: Optional title, truncated to 48 chars; skipped when empty.

    Returns:
        A 6.4 x 4.0 in, 150 dpi ``Figure`` with ``tight_layout`` applied.
    """
    canvas = Figure(figsize=(6.4, 4.0), dpi=150)
    axes = canvas.add_subplot(111)
    axes.axis("off")
    # Axes-fraction coordinates keep the message centered.
    text_style = dict(
        ha="center",
        va="center",
        fontsize=12,
        color=color,
        wrap=True,
        transform=axes.transAxes,
    )
    axes.text(0.5, 0.5, message, **text_style)
    if title:
        axes.set_title(_truncate(title, 48), fontsize=12, loc="center", pad=8)
    canvas.tight_layout()
    return canvas
|
||||
|
||||
|
||||
def categorical_top_bar_figure(
    top: list,
    n_distinct: int = 0,
    title: str = "",
    top_k: int = 6,
    n_rows=None,
) -> "matplotlib.figure.Figure":
    """Build a horizontal bar figure of the most common categories of a column.

    Renders up to ``top_k`` categories as explicit horizontal bars, first item of
    ``top`` at the top, and folds everything else into a single gray
    "Otros (N categorías)" bar at the bottom. Each bar is annotated at its end
    with its count and its percentage of the sum of all drawn bars; category
    names are truncated Y tick labels.

    The function never raises: malformed items are skipped, malformed scalar
    arguments fall back to neutral values, a call with nothing valid to draw
    returns a figure carrying a centered "sin datos categóricos" message, and any
    unexpected error is turned into a fallback figure carrying the error text.

    Args:
        top: List of ``{value, count, pct}`` dicts, expected to be sorted by
            ``count`` descending (the order is used as given, not re-sorted).
            Non-dict items and items whose ``count`` is missing, ``None``,
            non-numeric, non-finite (NaN/inf) or not positive are skipped. A
            non-list ``top`` is treated as empty.
        n_distinct: Total number of distinct categories in the column. The
            aggregated bar is labelled with ``n_distinct`` minus the number of
            shown bars; when that is not positive, the number of valid surplus
            items in ``top`` is used instead. Unparseable values count as 0.
        title: Figure title (the column name), truncated to 48 characters.
        top_k: Maximum number of explicit bars. Values <= 0 still show the first
            valid category; unparseable values fall back to 6. The "Otros" bar
            does not count against this limit.
        n_rows: Optional total row count. When it is finite and greater than the
            sum of the shown counts, "Otros" uses ``n_rows - sum_shown`` as its
            count. Otherwise "Otros" uses the summed counts of the valid items
            of ``top`` that were not shown.

    Returns:
        A ``matplotlib.figure.Figure`` with a single Axes. The "Otros" bar is
        only drawn when both its count and its category count are positive.
    """
    try:
        safe_title = _truncate(title, 48)
        infinity = float("inf")

        # --- Defensive parse: keep only {value, count} with a usable count.
        cleaned = []
        if isinstance(top, list):
            for item in top:
                if not isinstance(item, dict):
                    continue
                try:
                    # float(None) raises TypeError, so a missing count is
                    # rejected here together with non-numeric ones.
                    count = float(item.get("count"))
                except (TypeError, ValueError, OverflowError):
                    continue
                # NaN fails every comparison and inf has no drawable length, so
                # this chained test keeps only finite, strictly positive counts.
                if not 0 < count < infinity:
                    continue
                cleaned.append((item.get("value"), count))

        if not cleaned:
            return _message_figure("sin datos categóricos", title=title)

        # --- Split into shown bars and the aggregated remainder.
        try:
            limit = max(int(top_k), 0)
        except (TypeError, ValueError, OverflowError):
            limit = 6  # Same as the signature default.
        # top_k <= 0 would show nothing; show at least the first category.
        shown = cleaned[:limit] or cleaned[:1]
        hidden = cleaned[len(shown):]

        sum_shown = sum(c for _, c in shown)
        overflow_count = sum(c for _, c in hidden)

        # How many categories are folded into "Otros".
        try:
            nd = int(n_distinct)
        except (TypeError, ValueError, OverflowError):
            nd = 0
        # If n_distinct is unknown/too small, fall back to the surplus items
        # actually present in `top` beyond the shown bars.
        others_categories = max(nd - len(shown), 0) or len(hidden)

        # Count attributed to the "Otros" bar.
        others_count = 0.0
        if n_rows is not None:
            try:
                total_rows = float(n_rows)
            except (TypeError, ValueError, OverflowError):
                total_rows = None
            # Reject NaN/inf totals: they would produce an undrawable bar.
            if total_rows is not None and sum_shown < total_rows < infinity:
                others_count = total_rows - sum_shown
        if others_count <= 0:
            others_count = overflow_count

        # --- Display order (top to bottom): first shown .. last shown, Otros.
        labels = [_truncate(v, 22) for v, _ in shown]
        values = [c for _, c in shown]
        colors = [_PALETTE[i % len(_PALETTE)] for i in range(len(shown))]

        if others_count > 0 and others_categories > 0:
            labels.append(f"Otros ({others_categories} categorías)")
            values.append(others_count)
            colors.append(_OTHER_COLOR)

        total = sum(values) or 1.0

        # barh draws bottom-up, so reverse the display order before plotting to
        # land the first category on top and "Otros" at the bottom.
        labels.reverse()
        values.reverse()
        colors.reverse()
        y_pos = list(range(len(values)))

        # Height scales with the number of bars, clamped to [2.4, 14.0] inches.
        height = max(2.4, min(0.4 * len(values) + 1.2, 14.0))
        fig = Figure(figsize=(6.4, height), dpi=150)
        ax = fig.add_subplot(111)

        ax.barh(y_pos, values, color=colors, edgecolor="white")
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlabel("conteo", fontsize=9)

        # `values` is non-empty and strictly positive here. The extra 18% of
        # X range leaves room for the annotation after the longest bar.
        max_val = max(values)
        ax.set_xlim(0, max_val * 1.18)

        # Annotate each bar with its count and percentage at the end of the bar.
        for y, val in zip(y_pos, values):
            pct = val / total * 100.0
            ax.text(
                val + max_val * 0.012,
                y,
                f"{int(round(val))} ({pct:.0f}%)",
                va="center",
                ha="left",
                fontsize=7,
                color="#202020",
            )

        if safe_title:
            ax.set_title(safe_title, fontsize=13, loc="left", pad=10)

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        # The title is deliberately not forwarded: if converting the title is
        # what failed, passing it on would raise again inside the fallback.
        return _message_figure(
            f"error al dibujar barras: {exc}", color=_ERROR_TEXT
        )
|
||||
@@ -0,0 +1,103 @@
|
||||
"""Tests para categorical_top_bar_figure (barras de categorías top, grupo eda).
|
||||
|
||||
Usa el backend Agg sin pyplot; no muestra ni guarda figuras. Cada test cierra
|
||||
explícitamente la Figure construida (matplotlib.pyplot.close) para no acumular
|
||||
estado entre tests.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
from categorical_top_bar_figure import categorical_top_bar_figure
|
||||
|
||||
|
||||
def _make_top(n):
    """Build ``n`` ``{value, count, pct}`` items sorted by ``count`` descending.

    Item ``i`` is named ``cat_i`` and has count ``n - i``; ``pct`` is that count
    as a fraction of the sum of all counts. ``n <= 0`` yields an empty list.
    """
    # Sum of all counts (1 + 2 + ... + n), computed once instead of per item.
    total = sum(range(1, n + 1))
    return [
        {"value": f"cat_{i}", "count": n - i, "pct": (n - i) / total}
        for i in range(n)
    ]
|
||||
|
||||
|
||||
def _bar_count(ax):
    """Return the number of bars on ``ax``.

    That is the length of the first entry of ``ax.containers``, or 0 when the
    Axes has no containers at all.
    """
    containers = ax.containers
    return len(containers[0]) if containers else 0
|
||||
|
||||
|
||||
def test_returns_figure():
    """A well-formed ``top`` list produces a matplotlib ``Figure``."""
    items = _make_top(3)
    built = categorical_top_bar_figure(items, n_distinct=3, title="col")
    assert isinstance(built, Figure)
    plt.close(built)
|
||||
|
||||
|
||||
def test_ten_items_topk_six_yields_seven_bars():
    """Ten categories with ``top_k=6`` draw six explicit bars plus "Otros"."""
    built = categorical_top_bar_figure(
        _make_top(10), n_distinct=10, title="muchas", top_k=6
    )
    (axes, *_) = built.axes
    # 6 explicit categories + 1 aggregated "Otros" bar.
    assert _bar_count(axes) == 7
    plt.close(built)
|
||||
|
||||
|
||||
def test_empty_top_does_not_raise_and_returns_figure():
    """An empty ``top`` still returns a ``Figure``, and it holds no bars."""
    built = categorical_top_bar_figure([], n_distinct=0, title="vacía")
    assert isinstance(built, Figure)
    # No data: the Axes must not contain any bar.
    bars = _bar_count(built.axes[0])
    assert bars == 0
    plt.close(built)
|
||||
|
||||
|
||||
def test_long_value_truncated():
    """An over-long category name shows up as an ellipsized tick label."""
    long_value = "una_categoria_con_un_nombre_larguisimo_que_excede_el_limite"
    items = [
        {"value": name, "count": 10, "pct": 0.5}
        for name in (long_value, "corta")
    ]
    built = categorical_top_bar_figure(items, n_distinct=2, title="col", top_k=6)
    tick_texts = [label.get_text() for label in built.axes[0].get_yticklabels()]
    # The long value is present in truncated form (with an ellipsis) and NOT in
    # its full form.
    assert any("…" in text for text in tick_texts)
    assert long_value not in " ".join(tick_texts)
    plt.close(built)
|
||||
|
||||
|
||||
def test_none_value_and_none_count_are_handled():
    """A ``None`` value is still drawn; an item with a ``None`` count is dropped."""
    kept_none_value = {"value": None, "count": 5, "pct": 0.5}
    dropped_none_count = {"value": "b", "count": None, "pct": 0.0}
    kept_plain = {"value": "c", "count": 5, "pct": 0.5}
    built = categorical_top_bar_figure(
        [kept_none_value, dropped_none_count, kept_plain],
        n_distinct=2,
        title="con nones",
        top_k=6,
    )
    assert isinstance(built, Figure)
    # Only 2 valid items and no overflow -> 2 bars, no "Otros".
    assert _bar_count(built.axes[0]) == 2
    plt.close(built)
|
||||
|
||||
|
||||
def test_n_rows_adds_exact_others_bar():
    """With ``n_rows`` given, "Otros" carries exactly the rows not shown."""
    # The 3 shown categories add up to 30 and the dataset has 100 rows, so
    # "Otros" must be 70.
    top = [
        {"value": "a", "count": 15, "pct": 0.15},
        {"value": "b", "count": 10, "pct": 0.10},
        {"value": "c", "count": 5, "pct": 0.05},
    ]
    fig = categorical_top_bar_figure(
        top, n_distinct=20, title="col", top_k=3, n_rows=100
    )
    ax = fig.axes[0]
    # 3 explicit bars + Otros.
    assert _bar_count(ax) == 4
    tick_texts = [t.get_text() for t in ax.get_yticklabels()]
    # The Otros bar reports n_distinct - top_k = 17 categories.
    assert "Otros (17 categorías)" in tick_texts
    # Match the whole annotation: a bare "70" substring would also be satisfied
    # by the percentage part ("70%") and so would not pin the count.
    annotation_texts = [t.get_text() for t in ax.texts]
    assert "70 (70%)" in annotation_texts
    plt.close(fig)
|
||||
Reference in New Issue
Block a user