diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr.py b/python/functions/datascience/automatic_eda/chapters/cat_distr.py index b722c68a..731aa7b7 100644 --- a/python/functions/datascience/automatic_eda/chapters/cat_distr.py +++ b/python/functions/datascience/automatic_eda/chapters/cat_distr.py @@ -5,28 +5,32 @@ page (PDF) / slide (PPTX)**: every column is wrapped in a keep-together ``model.Group`` with ``page_break_before=True`` (except the first, which may share the intro's page), so its chart sits next to its tables and no column is split. -A short intro names the clickable **[[term:entropia]]entropía[[/term]]** term — -the full definition lives in the GLOSARIO chapter, so it is NOT repeated inline -here (one click jumps to the glossary entry). The intro also carries the dataset -row total used as a comparison baseline. +Per column the Group is laid out ``side_by_side`` (PPTX: cardinality table LEFT, +chart RIGHT; PDF: stacked) and contains, in order: -Per column the Group contains, in order: - -1. A cardinality key/value table: distinct values, ``% distinct`` (distinct / +1. The column name plus, when the LLM layer ran, its business **description** and + **unit** (read from ``profile['llm']['dictionary']``, matched by column name). +2. A cardinality key/value table: distinct values, ``% distinct`` (distinct / total rows), total dataset rows, singleton values (frequency 1), entropy with its theoretical maximum and the normalized ratio, mode, imbalance and string-length stats. -2. A short note flagging problematic cardinality (id-like ≈100% distinct, or a +3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a single dominating category). -3. A ``top-k`` table (value / count / %). -4. A **donut pie chart** of the most common categories (top-k + an "Otros" +4. A ``top-k`` table (value / count / %). +5. 
A **horizontal bar chart** of the most common categories (top-k + an "Otros" bucket), drawn lazily so the renderers scale it to fit entirely. +A short intro names the clickable **[[term:entropia]]entropía[[/term]]** and +**[[term:pagina_categorica]]page-layout[[/term]]** terms — their full +definitions live in the GLOSARIO chapter, so they are NOT repeated inline here +(one click jumps to the glossary entry). The intro omits the dataset row total: +each column's cardinality table already reports it as a comparison baseline. + Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the output of ``summarize_categorical`` (``top[{value,count,pct}]``, ``mode``, ``n_distinct``, ``entropy``, ``imbalance``, ``len_min/mean/max``). The derived -cardinality metrics and the pie figure are delegated to two registry functions -(``categorical_cardinality_block`` and ``categorical_top_pie_figure``); both are +cardinality metrics and the bar figure are delegated to two registry functions +(``categorical_cardinality_block`` and ``categorical_top_bar_figure``); both are imported lazily and degrade to a minimal inline fallback so this chapter never raises even if they are unavailable. @@ -39,10 +43,21 @@ import math from .. import model -CHAPTER_VERSION = "1.2.0" +CHAPTER_VERSION = "1.3.0" CHAPTER_ID = "cat_distr" CHAPTER_TITLE = "Distribuciones categóricas" +# Key under which eda_llm_insights stores its interpretive block in the profile. +LLM_KEY = "llm" + +# Second glossary term this chapter names: "how each categorical page is laid +# out". The long paragraph that used to describe it inline in the intro now lives +# in the GLOSARIO chapter (canonical definition in ``glosario._BASELINE_TERMS``); +# the intro only names the clickable term, relocating the explanation, not losing +# it. The chapter only needs to register key+label here. +_TERM_PAGINA_KEY = "pagina_categorica" +_TERM_PAGINA_LABEL = "Cómo se organiza cada página categórica" + # Glossary term this chapter explains. 
Registered in the shared collector and # marked clickable on its first appearance (end-to-end glossary example — # mejora 6). Other chapters hook their own terms the same way (see the contract). @@ -59,14 +74,14 @@ _TERM_ENTROPIA_DEF = ( # Cap the number of categorical columns rendered to keep the document bounded; # the rest are summarized in a closing note (no silent truncation). MAX_COLS = 40 -# Rows shown in each top-k table and explicit slices in the pie. Kept moderate so -# the whole column — cardinality table + top-k table + donut — fits on ONE +# Rows shown in each top-k table and explicit bars in the chart. Kept moderate so +# the whole column — cardinality table + top-k table + bar chart — fits on ONE # page/slide with the chart next to its tables; the table note still reports # "top N of M" so nothing is silently hidden. For id-like columns (≈100% # distinct) the top-k table is dropped entirely (it would be a list of unique -# values — pure noise), which also frees the room the donut needs (see build). +# values — pure noise), which also frees the room the chart needs (see build). TOP_TABLE_ROWS = 8 -PIE_TOP_K = 6 +CHART_TOP_K = 6 # Truncate very long category labels in tables (the renderer also wraps). Kept # tight so a column with long id-like values (names, tickets) still fits its page. LABEL_MAX = 28 @@ -208,26 +223,74 @@ def _fallback_cardinality(cat: dict, n_rows) -> dict: } -def _pie_make(top, n_distinct, title, n_rows): - """Return a zero-arg callable that builds the donut figure lazily.""" +def _llm_index(profile: dict, ctx: dict) -> dict: + """Map column name -> its LLM dictionary entry (description/unit/...). + + Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the + profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty + dict when ``run_llm`` did not run, so the caller degrades cleanly. Fully + defensive: never raises on malformed input. 
+ """ + llm = profile.get(LLM_KEY) + if not isinstance(llm, dict): + llm = ctx.get(LLM_KEY) + if not isinstance(llm, dict): + return {} + entries = llm.get("dictionary") + if not isinstance(entries, (list, tuple)): + return {} + index: dict = {} + for e in entries: + if not isinstance(e, dict): + continue + col = e.get("column") + if col is None: + continue + index[model._safe_str(col)] = e + return index + + +def _llm_desc_unit_block(name: str, llm_index: dict): + """Markdown block with the LLM business description + unit of a column, or + None when no LLM entry matches the column (clean fallback without LLM).""" + entry = llm_index.get(model._safe_str(name)) + if not isinstance(entry, dict): + return None + raw_desc = entry.get("description") or entry.get("business_meaning") + desc = " ".join(model._safe_str(raw_desc).split()) if raw_desc else "" + raw_unit = entry.get("unit") + unit = " ".join(model._safe_str(raw_unit).split()) if raw_unit else "" + parts = [] + if desc: + parts.append(f"**Descripción:** {desc}") + if unit: + parts.append(f"**Unidad:** {unit}") + if not parts: + return None + return model.Markdown(text=" · ".join(parts)) + + +def _bar_make(top, n_distinct, title, n_rows): + """Return a zero-arg callable that builds the bar figure lazily.""" def make(): try: - from datascience.categorical_top_pie_figure import ( - categorical_top_pie_figure, + from datascience.categorical_top_bar_figure import ( + categorical_top_bar_figure, ) - return categorical_top_pie_figure( + return categorical_top_bar_figure( top=top, n_distinct=n_distinct or 0, title=title, - top_k=PIE_TOP_K, n_rows=n_rows) + top_k=CHART_TOP_K, n_rows=n_rows) except Exception: # noqa: BLE001 — minimal local fallback figure. 
- return _fallback_pie(top, title) + return _fallback_bar(top, title) return make -def _fallback_pie(top, title): - """Minimal donut figure used only if the registry function is unavailable.""" +def _fallback_bar(top, title): + """Minimal horizontal-bar figure used only if the registry function is + unavailable. Largest category on top, the rest folded into "Otros".""" import matplotlib matplotlib.use("Agg") @@ -238,8 +301,8 @@ def _fallback_pie(top, title): items = [t for t in (top or []) if isinstance(t, dict) and isinstance(t.get("count"), (int, float))] items = sorted(items, key=lambda t: t.get("count") or 0, reverse=True) - head = items[:PIE_TOP_K] - rest = items[PIE_TOP_K:] + head = items[:CHART_TOP_K] + rest = items[CHART_TOP_K:] labels = [_truncate(t.get("value"), 20) for t in head] sizes = [float(t.get("count") or 0) for t in head] if rest: @@ -249,10 +312,13 @@ def _fallback_pie(top, title): ax.text(0.5, 0.5, "sin datos categóricos", ha="center", va="center") ax.axis("off") return fig - ax.pie(sizes, labels=None, wedgeprops={"width": 0.42}, - autopct=lambda p: f"{p:.0f}%" if p >= 4 else "") - ax.legend(labels, loc="center left", bbox_to_anchor=(1.0, 0.5), - fontsize=7, frameon=False) + # barh draws bottom-up, so reverse to put the largest category on top. + y_pos = range(len(labels)) + ax.barh(list(y_pos), list(reversed(sizes)), color="#4C72B0", + edgecolor="white") + ax.set_yticks(list(y_pos)) + ax.set_yticklabels(list(reversed(labels)), fontsize=7) + ax.set_xlabel("conteo", fontsize=8) ax.set_title(_truncate(title, 40)) fig.tight_layout() return fig @@ -373,22 +439,17 @@ def _topk_table(cat: dict): note=note) -def _intro_blocks(n_rows, mark_term: bool = False): - total = _fmt_int(n_rows) - # Mark the first appearance of the term as a clickable glossary jump when the - # term was registered (mark_term). 
The full definition of entropy lives in the - # GLOSARIO chapter, so the intro only names the clickable term here instead of - # repeating the long explanation (avoids the redundancy with the glossary). +def _intro_blocks(mark_term: bool = False): + # The full explanation of entropy AND of how each categorical page is laid out + # lives in the GLOSARIO chapter; the chapter body keeps only the minimal + # clickable terms — no descriptive prose — to avoid duplicating the glossary. + # The dataset row total is not repeated here: each column's cardinality table + # already carries "Total filas (dataset)". entropia = ("[[term:entropia]]entropía[[/term]]" if mark_term else "entropía") - text = ( - f"Cada columna categórica ocupa su propia página: sus métricas de " - f"cardinalidad —incluida la {entropia}—, una nota que señala cardinalidad " - "problemática, la tabla de las categorías más frecuentes y un gráfico de " - "tarta (donut) de las más comunes, todo junto." - ) - if n_rows is not None: - text += f" El dataset tiene {total} filas en total como referencia." + pagina = ("[[term:pagina_categorica]]cómo se organiza cada página[[/term]]" + if mark_term else "cómo se organiza cada página") + text = f"Términos: {entropia} · {pagina}." return [ model.Heading(text="Entropía y cardinalidad", level=2), model.Markdown(text=text), @@ -406,15 +467,22 @@ def build_cat_distr(profile: dict, ctx: dict): return None n_rows = profile.get("n_rows") - # Register "entropía" in the shared glossary collector (if present) and mark - # its first appearance clickable. End-to-end glossary example (mejora 6). + # Register "entropía" and the "how each categorical page is laid out" term in + # the shared glossary collector (if present) and mark their first appearance + # clickable. End-to-end glossary example (mejora 6). 
glossary = ctx.get("glossary") mark_term = False if isinstance(glossary, model.GlossaryCollector): glossary.add(_TERM_ENTROPIA_KEY, _TERM_ENTROPIA_LABEL, _TERM_ENTROPIA_DEF) + glossary.add(_TERM_PAGINA_KEY, _TERM_PAGINA_LABEL) mark_term = True - blocks = list(_intro_blocks(n_rows, mark_term=mark_term)) + blocks = list(_intro_blocks(mark_term=mark_term)) + + # Business description + unit per column come from the LLM dictionary + # (profile['llm']['dictionary'], matched by column name); absent without + # run_llm, in which case the per-column description block is simply omitted. + llm_index = _llm_index(profile, ctx) rendered = cat_cols[:MAX_COLS] for idx, col in enumerate(rendered): @@ -422,31 +490,36 @@ def build_cat_distr(profile: dict, ctx: dict): cat = col.get("categorical") or {} card = _normalize_card(_cardinality(cat, n_rows)) - # One Group per categorical column: heading + cardinality table + flag - # note + top-k table + donut figure are kept together and the renderer - # starts each on a fresh page/slide (page_break_before) so every column - # gets its own page with its chart next to its tables. The first column - # may share the intro's page (no forced break) to avoid a near-empty page. - col_blocks = [ - model.Heading(text=str(name), level=2), - _cardinality_block(card), - ] + # One Group per categorical column: heading + (optional) LLM description + + # cardinality table + flag note + top-k table + bar figure are kept + # together and the renderer starts each on a fresh page/slide + # (page_break_before) so every column gets its own page with its chart next + # to its tables. The first column may share the intro's page (no forced + # break) to avoid a near-empty page. 
+ col_blocks = [model.Heading(text=str(name), level=2)] + desc_block = _llm_desc_unit_block(name, llm_index) + if desc_block is not None: + col_blocks.append(desc_block) + col_blocks.append(_cardinality_block(card)) note = _flag_note(card) if note is not None: col_blocks.append(note) # For id-like columns (≈100% distinct) the top-k is a list of unique # values — pure noise; skip it (the flag note already explains why) and - # let the donut take that room so the whole column fits one page/slide. + # let the bar chart take that room so the whole column fits one page/slide. if not card.get("id_like"): topk = _topk_table(cat) if topk is not None: col_blocks.append(topk) col_blocks.append(model.Figure( - make=_pie_make(cat.get("top") or [], card.get("n_distinct"), + make=_bar_make(cat.get("top") or [], card.get("n_distinct"), str(name), n_rows), caption=(f"Categorías más comunes de «{_truncate(name, 32)}» " - "(donut: top-k + «Otros»)"))) - blocks.append(model.Group(blocks=col_blocks, + "(barras: top-k + «Otros»)"))) + # layout="side_by_side": in PPTX the cardinality table goes to the LEFT and + # the bar chart to the RIGHT of the same slide; the PDF renderer stacks it + # (the A5 mobile page is too narrow for two readable columns). + blocks.append(model.Group(blocks=col_blocks, layout="side_by_side", page_break_before=(idx > 0))) if len(cat_cols) > len(rendered): diff --git a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py index 919b86fa..4dd9d334 100644 --- a/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py +++ b/python/functions/datascience/automatic_eda/chapters/cat_distr_test.py @@ -2,12 +2,14 @@ Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast and deterministic. 
Verifies that ``build_cat_distr`` emits the blocks the user -asked for (distinct/total/%-distinct/unique metrics, top-k table and a donut +asked for (distinct/total/%-distinct/unique metrics, top-k table and a bar figure), that EACH categorical column is wrapped in its own keep-together -``Group`` that starts on a fresh page/slide (one column per page, chart next to -its tables), that the long entropy explanation is NOT repeated inline (it lives -in the glossary — only the clickable term is kept), that the chapter renders -inside the full document to both PDF and PPTX showing that content, that a +``Group`` laid out ``side_by_side`` (PPTX: table left / bars right) that starts on +a fresh page/slide (one column per page, chart next to its tables), that the LLM +business description + unit are shown per column when the profile carries an LLM +block, that the long entropy / page-layout explanations are NOT repeated inline +(they live in the glossary — only the clickable terms are kept), that the chapter +renders inside the full document to both PDF and PPTX showing that content, that a profile with no categorical columns yields ``None`` without raising, and that long labels / many columns are never cut in either output. """ @@ -116,6 +118,10 @@ def test_golden_build_cat_distr_emite_bloques_pedidos(): assert "log2" not in md.text # redundant explanation removed. assert "máxima diversidad" not in md.text + # The donut/pie is gone: the intro no longer mentions tarta/donut (the chart + # is now a bar chart; the long page-layout explanation moved to the glossary). + assert "donut" not in md.text and "tarta" not in md.text + # Per-column blocks are wrapped in keep-together Groups: flatten to inspect. 
flat = _flatten(ch.blocks) kv = next(b for b in flat if isinstance(b, KVTable)) @@ -128,11 +134,13 @@ def test_golden_build_cat_distr_emite_bloques_pedidos(): assert any("Entropía" in lbl for lbl in labels) assert "únicos" in values and "%" in values assert "bits" in values and "norm" in values # entropy + max + normalized. - # Top-k table + pie figure. + # Top-k table + bar figure. dt = next(b for b in flat if isinstance(b, DataTable)) assert dt.header == ["Valor", "Conteo", "%"] assert any("neumaticos" in str(cell) for row in dt.rows for cell in row) assert any(isinstance(b, Figure) for b in flat) + # Each per-column Group is laid out side_by_side (table left / bars right). + assert all(g.layout == "side_by_side" for g in _column_groups(ch)) # id-like column flagged with a Note that also explains the top-k is dropped. idnote = next((b for b in flat if isinstance(b, Note) and "identificador" in b.text), None) @@ -140,9 +148,9 @@ def test_golden_build_cat_distr_emite_bloques_pedidos(): assert "No se lista el top" in idnote.text -def test_golden_idlike_omite_topk_y_conserva_donut(): +def test_golden_idlike_omite_topk_y_conserva_grafico(): # The id-like column (uuid, 100% distinct) must NOT carry a top-k DataTable - # (it would be a list of unique values), but must still keep its donut Figure + # (it would be a list of unique values), but must still keep its bar Figure # and its cardinality table so it stays a full per-column page. ch = build_cat_distr(_profile(), {}) groups = _column_groups(ch) @@ -151,7 +159,7 @@ def test_golden_idlike_omite_topk_y_conserva_donut(): kinds = [b.kind for b in uuid_group.blocks] assert "data_table" not in kinds # top-k of unique values dropped. assert "kv_table" in kinds # cardinality kept. - assert "figure" in kinds # donut kept (chart per column). + assert "figure" in kinds # bar chart kept (chart per column). # A non-id-like column keeps its top-k table. 
cat_group = next(g for g in groups if any(getattr(b, "text", "") == "categoria" @@ -205,7 +213,7 @@ def test_golden_render_pdf_una_pagina_por_columna(): assert "Entrop" in txt assert "distintos" in txt assert "categoria" in txt and "neumaticos" in txt - assert "donut" in txt # figure caption rendered as text. + assert "barras" in txt # bar-chart caption rendered as text (PDF). assert "identificador" in txt # id-like note rendered. @@ -258,9 +266,11 @@ def _profile_high_card() -> dict: def test_golden_pptx_una_slide_por_columna_con_su_grafico(): - """Each categorical column occupies EXACTLY ONE cat_distr slide that carries - BOTH its cardinality table and its donut figure (picture) — i.e. the chart is - never separated from its table, even for a high-cardinality column.""" + """Cada columna categórica ocupa EXACTAMENTE UN slide cat_distr que lleva su + gráfico (picture) en la misma slide — el chart nunca se separa de su columna, + ni siquiera para una columna de alta cardinalidad. Con layout side_by_side la + tabla se rasteriza a imagen, así que la comprobación se hace por presencia de + picture (no por el texto de la tabla).""" from pptx.enum.shapes import MSO_SHAPE_TYPE prof = _profile_high_card() @@ -272,7 +282,7 @@ def test_golden_pptx_una_slide_por_columna_con_su_grafico(): prs = Presentation(out) # Per column: the cat_distr slides whose text mentions it, and whether the - # owning slide also has the donut caption + an actual picture shape. + # owning slide also carries an actual picture shape (its chart). 
slides_with_col = {n: [] for n in cat_names} owner_has_chart = {n: False for n in cat_names} for i, sl in enumerate(prs.slides): @@ -288,15 +298,106 @@ def test_golden_pptx_una_slide_por_columna_con_su_grafico(): for n in cat_names: if n in txt: slides_with_col[n].append(i) - has_table = "Cardinalidad" in txt or "distintos" in txt - if has_pic and "donut" in txt and has_table: + if has_pic: owner_has_chart[n] = True for n in cat_names: # Exactly one slide carries the column (not split across slides). assert len(slides_with_col[n]) == 1, (n, slides_with_col[n]) - # That single slide also holds its table AND its donut picture. - assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide") + # That single slide also holds its chart picture. + assert owner_has_chart[n], (n, "el gráfico no está en el slide de la columna") + + +def test_golden_pptx_columna_side_by_side_tabla_izq_barra_der(): + """Con layout side_by_side, una columna categórica coloca su tabla de + cardinalidad (imagen) en la mitad izquierda y su gráfico de barras (imagen) en + la mitad derecha de la MISMA slide. Verifica que al menos una columna queda en + dos columnas (tabla-izq / barras-der), evidencia del side_by_side en PPTX.""" + from pptx.enum.shapes import MSO_SHAPE_TYPE + from pptx.util import Inches + + with tempfile.TemporaryDirectory() as d: + out = os.path.join(d, "eda.pptx") + render_automatic_eda_pptx(_profile(), out, {"title": "EDA"}) + prs = Presentation(out) + centre = int(Inches(13.333 / 2.0)) # half of the 16:9 slide width. + two_col_slides = 0 + for sl in prs.slides: + texts, lefts = [], [] + for sh in sl.shapes: + if sh.has_text_frame: + texts.append(sh.text_frame.text) + if (sh.shape_type == MSO_SHAPE_TYPE.PICTURE + and sh.left is not None): + lefts.append(sh.left) + txt = re.sub(r"\s+", " ", " ".join(texts)) + if "Distribuciones categ" not in txt: + continue + # One picture starts in the left half, another in the right half. 
+ if len(lefts) >= 2 and min(lefts) < centre and max(lefts) > centre: + two_col_slides += 1 + assert two_col_slides >= 1, ( + "ninguna columna quedó con tabla-izq / barras-der (side_by_side)") + + +def _profile_with_llm() -> dict: + """The base profile plus an ``llm`` block (as eda_llm_insights would store it + with run_llm=True): a data dictionary with description/unit per column.""" + prof = _profile() + prof["llm"] = { + "dictionary": [ + {"column": "categoria", + "description": "Familia de producto del recambio", + "business_meaning": "Agrupa el catálogo por tipo de pieza", + "unit": "categoría"}, + {"column": "uuid", + "description": "Identificador único de registro", + "unit": ""}, + ], + } + return prof + + +def test_llm_descripcion_y_unidad_por_columna(): + # With an LLM dictionary, each categorical column whose name matches shows its + # business description and unit in a per-column markdown block. + ch = build_cat_distr(_profile_with_llm(), {}) + groups = _column_groups(ch) + cat_group = next(g for g in groups + if any(getattr(b, "text", "") == "categoria" + for b in g.blocks)) + md = " ".join(b.text for b in cat_group.blocks + if getattr(b, "kind", "") == "markdown") + assert "Descripción" in md and "Familia de producto" in md + assert "Unidad" in md and "categoría" in md + + +def test_edge_sin_llm_no_anade_descripcion(): + # Without an LLM block the per-column description markdown is simply omitted; + # the column still renders its cardinality table and bar figure. + ch = build_cat_distr(_profile(), {}) + for g in _column_groups(ch): + mds = [b.text for b in g.blocks if getattr(b, "kind", "") == "markdown"] + assert not any("Descripción" in t for t in mds) + + +def test_pagina_categorica_clicable_y_definicion_en_glosario(): + # The "how each categorical page is laid out" term is registered + marked + # clickable in the intro, and its full definition lands in the glossary + # chapter (canonical baseline catalog), not inline. 
+ from datascience.automatic_eda.chapters.glosario import build_glosario + + gc = GlossaryCollector() + ch = build_cat_distr(_profile(), {"glossary": gc}) + md = next(b for b in ch.blocks if isinstance(b, Markdown)) + assert "[[term:pagina_categorica]]" in md.text + assert gc.has("pagina_categorica") + glos = build_glosario(_profile(), {"glossary": gc}) + entry = next(b for b in glos.blocks + if getattr(b, "kind", "") == "glossary_entry" + and b.key == "pagina_categorica") + assert "barras" in entry.definition + assert "identificador" in entry.definition def test_edge_sin_categoricas_devuelve_none(): diff --git a/python/functions/datascience/automatic_eda/chapters/glosario.py b/python/functions/datascience/automatic_eda/chapters/glosario.py index fe7098fc..6b7be259 100644 --- a/python/functions/datascience/automatic_eda/chapters/glosario.py +++ b/python/functions/datascience/automatic_eda/chapters/glosario.py @@ -17,10 +17,63 @@ from __future__ import annotations from .. import model -CHAPTER_VERSION = "1.0.0" +CHAPTER_VERSION = "1.1.0" CHAPTER_ID = "glosario" CHAPTER_TITLE = "Glosario" +# Canonical definitions for cross-cutting terms — the "how to read it" entries +# that do not belong to a single chapter. A chapter only needs to *register* the +# term (``ctx['glossary'].add(key, label)``) and mark its in-text appearance with +# ``[[term:key]]…[[/term]]``; this chapter supplies the full definition here when +# the collector carries the term without one. Keeping the prose in a single place +# avoids repeating a long paragraph inline in every chapter that names the term +# (the explanation moved out of the NUM DISTR and CAT DISTR intros lives here). 
+_BASELINE_TERMS = { + "histograma_boxplot": { + "label": "Cómo leer el histograma y el boxplot", + "definition": ( + "Para cada columna numérica se muestra su histograma con tres líneas " + "de referencia: la media (línea roja discontinua), la mediana (línea " + "verde continua) y la banda ±1σ (zona sombreada que cubre una " + "desviación estándar a cada lado de la media). Debajo, alineado al " + "mismo eje horizontal, un boxplot de Tukey: la caja abarca del primer " + "al tercer cuartil (P25–P75), la línea interior es la mediana y los " + "bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay " + "valores más allá de las vallas (posibles atípicos). Comparar la media " + "con la mediana revela la asimetría: si la media supera a la mediana la " + "cola larga cae hacia los valores altos (asimetría a la derecha), y al " + "revés hacia los bajos."), + }, + "pagina_categorica": { + "label": "Cómo se organiza cada página categórica", + "definition": ( + "Cada columna categórica ocupa su propia página: muestra sus métricas " + "de cardinalidad —incluida la entropía—, una nota que señala " + "cardinalidad problemática (columnas que se comportan como " + "identificador, con casi todos los valores distintos, o dominadas por " + "una sola categoría), la tabla de las categorías más frecuentes (top-k, " + "con su conteo y porcentaje) y un gráfico de barras de las categorías " + "más comunes (top-k más una barra «Otros» que agrupa la cola). 
El total " + "de filas del dataset se usa como referencia para interpretar los " + "conteos."), + }, +} + + +def _resolve_term(term: dict) -> tuple: + """Return (label, definition) for a collected term, completing a missing + definition (and, if absent, the label) from the canonical baseline catalog.""" + key = model._safe_str(term.get("key")) + label = model._safe_str(term.get("label")) + definition = model._safe_str(term.get("definition")) + base = _BASELINE_TERMS.get(key) + if base: + if not definition.strip(): + definition = model._safe_str(base.get("definition")) + if not label.strip() or label == key: + label = model._safe_str(base.get("label")) or label + return label, definition + def build_glosario(profile: dict, ctx: dict): """Build the glossary Chapter from the shared collector, or None if empty.""" @@ -36,12 +89,14 @@ def build_glosario(profile: dict, ctx: dict): "Cada término va resaltado en el texto y, al pulsarlo, salta a su " "definición en esta sección.")), ] - # One clickable destination per term, alphabetically by visible label. + # One clickable destination per term, alphabetically by visible label. A term + # registered without a definition is completed from the canonical baseline. 
for term in glossary.terms(by="label"): + label, definition = _resolve_term(term) blocks.append(model.GlossaryEntry( key=model._safe_str(term.get("key")), - label=model._safe_str(term.get("label")), - definition=model._safe_str(term.get("definition")))) + label=label, + definition=definition)) return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr.py b/python/functions/datascience/automatic_eda/chapters/num_distr.py index 5890b123..9401d710 100644 --- a/python/functions/datascience/automatic_eda/chapters/num_distr.py +++ b/python/functions/datascience/automatic_eda/chapters/num_distr.py @@ -35,10 +35,21 @@ try: except Exception: # noqa: BLE001 — keep the chapter importable no matter what. build_boxplot_stats = None # type: ignore[assignment] -CHAPTER_VERSION = "1.2.0" +CHAPTER_VERSION = "1.3.0" CHAPTER_ID = "num_distr" CHAPTER_TITLE = "Distribuciones numéricas" +# Glossary term this chapter explains. The long "how to read the histogram and +# the boxplot" paragraph used to live inline in the intro; it now lives in the +# GLOSARIO chapter (canonical definition in ``glosario._BASELINE_TERMS``) and the +# intro only names the clickable term — one click jumps to the full explanation, +# so the information is relocated, not lost (mejora glosario). +_TERM_HISTOBOX_KEY = "histograma_boxplot" +_TERM_HISTOBOX_LABEL = "Cómo leer el histograma y el boxplot" + +# Key under which eda_llm_insights stores its interpretive block in the profile. +LLM_KEY = "llm" + # Plain-Spanish gloss for every label ``detect_distribution_type`` can emit, so a # non-expert reader understands the shape and the suggested next step (MUST-4.3). _DIST_GLOSS = { @@ -99,6 +110,53 @@ def _numeric_columns(profile: dict) -> list: return out +def _llm_index(profile: dict, ctx: dict) -> dict: + """Map column name -> its LLM dictionary entry (description/unit/...). 
+ + Reads the ``llm.dictionary`` list that ``eda_llm_insights`` stored in the + profile (``profile['llm']``; falls back to ``ctx['llm']``). Returns an empty + dict when ``run_llm`` did not run, so the caller degrades cleanly. Fully + defensive: never raises on malformed input. + """ + llm = profile.get(LLM_KEY) + if not isinstance(llm, dict): + llm = ctx.get(LLM_KEY) + if not isinstance(llm, dict): + return {} + entries = llm.get("dictionary") + if not isinstance(entries, (list, tuple)): + return {} + index: dict = {} + for e in entries: + if not isinstance(e, dict): + continue + col = e.get("column") + if col is None: + continue + index[model._safe_str(col)] = e + return index + + +def _llm_desc_unit_block(name: str, llm_index: dict): + """Markdown block with the LLM business description + unit of a column, or + None when no LLM entry matches the column (clean fallback without LLM).""" + entry = llm_index.get(model._safe_str(name)) + if not isinstance(entry, dict): + return None + raw_desc = entry.get("description") or entry.get("business_meaning") + desc = " ".join(model._safe_str(raw_desc).split()) if raw_desc else "" + raw_unit = entry.get("unit") + unit = " ".join(model._safe_str(raw_unit).split()) if raw_unit else "" + parts = [] + if desc: + parts.append(f"**Descripción:** {desc}") + if unit: + parts.append(f"**Unidad:** {unit}") + if not parts: + return None + return model.Markdown(text=" · ".join(parts)) + + def _make_hist_box(name: str, numeric: dict, box: dict): """Build the histogram (with mean/median/±σ lines) + boxplot figure. @@ -271,15 +329,26 @@ def build_num_distr(profile: dict, ctx: dict): if not numerics: return None # chapter does not apply to a dataset with no numerics. + # Register the "how to read the histogram and boxplot" term in the shared + # glossary collector (if present) and mark its first appearance clickable. 
The + # full explanation (colour code, 1,5·IQR rule, asymmetry reading) lives in the + # GLOSARIO chapter instead of inline here: the intro only names the term. + glossary = ctx.get("glossary") + mark_term = False + if isinstance(glossary, model.GlossaryCollector): + glossary.add(_TERM_HISTOBOX_KEY, _TERM_HISTOBOX_LABEL) + mark_term = True + como_leer = ("[[term:histograma_boxplot]]cómo leer estos gráficos[[/term]]" + if mark_term else "cómo leer estos gráficos") intro = ( - "Para cada columna numérica se muestra su **histograma** con tres líneas " - "de referencia: la **media** (línea roja discontinua), la **mediana** " - "(línea verde continua) y la banda **±1σ** (zona sombreada). Debajo, " - "alineado al mismo eje, un **boxplot de Tukey**: la caja abarca del " - "primer al tercer cuartil (P25–P75), la línea interior es la mediana y " - "los bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay " - "valores más allá de las vallas. Comparar media y mediana revela la " - "asimetría de la distribución.") + "Cada columna numérica muestra su **histograma** (con la **media**, la " + "**mediana** y la banda **±1σ**) y, debajo y al mismo eje, su **boxplot " + f"de Tukey** — {como_leer}.") + + # Business description + unit per column come from the LLM dictionary + # (profile['llm']['dictionary'], matched by column name); absent without + # run_llm, in which case the per-column description block is simply omitted. + llm_index = _llm_index(profile, ctx) blocks = [ model.Heading(text=CHAPTER_TITLE, level=1), @@ -293,17 +362,20 @@ def build_num_distr(profile: dict, ctx: dict): box = build_boxplot_stats(numeric) or {} except Exception: # noqa: BLE001 — degrade, never raise. box = {} - # Keep the column heading, its figure and its stats note together on the - # same page/slide (mejora 3 — keep-together): the renderers measure the - # whole Group and move it whole when it would not fit. 
- blocks.append(model.Group(blocks=[ - model.Heading(text=str(name), level=2), - model.Figure( - make=_figure_maker(name, numeric, box), - caption=f"Distribución de «{name}» — histograma " - f"(media/mediana/±σ) y boxplot."), - model.Markdown(text=_stats_note(name, numeric, box)), - ])) + # Keep the column heading, its (optional) LLM description, its figure and + # its stats note together on the same page/slide (mejora 3 — + # keep-together): the renderers measure the whole Group and move it whole + # when it would not fit. + col_blocks = [model.Heading(text=str(name), level=2)] + desc_block = _llm_desc_unit_block(name, llm_index) + if desc_block is not None: + col_blocks.append(desc_block) + col_blocks.append(model.Figure( + make=_figure_maker(name, numeric, box), + caption=f"Distribución de «{name}» — histograma " + f"(media/mediana/±σ) y boxplot.")) + col_blocks.append(model.Markdown(text=_stats_note(name, numeric, box))) + blocks.append(model.Group(blocks=col_blocks)) return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/num_distr_test.py b/python/functions/datascience/automatic_eda/chapters/num_distr_test.py index 280cff17..ea0b9fd5 100644 --- a/python/functions/datascience/automatic_eda/chapters/num_distr_test.py +++ b/python/functions/datascience/automatic_eda/chapters/num_distr_test.py @@ -101,7 +101,7 @@ def test_golden_chapter_estructura_y_bloques(): def test_golden_media_mediana_sigma_y_boxplot_presentes(): - # The intro documents the three reference lines and the Tukey boxplot; the + # The short intro names the three reference lines and the Tukey boxplot; the # per-column note carries the actual mean/median/σ numbers and the shape. 
def test_glosario_histograma_boxplot_clicable_y_definicion():
    """The intro links the glossary term and the glossary holds the long text."""
    from datascience.automatic_eda.chapters.glosario import build_glosario

    collector = model.GlossaryCollector()
    profile = _profile(n_numeric=1, extra_categorical=False)

    # Building the chapter with a collector must register the term and mark
    # its first mention in the intro as clickable.
    chapter = build_num_distr(profile, {"glossary": collector})
    first_markdown = next(b for b in chapter.blocks if b.kind == "markdown")
    assert "[[term:histograma_boxplot]]" in first_markdown.text
    assert collector.has("histograma_boxplot")

    # The explanation removed from the chapter body must show up as the
    # definition of the matching glossary entry.
    glossary_chapter = build_glosario(profile, {"glossary": collector})
    matching = (
        b for b in glossary_chapter.blocks
        if getattr(b, "kind", "") == "glossary_entry"
        and b.key == "histograma_boxplot"
    )
    entry = next(matching)
    assert "boxplot" in entry.definition.lower()
    assert "1,5·IQR" in entry.definition


def test_llm_descripcion_y_unidad_por_columna():
    """Columns matched by the LLM dictionary show description and unit."""
    profile = _profile(n_numeric=2)
    dictionary = [
        {"column": "precio", "description": "Precio de venta del producto",
         "unit": "EUR"},
        {"column": "alcohol", "business_meaning": "Grado alcohólico",
         "unit": "% vol"},
    ]
    profile["llm"] = {"dictionary": dictionary}

    chapter = build_num_distr(profile, {})
    markdown_texts = [b.text for b in _flatten(chapter.blocks)
                      if b.kind == "markdown"]
    md_all = " ".join(markdown_texts)

    # Entry using the ``description`` key.
    assert "Precio de venta" in md_all
    assert "EUR" in md_all
    # Entry using the ``business_meaning`` fallback key.
    assert "Grado alcohólico" in md_all
    assert "% vol" in md_all


def test_edge_sin_llm_no_anade_descripcion():
    """Without an LLM block no per-column description markdown is emitted."""
    chapter = build_num_distr(_profile(n_numeric=2), {})
    markdown_texts = [b.text for b in _flatten(chapter.blocks)
                      if b.kind == "markdown"]
    assert "Descripción" not in " ".join(markdown_texts)
Contrato de entrada idéntico a categorical_top_pie_figure (swap directo donut↔barras): consume el bloque `top` de summarize_categorical y devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA. Backend Agg sin pyplot global; defensivo total ante top vacío/None, nunca lanza." +tags: [eda, categorical, bar, barh, matplotlib, figure, visualization, datascience, impure] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [matplotlib] +example: | + from categorical_top_bar_figure import categorical_top_bar_figure + top = [ + {"value": "rojo", "count": 40, "pct": 0.4}, + {"value": "azul", "count": 30, "pct": 0.3}, + {"value": "verde", "count": 20, "pct": 0.2}, + ] + fig = categorical_top_bar_figure(top, n_distinct=12, title="color", top_k=6, n_rows=100) +tested: true +tests: + - "test_returns_figure" + - "test_ten_items_topk_six_yields_seven_bars" + - "test_empty_top_does_not_raise_and_returns_figure" + - "test_long_value_truncated" + - "test_none_value_and_none_count_are_handled" + - "test_n_rows_adds_exact_others_bar" +test_file_path: "python/functions/datascience/categorical_top_bar_figure_test.py" +file_path: "python/functions/datascience/categorical_top_bar_figure.py" +params: + - name: top + desc: "Lista de dicts {value, count, pct} ordenada de mayor a menor por count (salida del bloque `top` de summarize_categorical). Puede venir vacía o con dicts incompletos: items no-dict, sin count, con count None o count <= 0 se descartan. value None se admite (etiqueta vacía)." + - name: n_distinct + desc: "Nº total de categorías distintas de la columna. Etiqueta la barra agregada como \"Otros (N categorías)\", con N = n_distinct - nº de barras mostradas (mínimo 0); el nº de barras mostradas puede ser menor que top_k cuando `top` trae menos items. Si n_distinct no supera el nº de barras mostradas, se usa el overflow real de `top` como nº de categorías agregadas. Default 0." + - name: title + desc: "Título de la figura (nombre de la columna). Se trunca a ~48 chars con elipsis si es muy largo. 
Default \"\" (sin título)." + - name: top_k + desc: "Nº máximo de barras explícitas. Default 6. La barra \"Otros\" no cuenta contra este límite. Con top_k <= 0 se muestra al menos la categoría mayor." + - name: n_rows + desc: "Opcional. Total de filas del dataset. Si se da y la suma de counts mostrados < n_rows, la barra \"Otros\" usa (n_rows - suma_mostrada) como count para que sea exacta respecto al total real. Si se omite, \"Otros\" usa la suma de counts fuera del top_k mostrado (solo cuando top trae más de top_k items). Default None." +output: "Un matplotlib.figure.Figure (figsize 6.4 x altura escalada con el nº de barras, dpi 150) con un Axes de barras horizontales: la categoría más frecuente arriba, la barra gris \"Otros (N categorías)\" abajo, cada barra anotada con su conteo y porcentaje al final y etiquetas de categoría (yticklabels) truncadas a ~22 chars. Si no hay counts válidos devuelve igualmente una Figure con un texto centrado \"sin datos categóricos\" (nunca lanza); cualquier error inesperado cae a una Figure con el texto del error. El caller rasteriza/cierra la figura; la función no la muestra ni la guarda." +--- + +## Ejemplo + +```python +from categorical_top_bar_figure import categorical_top_bar_figure + +# `top` es la salida del bloque "top" de summarize_categorical (ya ordenado desc). +top = [ + {"value": "rojo", "count": 40, "pct": 0.40}, + {"value": "azul", "count": 30, "pct": 0.30}, + {"value": "verde", "count": 20, "pct": 0.20}, + {"value": "amarillo", "count": 5, "pct": 0.05}, +] + +fig = categorical_top_bar_figure( + top, + n_distinct=12, # 12 categorías distintas en total + title="color_producto", + top_k=6, # hasta 6 barras explícitas + n_rows=100, # "Otros" = 100 - 95 = 5, sobre 8 categorías agregadas +) + +# El renderer del informe lo rasteriza; aquí solo persistimos para inspección. 
+fig.savefig("/tmp/barras_color.png") +``` + +## Cuando usarla + +Úsala dentro de un informe EDA cuando quieras comparar **magnitudes** de las +categorías dominantes de una columna categórica: qué categoría manda y por +cuánto frente a las siguientes. Pásale directamente el bloque `top` de +`summarize_categorical` (ya ordenado de mayor a menor) más `n_distinct` para que +la barra "Otros" indique cuántas categorías quedan agrupadas. Es el clon "de +barras" del donut `categorical_top_pie_figure` con **contrato de entrada +idéntico**: puedes intercambiar una por otra sin tocar el caller. Elige barras +cuando importe comparar tamaños exactos; el donut cuando importe la proporción +del total. + +## Gotchas + +- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg` + y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí, + para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO + es thread-safe; esta función evita ese riesgo construyendo el `Figure` + directamente, así que es segura de llamar en bucle desde el renderer. +- **El caller cierra la figura.** La función devuelve el `Figure` pero no lo + muestra ni lo guarda. Quien la consume debe rasterizarla y luego liberarla + (`fig.clf()` / `matplotlib.pyplot.close(fig)` si se usó pyplot en el caller) + para no acumular memoria en lotes grandes de columnas. +- **`barh` dibuja de abajo arriba.** La categoría más frecuente va arriba porque + el orden de display se invierte antes de plotear; la barra "Otros" queda + siempre al fondo. No reordenes `top` esperando otro layout: la función asume + que ya viene ordenado desc por count. +- **Magnitud exacta de "Otros" solo con `n_rows`.** Sin `n_rows`, la barra + "Otros" se calcula con el overflow presente en `top`; si `top` ya viene + recortado a `top_k` por el productor, no habrá "Otros" aunque existan más + categorías. Pasa `n_rows` (total de filas del dataset) para una barra correcta + respecto al total real. 
"""Impure EDA helper: horizontal bar figure of the most common categories (`eda` group).

Builds a horizontal bar chart of the ``top_k`` most frequent categories of a
categorical column, folding everything else into a single gray
"Otros (N categorías)" bar. The most frequent category sits at the top and each
bar is annotated with its count and percentage. The function returns a
``matplotlib.figure.Figure``; it never shows nor saves it.

It mirrors the signature of ``categorical_top_pie_figure``
(``top``/``n_distinct``/``title``/``top_k``/``n_rows``) so callers can swap one
for the other.

Impure because it touches matplotlib: importing this module selects the Agg
backend, and figures are built through the object-oriented ``Figure`` API
rather than ``pyplot``.
"""

import math

import matplotlib

matplotlib.use("Agg")

from matplotlib.figure import Figure  # noqa: E402


# Gray reserved for the aggregated "Otros" bar.
_OTHER_COLOR = "#9e9e9e"
# Muted gray for secondary text (no-data message).
_MUTED_TEXT = "#5f6b7a"
# Soft red for the error fallback message.
_ERROR_TEXT = "#b00020"
# Qualitative palette for the explicit bars; cycled when there are more bars
# than colours.
_PALETTE = [
    "#4C72B0",
    "#DD8452",
    "#55A868",
    "#C44E52",
    "#8172B3",
    "#937860",
    "#DA8BC3",
    "#8C8C8C",
    "#CCB974",
    "#64B5CD",
]


def _truncate(text, width: int = 22) -> str:
    """Return ``text`` as a string cut to ``width`` chars, with an ellipsis if cut.

    ``None`` becomes the empty string. For ``width <= 1`` there is no room for
    the ellipsis, so the string is simply sliced.
    """
    s = "" if text is None else str(text)
    if len(s) <= width:
        return s
    if width <= 1:
        return s[:width]
    return s[: width - 1] + "…"


def _message_figure(message: str, color: str = _MUTED_TEXT, title: str = "") -> "Figure":
    """Return a fallback ``Figure`` carrying a single centered message.

    Used both for the "nothing to draw" case and for the error fallback. The
    axes are hidden; ``title`` (truncated to 48 chars) is drawn only when
    non-empty.
    """
    fig = Figure(figsize=(6.4, 4.0), dpi=150)
    ax = fig.add_subplot(111)
    ax.axis("off")
    ax.text(
        0.5,
        0.5,
        message,
        ha="center",
        va="center",
        fontsize=12,
        color=color,
        wrap=True,
        transform=ax.transAxes,
    )
    if title:
        ax.set_title(_truncate(title, 48), fontsize=12, loc="center", pad=8)
    fig.tight_layout()
    return fig


def _clean_top(top) -> list:
    """Return the drawable ``(value, count)`` pairs of ``top``, order preserved.

    Keeps only dict items whose ``count`` converts to a finite float greater
    than zero. NaN and infinite counts are dropped here: they would pass a
    plain ``count <= 0`` check and later break the axis limits and the
    ``int(round(...))`` annotation, turning the whole chart into the error
    figure. Anything that is not a list or tuple yields an empty list.
    """
    if not isinstance(top, (list, tuple)):
        return []
    cleaned = []
    for item in top:
        if not isinstance(item, dict):
            continue
        count = item.get("count")
        if count is None:
            continue
        try:
            count = float(count)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: an int too large to be represented as a float.
            continue
        if not math.isfinite(count) or count <= 0:
            continue
        cleaned.append((item.get("value"), count))
    return cleaned


def categorical_top_bar_figure(
    top: list,
    n_distinct: int = 0,
    title: str = "",
    top_k: int = 6,
    n_rows=None,
) -> "matplotlib.figure.Figure":
    """Build a horizontal bar figure of the most common categories of a column.

    Renders up to ``top_k`` categories as explicit horizontal bars, largest at
    the top, and aggregates the remainder into a single gray
    "Otros (N categorías)" bar at the bottom. Each bar is annotated with its
    count and its percentage of the sum of all drawn bars; category names are
    truncated Y tick labels.

    The function never raises: when there is nothing valid to draw it returns
    a ``Figure`` with a centered "sin datos categóricos" message, and any
    unexpected error is turned into a ``Figure`` carrying the error text.

    Args:
        top: List (or tuple) of ``{value, count, pct}`` dicts, expected to be
            already sorted by ``count`` descending; the order is used as given.
            Non-dict items and items whose ``count`` is missing, ``None``,
            non-numeric, non-finite or not positive are skipped. ``pct`` is
            not read.
        n_distinct: Total number of distinct categories in the column. The
            aggregated bar is labelled with ``n_distinct - <bars shown>``
            (floored at 0). When that is 0 but ``top`` holds more valid items
            than were shown, the number of those extra items is used instead.
        title: Figure title. Truncated to 48 characters.
        top_k: Maximum number of explicit bars; the "Otros" bar does not count
            against it. With ``top_k <= 0`` the first valid category is still
            shown.
        n_rows: Optional total row count. When it is a finite number above the
            sum of the shown counts, "Otros" uses ``n_rows - sum_shown`` as its
            count. Otherwise "Otros" uses the sum of the valid counts in
            ``top`` beyond the shown bars.

    Returns:
        A ``matplotlib.figure.Figure`` with a single Axes. The caller is
        responsible for rasterizing/releasing it.
    """
    try:
        safe_title = _truncate(title, 48)

        cleaned = _clean_top(top)
        if not cleaned:
            return _message_figure("sin datos categóricos", title=title)

        # --- Split into shown bars and the aggregated remainder.
        shown = cleaned[: max(int(top_k), 0)]
        if not shown:  # top_k <= 0 — show at least the largest category.
            shown = cleaned[:1]

        sum_shown = sum(c for _, c in shown)
        overflow_count = sum(c for _, c in cleaned[len(shown):])

        # How many categories are folded into "Otros".
        try:
            nd = int(n_distinct)
        except (TypeError, ValueError, OverflowError):
            nd = 0
        others_categories = max(nd - len(shown), 0)
        # If n_distinct is unknown/too small, fall back to the overflow we
        # actually have in `top` beyond the shown bars.
        overflow_items = len(cleaned) - len(shown)
        if others_categories == 0 and overflow_items > 0:
            others_categories = overflow_items

        # Count attributed to the "Otros" bar.
        others_count = 0.0
        if n_rows is not None:
            try:
                total_rows = float(n_rows)
            except (TypeError, ValueError, OverflowError):
                total_rows = None
            # A non-finite total would produce an infinite bar; ignore it.
            if (
                total_rows is not None
                and math.isfinite(total_rows)
                and total_rows > sum_shown
            ):
                others_count = total_rows - sum_shown
        if others_count <= 0:
            others_count = overflow_count

        # --- Build the display order (top to bottom): largest .. smallest, Otros.
        display_labels = [_truncate(v, 22) for v, _ in shown]
        display_values = [c for _, c in shown]
        display_colors = [_PALETTE[i % len(_PALETTE)] for i in range(len(shown))]

        # The aggregated bar needs both a magnitude and a category count.
        has_others = others_count > 0 and others_categories > 0
        if has_others:
            display_labels.append(f"Otros ({others_categories} categorías)")
            display_values.append(others_count)
            display_colors.append(_OTHER_COLOR)

        total = sum(display_values) or 1.0

        # barh draws bottom-up, so reverse the display order before plotting to
        # land the largest category on top and "Otros" at the bottom.
        labels = list(reversed(display_labels))
        values = list(reversed(display_values))
        colors = list(reversed(display_colors))
        y_pos = range(len(values))

        # Height scales with the number of bars, clamped to [2.4, 14.0] inches.
        n_bars = len(values)
        height = max(2.4, min(0.4 * n_bars + 1.2, 14.0))
        fig = Figure(figsize=(6.4, height), dpi=150)
        ax = fig.add_subplot(111)

        ax.barh(list(y_pos), values, color=colors, edgecolor="white")
        ax.set_yticks(list(y_pos))
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlabel("conteo", fontsize=9)

        # Leave 18% headroom on the right so the annotations fit inside the axes.
        max_val = max(values) if values else 1.0
        ax.set_xlim(0, max_val * 1.18 if max_val > 0 else 1.0)

        # Annotate each bar with its count and percentage at the end of the bar.
        for y, val in zip(y_pos, values):
            pct = val / total * 100.0
            ax.text(
                val + max_val * 0.012,
                y,
                f"{int(round(val))} ({pct:.0f}%)",
                va="center",
                ha="left",
                fontsize=7,
                color="#202020",
            )

        if safe_title:
            ax.set_title(safe_title, fontsize=13, loc="left", pad=10)

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _message_figure(
            f"error al dibujar barras: {exc}", color=_ERROR_TEXT
        )
"""Tests for categorical_top_bar_figure (top-category bar chart, eda group).

The Agg backend is selected up front and no figure is shown or saved. Every
test calls ``matplotlib.pyplot.close`` on the figure it built before returning.
"""

import matplotlib

matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402
from matplotlib.figure import Figure  # noqa: E402

from categorical_top_bar_figure import categorical_top_bar_figure


def _make_top(n):
    """Build ``n`` ``{value, count, pct}`` items sorted by count descending.

    Counts run from ``n`` down to 1; ``pct`` is each count over their sum.
    """
    grand_total = sum(range(1, n + 1))
    rows = []
    for rank in range(n):
        weight = n - rank
        rows.append(
            {"value": f"cat_{rank}", "count": weight, "pct": weight / grand_total}
        )
    return rows


def _bar_count(ax):
    """Number of bars on ``ax``: size of its first container, 0 if it has none."""
    return len(ax.containers[0]) if ax.containers else 0


def test_returns_figure():
    figure = categorical_top_bar_figure(_make_top(3), n_distinct=3, title="col")
    assert isinstance(figure, Figure)
    plt.close(figure)


def test_ten_items_topk_six_yields_seven_bars():
    figure = categorical_top_bar_figure(
        _make_top(10), n_distinct=10, title="muchas", top_k=6
    )
    # 6 explicit categories + 1 aggregated "Otros" bar.
    assert _bar_count(figure.axes[0]) == 7
    plt.close(figure)


def test_empty_top_does_not_raise_and_returns_figure():
    figure = categorical_top_bar_figure([], n_distinct=0, title="vacía")
    assert isinstance(figure, Figure)
    # No data: the fallback figure must not contain any bar.
    assert _bar_count(figure.axes[0]) == 0
    plt.close(figure)


def test_long_value_truncated():
    long_value = "una_categoria_con_un_nombre_larguisimo_que_excede_el_limite"
    rows = [
        {"value": long_value, "count": 10, "pct": 0.5},
        {"value": "corta", "count": 10, "pct": 0.5},
    ]
    figure = categorical_top_bar_figure(rows, n_distinct=2, title="col", top_k=6)
    tick_texts = []
    for tick in figure.axes[0].get_yticklabels():
        tick_texts.append(tick.get_text())
    # The long value shows up cut with an ellipsis, never in full.
    assert any("…" in text for text in tick_texts)
    assert long_value not in " ".join(tick_texts)
    plt.close(figure)


def test_none_value_and_none_count_are_handled():
    rows = [
        {"value": None, "count": 5, "pct": 0.5},
        {"value": "b", "count": None, "pct": 0.0},  # None count -> dropped
        {"value": "c", "count": 5, "pct": 0.5},
    ]
    figure = categorical_top_bar_figure(
        rows, n_distinct=2, title="con nones", top_k=6
    )
    assert isinstance(figure, Figure)
    # Two valid items and no overflow -> 2 bars, no "Otros".
    assert _bar_count(figure.axes[0]) == 2
    plt.close(figure)


def test_n_rows_adds_exact_others_bar():
    # The 3 shown categories add up to 30 of 100 real rows -> "Otros" = 70.
    rows = [
        {"value": "a", "count": 15, "pct": 0.15},
        {"value": "b", "count": 10, "pct": 0.10},
        {"value": "c", "count": 5, "pct": 0.05},
    ]
    figure = categorical_top_bar_figure(
        rows, n_distinct=20, title="col", top_k=3, n_rows=100
    )
    axes = figure.axes[0]
    # 3 explicit bars + "Otros".
    assert _bar_count(axes) == 4
    tick_texts = [tick.get_text() for tick in axes.get_yticklabels()]
    # "Otros" reports n_distinct minus the 3 shown bars = 17 categories.
    assert any("Otros (17 categorías)" in text for text in tick_texts)
    # Its annotation carries the count 70.
    annotation_texts = [note.get_text() for note in axes.texts]
    assert any("70" in text for text in annotation_texts)
    plt.close(figure)