Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| eaca41a532 | |||
| a1e2e3567c | |||
| 833597c831 | |||
| 7158be8142 | |||
| 9be84a48ea | |||
| 4099d88eaf | |||
| 48de3ce3da |
@@ -64,6 +64,7 @@ from .exploratory_caveats import exploratory_caveats
|
||||
from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational
|
||||
from .render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from .render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
from .render_automatic_eda_markdown import render_automatic_eda_markdown
|
||||
from .detect_time_column import detect_time_column
|
||||
from .extract_timeseries_raw import extract_timeseries_raw
|
||||
from .build_eda_render_ctx import build_eda_render_ctx
|
||||
@@ -82,6 +83,7 @@ __all__ = [
|
||||
"resample_timeseries",
|
||||
"render_automatic_eda_pdf",
|
||||
"render_automatic_eda_pptx",
|
||||
"render_automatic_eda_markdown",
|
||||
"decode_qr_image",
|
||||
"adf_kpss_stationarity",
|
||||
"acf_pacf",
|
||||
|
||||
@@ -36,6 +36,7 @@ from .model import ( # noqa: F401
|
||||
from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document # noqa: F401
|
||||
from .render_pdf_impl import render_pdf # noqa: F401
|
||||
from .render_pptx_impl import render_pptx # noqa: F401
|
||||
from .render_md_impl import render_md # noqa: F401
|
||||
|
||||
__all__ = [
|
||||
"ENGINE_NAME",
|
||||
@@ -60,4 +61,5 @@ __all__ = [
|
||||
"build_document",
|
||||
"render_pdf",
|
||||
"render_pptx",
|
||||
"render_md",
|
||||
]
|
||||
|
||||
@@ -1,19 +1,25 @@
|
||||
"""Categorical distributions chapter (CAT DISTR).
|
||||
|
||||
Third reference chapter for AutomaticEDA. For every categorical column it shows,
|
||||
fulfilling the user's request:
|
||||
Third reference chapter for AutomaticEDA. Each categorical column gets **its own
|
||||
page (PDF) / slide (PPTX)**: every column is wrapped in a keep-together
|
||||
``model.Group`` with ``page_break_before=True`` (except the first, which may share
|
||||
the intro's page), so its chart sits next to its tables and no column is split.
|
||||
|
||||
1. A short opening explanation of **Shannon entropy** (what it measures, its 0
|
||||
and log2(k) bounds, the normalized 0–1 version) and the dataset row total used
|
||||
as a comparison baseline.
|
||||
2. Per column, a cardinality key/value table: distinct values, ``% distinct``
|
||||
(distinct / total rows), total dataset rows, singleton values (frequency 1),
|
||||
entropy with its theoretical maximum and the normalized ratio, mode, imbalance
|
||||
and string-length stats.
|
||||
3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
A short intro names the clickable **[[term:entropia]]entropía[[/term]]** term —
|
||||
the full definition lives in the GLOSARIO chapter, so it is NOT repeated inline
|
||||
here (one click jumps to the glossary entry). The intro also carries the dataset
|
||||
row total used as a comparison baseline.
|
||||
|
||||
Per column the Group contains, in order:
|
||||
|
||||
1. A cardinality key/value table: distinct values, ``% distinct`` (distinct /
|
||||
total rows), total dataset rows, singleton values (frequency 1), entropy with
|
||||
its theoretical maximum and the normalized ratio, mode, imbalance and
|
||||
string-length stats.
|
||||
2. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
single dominating category).
|
||||
4. A ``top-k`` table (value / count / %).
|
||||
5. A **donut pie chart** of the most common categories (top-k + an "Otros"
|
||||
3. A ``top-k`` table (value / count / %).
|
||||
4. A **donut pie chart** of the most common categories (top-k + an "Otros"
|
||||
bucket), drawn lazily so the renderers scale it to fit entirely.
|
||||
|
||||
Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the
|
||||
@@ -33,7 +39,7 @@ import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "cat_distr"
|
||||
CHAPTER_TITLE = "Distribuciones categóricas"
|
||||
|
||||
@@ -53,11 +59,17 @@ _TERM_ENTROPIA_DEF = (
|
||||
# Cap the number of categorical columns rendered to keep the document bounded;
|
||||
# the rest are summarized in a closing note (no silent truncation).
|
||||
MAX_COLS = 40
|
||||
# Rows shown in each top-k table and explicit slices in the pie.
|
||||
TOP_TABLE_ROWS = 15
|
||||
# Rows shown in each top-k table and explicit slices in the pie. Kept moderate so
|
||||
# the whole column — cardinality table + top-k table + donut — fits on ONE
|
||||
# page/slide with the chart next to its tables; the table note still reports
|
||||
# "top N of M" so nothing is silently hidden. For id-like columns (≈100%
|
||||
# distinct) the top-k table is dropped entirely (it would be a list of unique
|
||||
# values — pure noise), which also frees the room the donut needs (see build).
|
||||
TOP_TABLE_ROWS = 8
|
||||
PIE_TOP_K = 6
|
||||
# Truncate very long category labels in tables (the renderer also wraps).
|
||||
LABEL_MAX = 48
|
||||
# Truncate very long category labels in tables (the renderer also wraps). Kept
|
||||
# tight so a column with long id-like values (names, tickets) still fits its page.
|
||||
LABEL_MAX = 28
|
||||
|
||||
|
||||
def _fmt_int(value) -> str:
|
||||
@@ -267,45 +279,55 @@ def _normalize_card(card: dict) -> dict:
|
||||
|
||||
|
||||
def _cardinality_block(card: dict):
|
||||
"""KVTable with the cardinality / entropy metrics for one column."""
|
||||
"""KVTable with the cardinality / entropy metrics for one column.
|
||||
|
||||
Related metrics are grouped onto a single row each (distinct/%/unique;
|
||||
entropy bits/max/normalized; length min/mean/max) so the whole column —
|
||||
table + chart — fits one page/slide without dropping any datum; the short
|
||||
16:9 PPTX slide does not fit one metric per row plus a chart otherwise."""
|
||||
n_singletons = card.get("n_singletons")
|
||||
if n_singletons is not None and card.get("n_singletons_partial"):
|
||||
singletons = f"≥{_fmt_int(n_singletons)} (en top mostrado)"
|
||||
singletons = f"≥{_fmt_int(n_singletons)}"
|
||||
elif n_singletons is not None:
|
||||
singletons = _fmt_int(n_singletons)
|
||||
else:
|
||||
singletons = "—"
|
||||
|
||||
entropy_ref = _fmt_num(card.get("entropy"))
|
||||
emax = card.get("entropy_max")
|
||||
if emax is not None:
|
||||
entropy_ref = f"{entropy_ref} (máx {_fmt_num(emax)})"
|
||||
# Distinct count · % distinct · unique (frequency 1) on one row.
|
||||
distinct_combo = (f"{_fmt_int(card.get('n_distinct'))} · "
|
||||
f"{_fmt_pct_value(card.get('pct_distinct'))} · "
|
||||
f"{singletons} únicos")
|
||||
|
||||
# Entropy bits · theoretical max · normalized 0–1 on one row.
|
||||
entropy_combo = (f"{_fmt_num(card.get('entropy'))} bits · "
|
||||
f"máx {_fmt_num(card.get('entropy_max'))} · "
|
||||
f"norm {_fmt_num(card.get('entropy_norm'))}")
|
||||
|
||||
mode = card.get("mode")
|
||||
mode_pct = card.get("mode_pct")
|
||||
mode_str = "—" if mode is None else model._safe_str(mode)
|
||||
mode_str = "—" if mode is None else _truncate(mode, 32)
|
||||
if mode is not None and mode_pct is not None:
|
||||
mode_str = f"{mode_str} ({_fmt_pct_value(mode_pct)})"
|
||||
|
||||
rows = [
|
||||
("Valores distintos", _fmt_int(card.get("n_distinct"))),
|
||||
("% distintos", _fmt_pct_value(card.get("pct_distinct"))),
|
||||
("Distintos · % · únicos", distinct_combo),
|
||||
("Total filas (dataset)", _fmt_int(card.get("n_rows"))),
|
||||
("Valores únicos (frecuencia 1)", singletons),
|
||||
("Entropía (bits)", entropy_ref),
|
||||
("Entropía normalizada (0–1)", _fmt_num(card.get("entropy_norm"))),
|
||||
("Entropía (bits · máx · norm)", entropy_combo),
|
||||
("Moda", mode_str),
|
||||
]
|
||||
imbalance = card.get("imbalance")
|
||||
if imbalance is not None:
|
||||
rows.append(("Desbalance", _fmt_num(imbalance)))
|
||||
lm = card.get("len_min")
|
||||
lmean = card.get("len_mean")
|
||||
lmax = card.get("len_max")
|
||||
# Imbalance and string length (both secondary) share one closing row.
|
||||
extras = []
|
||||
if imbalance is not None:
|
||||
extras.append(f"desbalance {_fmt_num(imbalance)}")
|
||||
if any(v is not None for v in (lm, lmean, lmax)):
|
||||
rows.append((
|
||||
"Longitud (mín/media/máx)",
|
||||
f"{_fmt_num(lm)} / {_fmt_num(lmean)} / {_fmt_num(lmax)}"))
|
||||
extras.append(
|
||||
f"long. {_fmt_num(lm)}/{_fmt_num(lmean)}/{_fmt_num(lmax)}")
|
||||
if extras:
|
||||
rows.append(("Desbalance · longitud", " · ".join(extras)))
|
||||
return model.KVTable(rows=rows, title="Cardinalidad")
|
||||
|
||||
|
||||
@@ -315,7 +337,8 @@ def _flag_note(card: dict):
|
||||
return model.Note(
|
||||
"Casi todos los valores son distintos (≈100% distintos): la columna "
|
||||
"se comporta como un identificador y aporta poco para agrupar o "
|
||||
"comparar categorías.")
|
||||
"comparar categorías. No se lista el top de categorías (serían "
|
||||
"valores casi todos únicos).")
|
||||
if card.get("dominated"):
|
||||
mp = card.get("mode_pct")
|
||||
mp_str = _fmt_pct_value(mp) if mp is not None else "muy alta"
|
||||
@@ -335,7 +358,7 @@ def _topk_table(cat: dict):
|
||||
if not isinstance(t, dict):
|
||||
continue
|
||||
rows.append([
|
||||
model._safe_str(t.get("value")),
|
||||
_truncate(t.get("value")),
|
||||
_fmt_int(t.get("count")),
|
||||
_pct_from_maybe_fraction(t.get("pct")),
|
||||
])
|
||||
@@ -353,20 +376,16 @@ def _topk_table(cat: dict):
|
||||
def _intro_blocks(n_rows, mark_term: bool = False):
|
||||
total = _fmt_int(n_rows)
|
||||
# Mark the first appearance of the term as a clickable glossary jump when the
|
||||
# term was registered (mark_term). The visible text is identical either way.
|
||||
entropia = ("[[term:entropia]]**entropía de Shannon**[[/term]]" if mark_term
|
||||
else "**entropía de Shannon**")
|
||||
# term was registered (mark_term). The full definition of entropy lives in the
|
||||
# GLOSARIO chapter, so the intro only names the clickable term here instead of
|
||||
# repeating the long explanation (avoids the redundancy with the glossary).
|
||||
entropia = ("[[term:entropia]]entropía[[/term]]" if mark_term
|
||||
else "entropía")
|
||||
text = (
|
||||
f"La {entropia} mide cómo de repartidos están los valores de "
|
||||
"una columna categórica, en bits. Vale 0 cuando una sola categoría "
|
||||
"concentra todas las filas (máxima previsibilidad) y alcanza su máximo, "
|
||||
"log2(k) para k categorías distintas, cuando todas aparecen por igual "
|
||||
"(máxima diversidad). La **entropía normalizada** (entropía dividida por "
|
||||
"su máximo) la lleva al rango 0–1 para comparar columnas con distinto "
|
||||
"número de categorías. Para cada columna se muestran los valores "
|
||||
"distintos, el porcentaje que representan sobre el total de filas, los "
|
||||
"valores únicos (que aparecen una sola vez), la tabla de las categorías "
|
||||
"más frecuentes y un gráfico de tarta (donut) de las más comunes."
|
||||
f"Cada columna categórica ocupa su propia página: sus métricas de "
|
||||
f"cardinalidad —incluida la {entropia}—, una nota que señala cardinalidad "
|
||||
"problemática, la tabla de las categorías más frecuentes y un gráfico de "
|
||||
"tarta (donut) de las más comunes, todo junto."
|
||||
)
|
||||
if n_rows is not None:
|
||||
text += f" El dataset tiene {total} filas en total como referencia."
|
||||
@@ -398,24 +417,37 @@ def build_cat_distr(profile: dict, ctx: dict):
|
||||
blocks = list(_intro_blocks(n_rows, mark_term=mark_term))
|
||||
|
||||
rendered = cat_cols[:MAX_COLS]
|
||||
for col in rendered:
|
||||
for idx, col in enumerate(rendered):
|
||||
name = col.get("name") or "(columna)"
|
||||
cat = col.get("categorical") or {}
|
||||
card = _normalize_card(_cardinality(cat, n_rows))
|
||||
|
||||
blocks.append(model.Heading(text=str(name), level=2))
|
||||
blocks.append(_cardinality_block(card))
|
||||
# One Group per categorical column: heading + cardinality table + flag
|
||||
# note + top-k table + donut figure are kept together and the renderer
|
||||
# starts each on a fresh page/slide (page_break_before) so every column
|
||||
# gets its own page with its chart next to its tables. The first column
|
||||
# may share the intro's page (no forced break) to avoid a near-empty page.
|
||||
col_blocks = [
|
||||
model.Heading(text=str(name), level=2),
|
||||
_cardinality_block(card),
|
||||
]
|
||||
note = _flag_note(card)
|
||||
if note is not None:
|
||||
blocks.append(note)
|
||||
topk = _topk_table(cat)
|
||||
if topk is not None:
|
||||
blocks.append(topk)
|
||||
blocks.append(model.Figure(
|
||||
col_blocks.append(note)
|
||||
# For id-like columns (≈100% distinct) the top-k is a list of unique
|
||||
# values — pure noise; skip it (the flag note already explains why) and
|
||||
# let the donut take that room so the whole column fits one page/slide.
|
||||
if not card.get("id_like"):
|
||||
topk = _topk_table(cat)
|
||||
if topk is not None:
|
||||
col_blocks.append(topk)
|
||||
col_blocks.append(model.Figure(
|
||||
make=_pie_make(cat.get("top") or [], card.get("n_distinct"),
|
||||
str(name), n_rows),
|
||||
caption=(f"Categorías más comunes de «{_truncate(name, 32)}» "
|
||||
"(donut: top-k + «Otros»)")))
|
||||
blocks.append(model.Group(blocks=col_blocks,
|
||||
page_break_before=(idx > 0)))
|
||||
|
||||
if len(cat_cols) > len(rendered):
|
||||
omitted = len(cat_cols) - len(rendered)
|
||||
|
||||
@@ -2,11 +2,14 @@
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user
|
||||
asked for (entropy intro, distinct/total/%-distinct/unique metrics, top-k table
|
||||
and a donut figure), that the chapter renders inside the full document to both
|
||||
PDF and PPTX showing that content, that a profile with no categorical columns
|
||||
yields ``None`` without raising, and that long labels / many columns are never
|
||||
cut in either output.
|
||||
asked for (distinct/total/%-distinct/unique metrics, top-k table and a donut
|
||||
figure), that EACH categorical column is wrapped in its own keep-together
|
||||
``Group`` that starts on a fresh page/slide (one column per page, chart next to
|
||||
its tables), that the long entropy explanation is NOT repeated inline (it lives
|
||||
in the glossary — only the clickable term is kept), that the chapter renders
|
||||
inside the full document to both PDF and PPTX showing that content, that a
|
||||
profile with no categorical columns yields ``None`` without raising, and that
|
||||
long labels / many columns are never cut in either output.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -17,7 +20,8 @@ from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import (
|
||||
DataTable, Figure, Heading, KVTable, Note,
|
||||
DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown,
|
||||
Note,
|
||||
)
|
||||
from datascience.automatic_eda.chapters.cat_distr import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_cat_distr,
|
||||
@@ -81,8 +85,20 @@ def _pptx_text(path: str) -> str:
|
||||
return re.sub(r"\s+", " ", " ".join(parts))
|
||||
|
||||
|
||||
def _kinds(chapter):
|
||||
return [b.kind for b in chapter.blocks]
|
||||
def _flatten(blocks):
|
||||
"""Expand keep-together Groups so the per-column heading/table/figure are
|
||||
inspectable as a flat block list (the chapter wraps each column in a Group)."""
|
||||
out = []
|
||||
for b in blocks:
|
||||
if getattr(b, "kind", "") == "group":
|
||||
out.extend(_flatten(getattr(b, "blocks", []) or []))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
def _column_groups(chapter):
|
||||
return [b for b in chapter.blocks if isinstance(b, Group)]
|
||||
|
||||
|
||||
def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
@@ -90,36 +106,101 @@ def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
assert ch is not None
|
||||
assert ch.id == CHAPTER_ID
|
||||
assert ch.version == CHAPTER_VERSION
|
||||
kinds = _kinds(ch)
|
||||
# Entropy intro present.
|
||||
|
||||
# Entropy intro present, but the long explanation is gone (it lives in the
|
||||
# glossary now): only the term is named, no log2/normalizada walkthrough.
|
||||
headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
|
||||
assert any("Entrop" in h for h in headings)
|
||||
md = next(b for b in ch.blocks if b.kind == "markdown")
|
||||
assert "entropía" in md.text.lower() and "log2" in md.text
|
||||
# Cardinality metrics: distinct, total rows, %-distinct, unique values.
|
||||
kv = next(b for b in ch.blocks if isinstance(b, KVTable))
|
||||
md = next(b for b in ch.blocks if isinstance(b, Markdown))
|
||||
assert "entropía" in md.text.lower()
|
||||
assert "log2" not in md.text # redundant explanation removed.
|
||||
assert "máxima diversidad" not in md.text
|
||||
|
||||
# Per-column blocks are wrapped in keep-together Groups: flatten to inspect.
|
||||
flat = _flatten(ch.blocks)
|
||||
kv = next(b for b in flat if isinstance(b, KVTable))
|
||||
labels = [r[0] for r in kv.rows]
|
||||
assert "Valores distintos" in labels
|
||||
assert "% distintos" in labels
|
||||
values = " ".join(str(r[1]) for r in kv.rows)
|
||||
# Cardinality metrics: distinct count, %-distinct, unique values and total
|
||||
# rows are present (grouped onto compact rows so the chart fits the page).
|
||||
assert "Distintos · % · únicos" in labels
|
||||
assert "Total filas (dataset)" in labels
|
||||
assert "Valores únicos (frecuencia 1)" in labels
|
||||
assert any("Entropía" in lbl for lbl in labels)
|
||||
assert "únicos" in values and "%" in values
|
||||
assert "bits" in values and "norm" in values # entropy + max + normalized.
|
||||
# Top-k table + pie figure.
|
||||
dt = next(b for b in ch.blocks if isinstance(b, DataTable))
|
||||
dt = next(b for b in flat if isinstance(b, DataTable))
|
||||
assert dt.header == ["Valor", "Conteo", "%"]
|
||||
assert any("neumaticos" in str(cell) for row in dt.rows for cell in row)
|
||||
assert any(isinstance(b, Figure) for b in ch.blocks)
|
||||
# id-like column flagged with a Note.
|
||||
assert any(isinstance(b, Note) and "identificador" in b.text
|
||||
for b in ch.blocks)
|
||||
assert any(isinstance(b, Figure) for b in flat)
|
||||
# id-like column flagged with a Note that also explains the top-k is dropped.
|
||||
idnote = next((b for b in flat
|
||||
if isinstance(b, Note) and "identificador" in b.text), None)
|
||||
assert idnote is not None
|
||||
assert "No se lista el top" in idnote.text
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_categoricas():
|
||||
def test_golden_idlike_omite_topk_y_conserva_donut():
|
||||
# The id-like column (uuid, 100% distinct) must NOT carry a top-k DataTable
|
||||
# (it would be a list of unique values), but must still keep its donut Figure
|
||||
# and its cardinality table so it stays a full per-column page.
|
||||
ch = build_cat_distr(_profile(), {})
|
||||
groups = _column_groups(ch)
|
||||
uuid_group = next(g for g in groups
|
||||
if any(getattr(b, "text", "") == "uuid" for b in g.blocks))
|
||||
kinds = [b.kind for b in uuid_group.blocks]
|
||||
assert "data_table" not in kinds # top-k of unique values dropped.
|
||||
assert "kv_table" in kinds # cardinality kept.
|
||||
assert "figure" in kinds # donut kept (chart per column).
|
||||
# A non-id-like column keeps its top-k table.
|
||||
cat_group = next(g for g in groups
|
||||
if any(getattr(b, "text", "") == "categoria"
|
||||
for b in g.blocks))
|
||||
assert "data_table" in [b.kind for b in cat_group.blocks]
|
||||
|
||||
|
||||
def test_golden_una_pagina_por_columna_groups():
|
||||
ch = build_cat_distr(_profile(), {})
|
||||
groups = _column_groups(ch)
|
||||
# Two categorical columns -> two column Groups (numeric column excluded).
|
||||
assert len(groups) == 2
|
||||
# Each Group carries one column: a heading + its cardinality table + figure.
|
||||
for g in groups:
|
||||
kinds = [b.kind for b in g.blocks]
|
||||
assert kinds[0] == "heading"
|
||||
assert "kv_table" in kinds
|
||||
assert "figure" in kinds
|
||||
# The first column may share the intro page (no forced break); every later
|
||||
# column starts on a fresh page/slide so each column gets its own page.
|
||||
assert groups[0].page_break_before is False
|
||||
assert all(g.page_break_before is True for g in groups[1:])
|
||||
|
||||
|
||||
def test_golden_entropia_clicable_y_definicion_en_glosario():
|
||||
# With a glossary collector the intro marks the clickable term and the FULL
|
||||
# definition (the long explanation removed from the intro) lands in the
|
||||
# glossary, not inline — no data lost, just relocated.
|
||||
gc = GlossaryCollector()
|
||||
ch = build_cat_distr(_profile(), {"glossary": gc})
|
||||
md = next(b for b in ch.blocks if isinstance(b, Markdown))
|
||||
assert "[[term:entropia]]entropía[[/term]]" in md.text
|
||||
assert gc.has("entropia")
|
||||
entry = gc.get("entropia")
|
||||
assert entry is not None
|
||||
# The definition kept in the glossary still carries the detail removed inline.
|
||||
assert "log2" in entry["definition"]
|
||||
assert "normalizada" in entry["definition"].lower()
|
||||
|
||||
|
||||
def test_golden_render_pdf_una_pagina_por_columna():
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
out = os.path.join(d, "eda.pdf")
|
||||
res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
|
||||
assert res["path"] == out and os.path.exists(out)
|
||||
assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
|
||||
cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID)
|
||||
# Two categorical columns, each on its own page -> >= 2 pages for the
|
||||
# chapter (intro shares the first column's page).
|
||||
assert cat_meta["n_pages"] >= 2
|
||||
txt = _pdf_text(out)
|
||||
assert "Entrop" in txt
|
||||
assert "distintos" in txt
|
||||
@@ -133,13 +214,91 @@ def test_golden_render_pptx_muestra_categoricas():
|
||||
out = os.path.join(d, "eda.pptx")
|
||||
res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
|
||||
assert res["path"] == out and os.path.exists(out)
|
||||
assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
|
||||
cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID)
|
||||
assert cat_meta["n_slides"] >= 2 # one slide per categorical column.
|
||||
txt = _pptx_text(out)
|
||||
assert "Entrop" in txt
|
||||
assert "categoria" in txt and "neumaticos" in txt
|
||||
assert "distintos" in txt
|
||||
|
||||
|
||||
def _profile_high_card() -> dict:
|
||||
"""Profile with a high-cardinality NON-id-like categorical column whose top-k
|
||||
of long values would split from its donut on a short 16:9 slide unless the
|
||||
renderer trims the table — the exact case the adversarial check flagged
|
||||
(Ticket / Cabin)."""
|
||||
long_vals = [f"Valor largo de categoria numero {i:02d} con texto extra"
|
||||
for i in range(40)]
|
||||
top = [{"value": v, "count": 60 - i, "pct": (60 - i) / 5000.0}
|
||||
for i, v in enumerate(long_vals)]
|
||||
return {
|
||||
"table": "t", "source": "t.csv", "n_rows": 5000, "n_cols": 3,
|
||||
"quality_score": 80.0,
|
||||
"columns": [
|
||||
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
|
||||
"std": 0.5}},
|
||||
# 40 distinct over 5000 rows = 0.8% distinct -> NOT id-like, keeps
|
||||
# its (long) top-k table; the tall table must not push the donut off.
|
||||
{"name": "alta_card_col", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "distinct_count": 40,
|
||||
"categorical": {"top": top, "mode": long_vals[0], "n_distinct": 40,
|
||||
"entropy": 5.2, "imbalance": 1.2, "len_min": 40,
|
||||
"len_mean": 45, "len_max": 50}},
|
||||
{"name": "baja_card_col", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "distinct_count": 4,
|
||||
"categorical": {
|
||||
"top": [{"value": "norte", "count": 2000, "pct": 0.4},
|
||||
{"value": "sur", "count": 1500, "pct": 0.3},
|
||||
{"value": "este", "count": 1000, "pct": 0.2},
|
||||
{"value": "oeste", "count": 500, "pct": 0.1}],
|
||||
"mode": "norte", "n_distinct": 4, "entropy": 1.8}},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def test_golden_pptx_una_slide_por_columna_con_su_grafico():
|
||||
"""Each categorical column occupies EXACTLY ONE cat_distr slide that carries
|
||||
BOTH its cardinality table and its donut figure (picture) — i.e. the chart is
|
||||
never separated from its table, even for a high-cardinality column."""
|
||||
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||
|
||||
prof = _profile_high_card()
|
||||
cat_names = ["alta_card_col", "baja_card_col"]
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
out = os.path.join(d, "eda.pptx")
|
||||
res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
|
||||
assert res["path"] == out and os.path.exists(out)
|
||||
prs = Presentation(out)
|
||||
|
||||
# Per column: the cat_distr slides whose text mentions it, and whether the
|
||||
# owning slide also has the donut caption + an actual picture shape.
|
||||
slides_with_col = {n: [] for n in cat_names}
|
||||
owner_has_chart = {n: False for n in cat_names}
|
||||
for i, sl in enumerate(prs.slides):
|
||||
texts, has_pic = [], False
|
||||
for sh in sl.shapes:
|
||||
if sh.has_text_frame:
|
||||
texts.append(sh.text_frame.text)
|
||||
if sh.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
||||
has_pic = True
|
||||
txt = re.sub(r"\s+", " ", " ".join(texts))
|
||||
if "Distribuciones categ" not in txt: # footer stamp of the chapter.
|
||||
continue
|
||||
for n in cat_names:
|
||||
if n in txt:
|
||||
slides_with_col[n].append(i)
|
||||
has_table = "Cardinalidad" in txt or "distintos" in txt
|
||||
if has_pic and "donut" in txt and has_table:
|
||||
owner_has_chart[n] = True
|
||||
|
||||
for n in cat_names:
|
||||
# Exactly one slide carries the column (not split across slides).
|
||||
assert len(slides_with_col[n]) == 1, (n, slides_with_col[n])
|
||||
# That single slide also holds its table AND its donut picture.
|
||||
assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide")
|
||||
|
||||
|
||||
def test_edge_sin_categoricas_devuelve_none():
|
||||
only_numeric = {
|
||||
"n_rows": 10, "columns": [
|
||||
@@ -170,11 +329,15 @@ def test_anti_corte_label_largo_y_muchas_columnas():
|
||||
|
||||
ch = build_cat_distr(profile, {})
|
||||
assert ch is not None
|
||||
# One Group per column, each forcing its own page (except the first).
|
||||
groups = _column_groups(ch)
|
||||
assert len(groups) == 30
|
||||
assert sum(1 for g in groups if g.page_break_before) == 29
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
pdf = os.path.join(d, "anti.pdf")
|
||||
res = render_automatic_eda_pdf(profile, pdf, {"write_manifest": False})
|
||||
assert res["path"] == pdf
|
||||
assert res["n_pages"] > 1 # many columns spilled across pages, OK.
|
||||
assert res["n_pages"] > 1 # one page per column, OK.
|
||||
txt = _pdf_text(pdf)
|
||||
# Long label wrapped (not truncated): every word survives.
|
||||
for word in ("Lorem", "incididunt", "reprehenderit", "voluptate"):
|
||||
|
||||
@@ -31,7 +31,7 @@ import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "correlacion"
|
||||
CHAPTER_TITLE = "Correlación"
|
||||
|
||||
@@ -47,6 +47,13 @@ _MAX_MATRIX_LABELS = 16
|
||||
# How many pairs to show in each of the top-positive / top-negative tables.
|
||||
_TOP_N = 10
|
||||
|
||||
# How many of the strongest numeric-numeric pairs to draw as scatter plots on
|
||||
# each sign (positive / negative). A scatter per pair carries a fitted line/curve
|
||||
# and a relationship-type label; keeping the count small keeps the chapter
|
||||
# readable on a phone / a slide. Only signed (Pearson/Spearman) pairs qualify —
|
||||
# Cramér's V / correlation ratio pairs are not numeric-numeric, so no scatter.
|
||||
_SCATTER_TOP_N = 3
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance in the
|
||||
# body — the canonical two-step pattern (see ``cat_distr`` for the reference
|
||||
@@ -314,6 +321,139 @@ def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
def _is_seq(values) -> bool:
|
||||
"""True for a non-empty list/tuple of values (a raw numeric column)."""
|
||||
return isinstance(values, (list, tuple)) and len(values) > 0
|
||||
|
||||
|
||||
def _select_scatter_pairs(pairs: list, top_n: int = _SCATTER_TOP_N):
|
||||
"""Pick the strongest numeric-numeric pairs to draw as scatters.
|
||||
|
||||
Only signed (Pearson/Spearman) pairs are numeric-numeric and thus eligible
|
||||
for a scatter with a fitted curve. Returns up to ``top_n`` of the strongest
|
||||
positive pairs followed by up to ``top_n`` of the strongest negative ones,
|
||||
each ranked by magnitude. Mixed-type metrics (Cramér's V, correlation ratio,
|
||||
mutual information) are excluded — they have no x/y scatter interpretation.
|
||||
"""
|
||||
positive = []
|
||||
negative = []
|
||||
for pair in pairs:
|
||||
if not isinstance(pair, dict) or not _is_signed(pair):
|
||||
continue
|
||||
value = pair.get("value")
|
||||
if not _is_num(value):
|
||||
continue
|
||||
if value > 0:
|
||||
positive.append(pair)
|
||||
elif value < 0:
|
||||
negative.append(pair)
|
||||
positive.sort(key=lambda p: abs(float(p.get("value", 0.0))), reverse=True)
|
||||
negative.sort(key=lambda p: abs(float(p.get("value", 0.0))), reverse=True)
|
||||
return positive[:top_n] + negative[:top_n]
|
||||
|
||||
|
||||
def _classification_note(a: str, b: str, cls: dict) -> str:
|
||||
"""Human-readable sentence describing the relationship of a pair.
|
||||
|
||||
Plain text (not baked into the figure image) so the type label is selectable
|
||||
in the PDF / extractable by pdftotext, and sits right next to its scatter
|
||||
inside the keep-together Group.
|
||||
"""
|
||||
tipo = model._safe_str(cls.get("tipo")) or "sin forma clara"
|
||||
bits = []
|
||||
pearson = cls.get("pearson")
|
||||
spearman = cls.get("spearman")
|
||||
r2_lin = cls.get("r2_linear")
|
||||
r2_poly = None
|
||||
for key in ("r2_poly2", "r2_poly3"):
|
||||
v = cls.get(key)
|
||||
if _is_num(v) and (r2_poly is None or float(v) > r2_poly):
|
||||
r2_poly = float(v)
|
||||
if _is_num(pearson):
|
||||
bits.append(f"Pearson r={float(pearson):+.2f}")
|
||||
if _is_num(spearman):
|
||||
bits.append(f"Spearman ρ={float(spearman):+.2f}")
|
||||
if _is_num(r2_lin):
|
||||
bits.append(f"R² lineal={float(r2_lin):.2f}")
|
||||
if r2_poly is not None:
|
||||
bits.append(f"R² polinómico={r2_poly:.2f}")
|
||||
metrics = "; ".join(bits)
|
||||
text = (f"Relación **{tipo}** entre «{a}» y «{b}»."
|
||||
+ (f" {metrics}." if metrics else ""))
|
||||
return text
|
||||
|
||||
|
||||
def _scatter_blocks(pairs: list, raw_numeric):
|
||||
"""Build keep-together scatter Groups for the strongest num-num pairs.
|
||||
|
||||
Returns a list of blocks (a Heading plus one Group per pair), or an empty
|
||||
list when there is no raw numeric data (e.g. the lite profile drops
|
||||
``ctx['raw_numeric']`` to skip live recomputation) or the relationship
|
||||
helpers are unavailable. Never raises: any failure degrades to no scatters,
|
||||
leaving the matrix + tables intact.
|
||||
"""
|
||||
if not isinstance(raw_numeric, dict) or not raw_numeric:
|
||||
return []
|
||||
selected = _select_scatter_pairs(pairs)
|
||||
if not selected:
|
||||
return []
|
||||
|
||||
# The relationship helpers live in the datascience package. Import lazily so
|
||||
# the chapter still builds (matrix + tables) when they are absent.
|
||||
try:
|
||||
from datascience.classify_relationship_type import (
|
||||
classify_relationship_type,
|
||||
)
|
||||
from datascience.relationship_scatter_figure import (
|
||||
relationship_scatter_figure,
|
||||
)
|
||||
except Exception: # noqa: BLE001 — degrade, never break the chapter.
|
||||
return []
|
||||
|
||||
groups = []
|
||||
for pair in selected:
|
||||
a = pair.get("a")
|
||||
b = pair.get("b")
|
||||
xs = raw_numeric.get(a)
|
||||
ys = raw_numeric.get(b)
|
||||
# Edge: a selected pair has no raw column (aggregated profile, renamed
|
||||
# column, …) — skip just that pair, keep the rest.
|
||||
if not _is_seq(xs) or not _is_seq(ys):
|
||||
continue
|
||||
try:
|
||||
cls = classify_relationship_type(list(xs), list(ys)) or {}
|
||||
except Exception: # noqa: BLE001
|
||||
continue
|
||||
a_lbl = model._safe_str(a)
|
||||
b_lbl = model._safe_str(b)
|
||||
|
||||
def _make(xs=xs, ys=ys, a_lbl=a_lbl, b_lbl=b_lbl, cls=cls):
|
||||
return relationship_scatter_figure(
|
||||
list(xs), list(ys), x_label=a_lbl, y_label=b_lbl,
|
||||
classification=cls)
|
||||
|
||||
groups.append(model.Group(blocks=[
|
||||
model.Heading(text=f"{a_lbl} ↔ {b_lbl}", level=2),
|
||||
model.Figure(
|
||||
make=_make,
|
||||
caption=(f"Dispersión de «{a_lbl}» frente a «{b_lbl}» con la "
|
||||
"curva de ajuste del mejor modelo.")),
|
||||
model.Markdown(text=_classification_note(a_lbl, b_lbl, cls)),
|
||||
]))
|
||||
|
||||
if not groups:
|
||||
return []
|
||||
intro = model.Markdown(text=(
|
||||
"Para los pares numéricos más fuertes (positivos y negativos) se dibuja "
|
||||
"la nube de puntos con su ajuste y se clasifica el **tipo de relación**: "
|
||||
"**lineal** (una recta basta), **polinómica** (curva de grado 2/3 que "
|
||||
"mejora claramente el ajuste lineal), **monótona no-lineal** (crece o "
|
||||
"decrece siempre pero no en línea recta; Spearman ≫ Pearson) o "
|
||||
"**débil/sin forma**."))
|
||||
return [model.Heading(text="Relaciones más fuertes (scatter)", level=2),
|
||||
intro] + groups
|
||||
|
||||
|
||||
def build_correlacion(profile: dict, ctx: dict):
|
||||
"""Build the Correlation Chapter, or None if there are no pairs to show.
|
||||
|
||||
@@ -392,6 +532,18 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
"No se han hallado correlaciones negativas significativas entre "
|
||||
"columnas numéricas.")))
|
||||
|
||||
# 2.5) Scatter plots of the strongest numeric-numeric pairs, each with its
|
||||
# fitted curve and a relationship-type label (lineal / polinómica / monótona
|
||||
# / débil). Needs the raw numeric sample (ctx['raw_numeric'], row-aligned);
|
||||
# when it is absent (aggregated/lite profile) the scatters are simply omitted
|
||||
# and the matrix + tables above stand on their own.
|
||||
raw_numeric = None
|
||||
if isinstance(ctx, dict):
|
||||
raw_numeric = ctx.get("raw_numeric") or profile.get("raw_numeric")
|
||||
else:
|
||||
raw_numeric = profile.get("raw_numeric")
|
||||
blocks.extend(_scatter_blocks(pairs, raw_numeric))
|
||||
|
||||
# 3) Spuriousness caveat for level-based correlations (Granger–Newbold).
|
||||
caveat = corr.get("levels_caveat")
|
||||
if isinstance(caveat, str) and caveat.strip():
|
||||
|
||||
@@ -175,6 +175,105 @@ def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
|
||||
assert "azufre" in _pdf_text(pdf)
|
||||
|
||||
|
||||
def _raw_numeric_for_profile(n: int = 80) -> dict:
|
||||
"""Row-aligned raw numeric sample matching the signed pairs of _profile().
|
||||
|
||||
Builds columns with a clear, deterministic shape so the relationship-type
|
||||
classifier has something unambiguous to label:
|
||||
- density vs alcohol: strong negative linear (the top-negative pair).
|
||||
- alcohol vs quality: positive linear.
|
||||
- ph, fixed_acidity, sulphates: filler columns for the remaining pairs.
|
||||
"""
|
||||
import math as _m
|
||||
|
||||
alcohol = [8.0 + 0.05 * i for i in range(n)]
|
||||
density = [1.0 - 0.002 * a for a in alcohol] # neg linear vs alcohol
|
||||
quality = [3.0 + 0.4 * a + (0.1 if i % 2 else -0.1) # pos linear vs alcohol
|
||||
for i, a in enumerate(alcohol)]
|
||||
ph = [3.0 + 0.3 * _m.sin(i / 5.0) for i in range(n)]
|
||||
fixed_acidity = [7.0 - 0.5 * p for p in ph] # neg linear vs ph
|
||||
sulphates = [0.5 + 0.01 * (i % 7) for i in range(n)]
|
||||
return {
|
||||
"alcohol": alcohol, "density": density, "quality": quality,
|
||||
"ph": ph, "fixed_acidity": fixed_acidity, "sulphates": sulphates,
|
||||
}
|
||||
|
||||
|
||||
def test_golden_scatters_de_pares_num_num_con_tipo_de_relacion():
|
||||
"""Con ctx['raw_numeric'], el capítulo añade scatters (Figure dentro de Group)
|
||||
de los pares num-num más fuertes, cada uno con su etiqueta de tipo en texto."""
|
||||
from datascience.automatic_eda.model import Group
|
||||
|
||||
ctx = {"raw_numeric": _raw_numeric_for_profile()}
|
||||
ch = build_correlacion(_profile(), ctx)
|
||||
assert ch is not None
|
||||
groups = [b for b in ch.blocks if isinstance(b, Group)]
|
||||
assert groups, "debe emitir al menos un Group con scatter"
|
||||
# Cada Group lleva su figura (lazy) y una nota de texto con el tipo.
|
||||
for g in groups:
|
||||
gkinds = [b.kind for b in g.blocks]
|
||||
assert "figure" in gkinds and "markdown" in gkinds
|
||||
# La sección y la etiqueta de tipo aparecen como texto plano (extraíble).
|
||||
headings = " ".join(b.text for b in ch.blocks if b.kind == "heading")
|
||||
assert "Relaciones más fuertes" in headings
|
||||
body = " ".join(b.text for g in groups for b in g.blocks
|
||||
if b.kind == "markdown")
|
||||
assert any(t in body for t in
|
||||
("lineal", "polinómica", "monótona", "sin forma"))
|
||||
# El par num-num más fuerte (density ↔ alcohol) tiene scatter; el par cat-cat
|
||||
# (region ↔ type) NO — no es numérico.
|
||||
assert "density" in body or "alcohol" in body
|
||||
assert "region" not in body and "type" not in body
|
||||
|
||||
|
||||
def test_golden_pdf_muestra_scatters_con_etiqueta_de_tipo():
|
||||
"""En el PDF, el capítulo Correlación incluye los scatters y su etiqueta de
|
||||
tipo en texto seleccionable (pdftotext la encuentra)."""
|
||||
prof = _profile()
|
||||
ctx = {"raw_numeric": _raw_numeric_for_profile()}
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
pdf = os.path.join(d, "corr_scatter.pdf")
|
||||
rp = render_automatic_eda_pdf(prof, pdf, {"title": "EDA — wine",
|
||||
"ctx": ctx})
|
||||
assert rp["path"] == pdf and rp["n_pages"] >= 1
|
||||
txt = _pdf_text(pdf)
|
||||
assert "Relaciones" in txt and "scatter" in txt.lower()
|
||||
# Alguna etiqueta de tipo de relación, en texto.
|
||||
assert any(t in txt for t in
|
||||
("lineal", "polin", "monóton", "monoton", "sin forma"))
|
||||
|
||||
|
||||
def test_edge_sin_raw_numeric_omite_scatters_sin_lanzar():
|
||||
"""profile lite / ctx None: sin raw_numeric el capítulo omite los scatters
|
||||
pero sigue emitiendo matriz + tablas (no lanza)."""
|
||||
from datascience.automatic_eda.model import Group
|
||||
|
||||
for ctx in (None, {}, {"raw_numeric": None}, {"raw_numeric": {}}):
|
||||
ch = build_correlacion(_profile(), ctx)
|
||||
assert ch is not None
|
||||
assert not [b for b in ch.blocks if isinstance(b, Group)]
|
||||
# La matriz y al menos una tabla top siguen presentes.
|
||||
assert any(b.kind == "figure" for b in ch.blocks)
|
||||
assert any(b.kind == "data_table" for b in ch.blocks)
|
||||
|
||||
|
||||
def test_edge_par_sin_columna_cruda_se_omite_sin_lanzar():
|
||||
"""Si un par seleccionado no tiene su columna en raw_numeric, se omite ese
|
||||
par (no lanza); los demás scatters se construyen igual."""
|
||||
from datascience.automatic_eda.model import Group
|
||||
|
||||
raw = _raw_numeric_for_profile()
|
||||
raw.pop("density", None) # rompe el par density ↔ alcohol
|
||||
ch = build_correlacion(_profile(), {"raw_numeric": raw})
|
||||
assert ch is not None
|
||||
groups = [b for b in ch.blocks if isinstance(b, Group)]
|
||||
body = " ".join(b.text for g in groups for b in g.blocks
|
||||
if b.kind == "markdown")
|
||||
# density desaparece de los scatters; otros pares (p.ej. ph↔fixed_acidity,
|
||||
# alcohol↔quality) pueden seguir presentes sin error.
|
||||
assert "density" not in body
|
||||
|
||||
|
||||
def test_glosario_engancha_metodos_y_fdr():
|
||||
"""Mejora 4b: los métodos de correlación (Pearson, Spearman, Cramér's V,
|
||||
razón de correlación) y la corrección por comparaciones múltiples (FDR) se
|
||||
|
||||
@@ -139,10 +139,17 @@ class Group:
|
||||
it starts on a fresh page and flows (honest degradation, never cut). Use it to
|
||||
bind ``Heading`` + ``Markdown`` + ``Figure`` of one idea together (see the
|
||||
DISTR NUM / AGREGACION chapters).
|
||||
|
||||
When ``page_break_before`` is True the renderer additionally forces the group
|
||||
to *start* on a fresh page/slide (unless the current one is already empty), so
|
||||
a chapter can give each unit its own page — e.g. one categorical column per
|
||||
page (see CAT DISTR). It is purely additive: the default False keeps the plain
|
||||
keep-together behaviour for every existing chapter.
|
||||
"""
|
||||
|
||||
blocks: list = field(default_factory=list)
|
||||
title: Optional[str] = None
|
||||
page_break_before: bool = False
|
||||
kind: str = field(default="group", init=False)
|
||||
|
||||
|
||||
@@ -228,7 +235,9 @@ def as_block(obj: Any):
|
||||
return Note(text=_safe_str(obj.get("text")))
|
||||
if cls is Group:
|
||||
return Group(blocks=as_blocks(obj.get("blocks")),
|
||||
title=obj.get("title"))
|
||||
title=obj.get("title"),
|
||||
page_break_before=bool(
|
||||
obj.get("page_break_before", False)))
|
||||
if cls is GlossaryEntry:
|
||||
return GlossaryEntry(key=_safe_str(obj.get("key")),
|
||||
label=_safe_str(obj.get("label")),
|
||||
|
||||
@@ -0,0 +1,458 @@
|
||||
"""AutomaticEDA Markdown serializer — one self-contained file to paste to an LLM.
|
||||
|
||||
Same document model as the PDF/PPTX renderers (an ordered list of
|
||||
:class:`Chapter`, each a list of format-independent blocks) but emitted as plain
|
||||
**Markdown** instead of a binary. The goal is different from the other two
|
||||
renderers: a Markdown EDA is meant to be *pasted into an LLM*, so it prioritises
|
||||
TEXT and DATA over visuals. Tables become Markdown tables (every row dumped, no
|
||||
pagination — nothing is cut because there are no pages); a ``Figure`` becomes its
|
||||
caption plus, when possible, the underlying bar/histogram data as a Markdown
|
||||
table (an LLM cannot see the image); glossary term markers are stripped while
|
||||
``**bold**`` is kept (it is valid Markdown).
|
||||
|
||||
dict-no-throw (the ``eda`` group style): :func:`render_md` never raises. On a
|
||||
fatal error it returns ``{path: None, ...}`` with a ``note`` explaining why; a
|
||||
malformed block degrades to a readable note rather than crashing the document.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from . import model
|
||||
|
||||
# Glossary span markers (kept text, dropped markers). We intentionally do NOT use
|
||||
# ``text_layout.strip_inline_md`` for Markdown blocks because that also removes
|
||||
# ``**bold**`` — valid Markdown we want to preserve when pasting to an LLM.
|
||||
_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")
|
||||
_MAX_BAR_ROWS = 100
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Small helpers.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _clean_terms(s) -> str:
|
||||
"""Drop glossary term markers, keeping the visible text (and any **bold**)."""
|
||||
s = model._safe_str(s)
|
||||
s = _TERM_OPEN_RE.sub("", s)
|
||||
return s.replace("[[/term]]", "")
|
||||
|
||||
|
||||
def _cell(v) -> str:
|
||||
"""Render a value as a safe Markdown table cell.
|
||||
|
||||
Escapes pipes (``|`` -> ``\\|``) so they do not break the column layout and
|
||||
folds newlines to ``<br>`` so a multi-line value stays inside one cell. None
|
||||
becomes an empty string.
|
||||
"""
|
||||
s = model._safe_str(v)
|
||||
s = s.replace("|", "\\|")
|
||||
s = s.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")
|
||||
return s
|
||||
|
||||
|
||||
def _slug(text: str) -> str:
|
||||
"""GitHub-style heading anchor: lowercase, spaces->'-', drop other symbols."""
|
||||
s = model._safe_str(text).strip().lower()
|
||||
out = []
|
||||
for ch in s:
|
||||
if ch.isalnum():
|
||||
out.append(ch)
|
||||
elif ch in " -":
|
||||
out.append("-")
|
||||
# any other symbol is dropped.
|
||||
slug = "".join(out)
|
||||
while "--" in slug:
|
||||
slug = slug.replace("--", "-")
|
||||
return slug.strip("-")
|
||||
|
||||
|
||||
def _fmt_num(v) -> str:
|
||||
"""Compact number for the figure data tables (ints as ints, else 4 sig figs)."""
|
||||
try:
|
||||
f = float(v)
|
||||
except Exception: # noqa: BLE001
|
||||
return model._safe_str(v)
|
||||
if f != f: # NaN
|
||||
return "NaN"
|
||||
if f == int(f) and abs(f) < 1e15:
|
||||
return str(int(f))
|
||||
return f"{f:.4g}"
|
||||
|
||||
|
||||
def _fmt_int(v) -> str:
|
||||
try:
|
||||
return str(int(v))
|
||||
except Exception: # noqa: BLE001
|
||||
return model._safe_str(v)
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
from datetime import datetime, timezone
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Document header (title + metadata blockquote + numbered index).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _meta_block(meta: dict) -> list:
|
||||
"""Build the metadata lines for the header blockquote (omitting absentees)."""
|
||||
ctx = meta.get("ctx") if isinstance(meta.get("ctx"), dict) else {}
|
||||
lines: list = []
|
||||
|
||||
def add(label, value) -> None:
|
||||
if value is None:
|
||||
return
|
||||
s = model._safe_str(value).strip()
|
||||
if s and s.lower() != "none":
|
||||
lines.append(f"**{label}:** {s}")
|
||||
|
||||
add("Dataset", ctx.get("dataset_name") or meta.get("dataset_name"))
|
||||
add("Fuente", ctx.get("source_origin") or meta.get("source_origin"))
|
||||
add("Almacenamiento", ctx.get("storage") or meta.get("storage"))
|
||||
n_rows = ctx.get("n_rows", meta.get("n_rows"))
|
||||
n_cols = ctx.get("n_cols", meta.get("n_cols"))
|
||||
if n_rows is not None and n_cols is not None:
|
||||
lines.append(
|
||||
f"**Dimensiones:** {_fmt_int(n_rows)} filas × {_fmt_int(n_cols)} columnas")
|
||||
add("Generado", meta.get("generated_at") or _now_iso())
|
||||
lines.append(f"**Motor:** {model.ENGINE_NAME} v{model.ENGINE_VERSION}")
|
||||
return lines
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Per-block serializers. Each returns a Markdown string (no surrounding blanks;
|
||||
# the caller separates blocks with a blank line).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _md_heading(block) -> str:
|
||||
level = int(getattr(block, "level", 1) or 1)
|
||||
hashes = "#" * min(level + 2, 6) # level1 -> ###; '#'/'##' reserved for doc/chapter.
|
||||
text = _clean_terms(getattr(block, "text", "")).strip()
|
||||
return f"{hashes} {text}"
|
||||
|
||||
|
||||
def _md_markdown(block) -> str:
|
||||
# Keep the text verbatim, dropping only glossary markers (keep **bold**).
|
||||
return _clean_terms(getattr(block, "text", "")).rstrip("\n")
|
||||
|
||||
|
||||
def _md_kv_table(block) -> str:
|
||||
lines: list = []
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
lines.append(f"**{_clean_terms(title).strip()}**")
|
||||
lines.append("")
|
||||
lines.append("| Campo | Valor |")
|
||||
lines.append("| --- | --- |")
|
||||
for row in (getattr(block, "rows", []) or []):
|
||||
try:
|
||||
label, value = row[0], row[1]
|
||||
except Exception: # noqa: BLE001
|
||||
label, value = row, ""
|
||||
lines.append(f"| {_cell(label)} | {_cell(value)} |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _md_data_table(block) -> str:
|
||||
lines: list = []
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
lines.append(f"**{_clean_terms(title).strip()}**")
|
||||
lines.append("")
|
||||
header = list(getattr(block, "header", []) or [])
|
||||
rows = list(getattr(block, "rows", []) or [])
|
||||
if not header:
|
||||
ncol = max((len(r) for r in rows), default=1)
|
||||
header = [f"col{i + 1}" for i in range(ncol)]
|
||||
ncol = len(header)
|
||||
lines.append("| " + " | ".join(_cell(h) for h in header) + " |")
|
||||
lines.append("| " + " | ".join(["---"] * ncol) + " |")
|
||||
for r in rows: # dump every row — no pagination, nothing cut.
|
||||
cells = [_cell(r[c]) if c < len(r) else "" for c in range(ncol)]
|
||||
lines.append("| " + " | ".join(cells) + " |")
|
||||
note = getattr(block, "note", None)
|
||||
if note:
|
||||
lines.append("")
|
||||
lines.append(f"*{_clean_terms(note).strip()}*")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _bars_table(bars: list) -> str:
|
||||
"""Render extracted bar/histogram data as a Markdown table (Desde/Hasta/Frec)."""
|
||||
lines = ["| Desde | Hasta | Frecuencia |", "| --- | --- | --- |"]
|
||||
shown = bars[:_MAX_BAR_ROWS]
|
||||
for x0, x1, h in shown:
|
||||
lines.append(f"| {_fmt_num(x0)} | {_fmt_num(x1)} | {_fmt_num(h)} |")
|
||||
out = "\n".join(lines)
|
||||
extra = len(bars) - len(shown)
|
||||
if extra > 0:
|
||||
out += f"\n\n*… ({extra} filas más)*"
|
||||
return out
|
||||
|
||||
|
||||
def _extract_bars(fig) -> list:
|
||||
"""Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig.
|
||||
|
||||
Histogram / bar-chart bars are ``matplotlib.patches.Rectangle`` with positive
|
||||
width and height; spines, legends and zero-area artists are skipped. Never
|
||||
raises — returns ``[]`` on any problem.
|
||||
"""
|
||||
bars: list = []
|
||||
try:
|
||||
for ax in fig.get_axes():
|
||||
# Collect this axes' positive-area rectangles, then keep only the ones
|
||||
# that look like actual histogram/bar bins. Reference shapes that
|
||||
# matplotlib also stores in ``ax.patches`` — most notably the ``±1σ``
|
||||
# band drawn by ``axvspan`` (a single rectangle far wider than a bin)
|
||||
# and a lone Tukey boxplot box — would otherwise show up as fake
|
||||
# "bins". A histogram axes has several near-equal-width bars, so we
|
||||
# drop any rectangle whose width is more than twice the median width
|
||||
# of that axes' rectangles (the σ-band spans many bins; uniform bins
|
||||
# all sit at the median width and stay).
|
||||
ax_bars: list = []
|
||||
for patch in list(getattr(ax, "patches", []) or []):
|
||||
try:
|
||||
w = patch.get_width()
|
||||
h = patch.get_height()
|
||||
x = patch.get_x()
|
||||
except Exception: # noqa: BLE001 — not a Rectangle-like patch.
|
||||
continue
|
||||
if w and w > 0 and h and h > 0:
|
||||
ax_bars.append((x, x + w, h))
|
||||
if len(ax_bars) >= 3:
|
||||
widths = sorted(b[1] - b[0] for b in ax_bars)
|
||||
median_w = widths[len(widths) // 2]
|
||||
if median_w > 0:
|
||||
ax_bars = [b for b in ax_bars
|
||||
if (b[1] - b[0]) <= 2.0 * median_w]
|
||||
bars.extend(ax_bars)
|
||||
except Exception: # noqa: BLE001
|
||||
return []
|
||||
return bars
|
||||
|
||||
|
||||
def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
|
||||
"""Serialize a Figure prioritising TEXT + DATA (an LLM cannot see the image).
|
||||
|
||||
Emits the caption, then — if the matplotlib figure has bars — a Markdown table
|
||||
of the underlying (Desde, Hasta, Frecuencia) values. Optionally (when
|
||||
``meta['embed_figures']`` is True) also exports a PNG beside the .md and adds
|
||||
an image link; off by default so the Markdown stays self-contained.
|
||||
"""
|
||||
caption = model._safe_str(getattr(block, "caption", "")).strip()
|
||||
parts = [f"*Figura: {caption}*" if caption else "*Figura*"]
|
||||
fig = None
|
||||
try:
|
||||
import matplotlib
|
||||
matplotlib.use("Agg") # defensive: headless rasterization backend.
|
||||
fig = getattr(block, "fig", None)
|
||||
make = getattr(block, "make", None)
|
||||
if fig is None and callable(make):
|
||||
fig = make()
|
||||
if fig is not None:
|
||||
bars = _extract_bars(fig)
|
||||
if bars:
|
||||
parts.append(_bars_table(bars))
|
||||
if meta.get("embed_figures"):
|
||||
png = _embed_png(fig, out_path, counter)
|
||||
if png:
|
||||
parts.append(f"")
|
||||
except Exception: # noqa: BLE001 — a bad figure degrades to just its caption.
|
||||
pass
|
||||
finally:
|
||||
if fig is not None:
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
plt.close(fig)
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _embed_png(fig, out_path: str, counter: list) -> str:
|
||||
"""Export the figure to ``<basename>_figN.png`` beside the .md; return its name."""
|
||||
try:
|
||||
counter[0] += 1
|
||||
base = os.path.splitext(os.path.basename(out_path))[0] or "figura"
|
||||
name = f"{base}_fig{counter[0]}.png"
|
||||
path = os.path.join(os.path.dirname(os.path.abspath(out_path)), name)
|
||||
fig.savefig(path, format="png", dpi=120, bbox_inches="tight")
|
||||
return name
|
||||
except Exception: # noqa: BLE001
|
||||
return ""
|
||||
|
||||
|
||||
def _md_image(block) -> str:
|
||||
path = model._safe_str(getattr(block, "path", ""))
|
||||
caption = model._safe_str(getattr(block, "caption", "")).strip()
|
||||
out = f""
|
||||
if caption:
|
||||
out += f"\n\n*{caption}*"
|
||||
return out
|
||||
|
||||
|
||||
def _md_caption(block) -> str:
|
||||
return f"*{_clean_terms(getattr(block, 'text', '')).strip()}*"
|
||||
|
||||
|
||||
def _md_note(block) -> str:
|
||||
text = _clean_terms(getattr(block, "text", "")).strip()
|
||||
lines = text.split("\n")
|
||||
return "\n".join((f"> {ln}" if ln.strip() else ">") for ln in lines)
|
||||
|
||||
|
||||
def _md_group(block, meta: dict, out_path: str, counter: list) -> str:
|
||||
parts: list = []
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
parts.append(f"### {_clean_terms(title).strip()}")
|
||||
for b in (getattr(block, "blocks", []) or []):
|
||||
try:
|
||||
seg = _serialize_block(b, meta, out_path, counter)
|
||||
except Exception: # noqa: BLE001
|
||||
seg = ""
|
||||
if seg:
|
||||
parts.append(seg)
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _md_glossary_entry(block) -> str:
|
||||
label = (model._safe_str(getattr(block, "label", "")).strip()
|
||||
or model._safe_str(getattr(block, "key", "")).strip())
|
||||
definition = _clean_terms(getattr(block, "definition", "")).strip()
|
||||
out = f"### {label}"
|
||||
if definition:
|
||||
out += f"\n\n{definition}"
|
||||
return out
|
||||
|
||||
|
||||
def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
|
||||
"""Dispatch a single block to its Markdown serializer. Unknown -> note."""
|
||||
kind = getattr(block, "kind", "")
|
||||
if kind == "heading":
|
||||
return _md_heading(block)
|
||||
if kind == "markdown":
|
||||
return _md_markdown(block)
|
||||
if kind == "kv_table":
|
||||
return _md_kv_table(block)
|
||||
if kind == "data_table":
|
||||
return _md_data_table(block)
|
||||
if kind == "figure":
|
||||
return _md_figure(block, meta, out_path, counter)
|
||||
if kind == "image":
|
||||
return _md_image(block)
|
||||
if kind == "caption":
|
||||
return _md_caption(block)
|
||||
if kind == "note":
|
||||
return _md_note(block)
|
||||
if kind == "group":
|
||||
return _md_group(block, meta, out_path, counter)
|
||||
if kind == "glossary_entry":
|
||||
return _md_glossary_entry(block)
|
||||
# Unknown content -> readable note (mirrors the model's defensive coercion).
|
||||
return _md_note(model.Note(text=model._safe_str(block)))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
|
||||
"""Serialize a list of Chapters into a single self-contained Markdown file.
|
||||
|
||||
The output leads with ``# <title>``, a metadata blockquote and a numbered
|
||||
``## Índice`` linking each chapter, then one ``## N. <title>`` section per
|
||||
chapter with its blocks. Tables become Markdown tables (every row dumped),
|
||||
figures become caption + underlying data table, glossary markers are stripped
|
||||
while ``**bold**`` is kept. Designed to be pasted into an LLM.
|
||||
|
||||
Args:
|
||||
chapters: a list of ``Chapter`` (dataclasses or dicts); normalized
|
||||
defensively with ``model.as_chapters``.
|
||||
out_path: filesystem path for the ``.md`` (parent dirs are created).
|
||||
meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
|
||||
``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
|
||||
``generated_at``, ``embed_figures`` (export PNGs beside the .md,
|
||||
default False).
|
||||
|
||||
Returns:
|
||||
dict (never raises): ``{path: str|None, n_chars: int,
|
||||
chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is
|
||||
None and ``note`` explains why.
|
||||
"""
|
||||
meta = meta or {}
|
||||
chapters = model.as_chapters(chapters)
|
||||
title = model._safe_str(meta.get("title")) or model.ENGINE_NAME
|
||||
|
||||
# Edge: nothing to render -> a minimal but valid Markdown document.
|
||||
if not chapters:
|
||||
content = (f"# {title}\n\n"
|
||||
"*(documento vacío — sin capítulos aplicables)*\n")
|
||||
return _write(out_path, content, [], "documento vacío")
|
||||
|
||||
counter = [0] # document-wide figure counter for unique PNG names.
|
||||
notes: list = []
|
||||
segments: list = [f"# {title}"]
|
||||
|
||||
meta_lines = _meta_block(meta)
|
||||
if meta_lines:
|
||||
segments.append("\n".join(f"> {ln}" for ln in meta_lines))
|
||||
|
||||
# Numbered index. The anchor matches the chapter heading emitted below
|
||||
# (``## N. <title>``) in GitHub slug style.
|
||||
chap_heads = []
|
||||
idx_lines = ["## Índice"]
|
||||
for i, ch in enumerate(chapters, 1):
|
||||
head_text = f"{i}. {model._safe_str(ch.title)}"
|
||||
anchor = _slug(head_text)
|
||||
chap_heads.append((head_text, anchor))
|
||||
idx_lines.append(f"{i}. [{model._safe_str(ch.title)}](#{anchor})")
|
||||
segments.append("\n".join(idx_lines))
|
||||
|
||||
chapters_meta = []
|
||||
for i, ch in enumerate(chapters, 1):
|
||||
segments.append("---")
|
||||
head_text, _anchor = chap_heads[i - 1]
|
||||
segments.append(f"## {head_text}")
|
||||
|
||||
blocks = list(ch.blocks or [])
|
||||
# Omit a leading level-1 Heading that just repeats the chapter title.
|
||||
if blocks:
|
||||
b0 = blocks[0]
|
||||
if (getattr(b0, "kind", "") == "heading"
|
||||
and int(getattr(b0, "level", 1) or 1) == 1
|
||||
and _clean_terms(getattr(b0, "text", "")).strip()
|
||||
== model._safe_str(ch.title).strip()):
|
||||
blocks = blocks[1:]
|
||||
|
||||
for block in blocks:
|
||||
try:
|
||||
seg = _serialize_block(block, meta, out_path, counter)
|
||||
except Exception as e: # noqa: BLE001
|
||||
seg = _md_note(model.Note(text=model._safe_str(block)))
|
||||
notes.append(
|
||||
f"bloque '{getattr(block, 'kind', '?')}' del capítulo "
|
||||
f"'{ch.id}' degradado: {e}")
|
||||
if seg:
|
||||
segments.append(seg)
|
||||
chapters_meta.append({"id": ch.id, "version": ch.version})
|
||||
|
||||
content = "\n\n".join(segments) + "\n"
|
||||
note = f"{len(content)} caracteres"
|
||||
if notes:
|
||||
note += " · " + "; ".join(notes)
|
||||
return _write(out_path, content, chapters_meta, note)
|
||||
|
||||
|
||||
def _write(out_path: str, content: str, chapters_meta: list, note: str) -> dict:
|
||||
"""Write the Markdown to disk (creating parents). dict-no-throw."""
|
||||
try:
|
||||
parent = os.path.dirname(os.path.abspath(out_path))
|
||||
os.makedirs(parent, exist_ok=True)
|
||||
with open(out_path, "w", encoding="utf-8") as fh:
|
||||
fh.write(content)
|
||||
except Exception as e: # noqa: BLE001 — never raise from the writer.
|
||||
return {"path": None, "n_chars": 0, "chapters": [],
|
||||
"note": f"no se pudo escribir el Markdown: {e}"}
|
||||
return {"path": out_path, "n_chars": len(content),
|
||||
"chapters": chapters_meta, "note": note}
|
||||
@@ -675,6 +675,61 @@ def _measure_figure_like(block) -> float:
|
||||
return target_h + 0.04 + cap_h + _GAP
|
||||
|
||||
|
||||
def _measure_kv_table(block) -> float:
|
||||
"""Faithful height of a KVTable — matches ``_place_kv_table``.
|
||||
|
||||
Counts the optional title heading and, per row, the wrapped VALUE column
|
||||
(the label column never wraps in the placer). The previous estimate assumed
|
||||
one line per row and ignored the title, so a column's keep-together Group
|
||||
under-budgeted the figure and the chart spilled to the next page. Keep this in
|
||||
sync with ``_place_kv_table``."""
|
||||
h = 0.0
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
h += _measure_heading_text(title, 2)
|
||||
rows = getattr(block, "rows", []) or []
|
||||
key_w = 1.9
|
||||
val_chars = tl.chars_per_line(_USABLE_W - key_w - 0.1, _FS_BODY)
|
||||
lh = tl.line_height_in(_FS_BODY)
|
||||
for row in rows:
|
||||
try:
|
||||
value = row[1]
|
||||
except Exception: # noqa: BLE001
|
||||
value = ""
|
||||
v_lines = tl.wrap(model._safe_str(value), val_chars)
|
||||
h += lh * len(v_lines) + _ROW_VPAD
|
||||
return h + _GAP
|
||||
|
||||
|
||||
def _measure_data_table(block) -> float:
|
||||
"""Faithful height of a DataTable — matches ``_place_data_table``.
|
||||
|
||||
Counts the optional title heading, the wrapped header row, every wrapped data
|
||||
row (per-column wrap via the same ``_col_widths``/``_wrap_row`` the placer
|
||||
uses) and the optional note. Keep this in sync with ``_place_data_table``."""
|
||||
h = 0.0
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
h += _measure_heading_text(title, 2)
|
||||
header = list(getattr(block, "header", []) or [])
|
||||
rows = list(getattr(block, "rows", []) or [])
|
||||
fs = _FS_CELL
|
||||
widths = _col_widths(header, rows, fs)
|
||||
lh = tl.line_height_in(fs)
|
||||
if header:
|
||||
header_lines = _wrap_row(header, widths, fs)
|
||||
h += lh * max((len(c) for c in header_lines), default=1) + _ROW_VPAD * 2
|
||||
for r in rows:
|
||||
cells_lines = _wrap_row(r, widths, fs)
|
||||
h += lh * max((len(c) for c in cells_lines), default=1) + _ROW_VPAD * 2
|
||||
note = getattr(block, "note", None)
|
||||
if note:
|
||||
nlines = tl.wrap(model._safe_str(note),
|
||||
tl.chars_per_line(_USABLE_W, _FS_NOTE))
|
||||
h += tl.line_height_in(_FS_NOTE) * len(nlines)
|
||||
return h + _GAP
|
||||
|
||||
|
||||
def _measure_block(st: _PdfState, block) -> float:
|
||||
kind = getattr(block, "kind", "")
|
||||
try:
|
||||
@@ -690,13 +745,9 @@ def _measure_block(st: _PdfState, block) -> float:
|
||||
tl.chars_per_line(_USABLE_W, _FS_NOTE))
|
||||
return tl.line_height_in(_FS_NOTE) * len(lines) + _GAP
|
||||
if kind == "kv_table":
|
||||
rows = getattr(block, "rows", []) or []
|
||||
return (tl.line_height_in(_FS_BODY) + _ROW_VPAD) * (len(rows) + 1) \
|
||||
+ _GAP
|
||||
return _measure_kv_table(block)
|
||||
if kind == "data_table":
|
||||
rows = getattr(block, "rows", []) or []
|
||||
return (tl.line_height_in(_FS_CELL) + _ROW_VPAD * 2) \
|
||||
* (len(rows) + 1) + _GAP
|
||||
return _measure_data_table(block)
|
||||
if kind == "group":
|
||||
return sum(_measure_block(st, b)
|
||||
for b in (getattr(block, "blocks", []) or []))
|
||||
@@ -735,6 +786,10 @@ def _place_group(st: _PdfState, block) -> None:
|
||||
blocks = getattr(block, "blocks", []) or []
|
||||
if not blocks:
|
||||
return
|
||||
# Opt-in page break: start this group on a fresh page unless the current one
|
||||
# is still empty (so a chapter can give each unit its own page).
|
||||
if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
|
||||
_new_page(st)
|
||||
avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
|
||||
_shrink_group_figures(st, blocks, avail_full)
|
||||
total = sum(_measure_block(st, b) for b in blocks)
|
||||
|
||||
@@ -625,6 +625,55 @@ def _measure_figure_like(block) -> float:
|
||||
return target_h + 0.05 + cap_h + _GAP
|
||||
|
||||
|
||||
def _measure_kv_table(block) -> float:
|
||||
"""Faithful KVTable height — matches ``_place_kv_table`` (rendered as a
|
||||
Campo/Valor data table with wrapped cells). The previous estimate assumed one
|
||||
line per row and ignored the title, so a keep-together Group under-budgeted
|
||||
the figure and the chart spilled to the next slide. Keep in sync."""
|
||||
h = 0.0
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
h += _measure_heading_text(title, 2)
|
||||
rows = getattr(block, "rows", []) or []
|
||||
data_rows = []
|
||||
for row in rows:
|
||||
try:
|
||||
label, value = row[0], row[1]
|
||||
except Exception: # noqa: BLE001
|
||||
label, value = str(row), ""
|
||||
data_rows.append([model._safe_str(label), model._safe_str(value)])
|
||||
header = ["Campo", "Valor"]
|
||||
widths = _col_widths(header, data_rows)
|
||||
fs = _FS_CELL
|
||||
h += _row_height_in(header, widths, fs)
|
||||
for r in data_rows:
|
||||
h += _row_height_in(r, widths, fs)
|
||||
return h + _GAP
|
||||
|
||||
|
||||
def _measure_data_table(block) -> float:
|
||||
"""Faithful DataTable height — matches ``_place_data_table`` (title heading +
|
||||
wrapped header + every wrapped row + optional note). Keep in sync."""
|
||||
h = 0.0
|
||||
title = getattr(block, "title", None)
|
||||
if title:
|
||||
h += _measure_heading_text(title, 2)
|
||||
header = list(getattr(block, "header", []) or [])
|
||||
rows = list(getattr(block, "rows", []) or [])
|
||||
fs = _FS_CELL
|
||||
widths = _col_widths(header, rows)
|
||||
if header:
|
||||
h += _row_height_in(header, widths, fs)
|
||||
for r in rows:
|
||||
h += _row_height_in(r, widths, fs)
|
||||
note = getattr(block, "note", None)
|
||||
if note:
|
||||
nlines = tl.wrap(model._safe_str(note),
|
||||
tl.chars_per_line(_USABLE_W, _FS_NOTE))
|
||||
h += tl.line_height_in(_FS_NOTE) * len(nlines) + 0.05
|
||||
return h + _GAP
|
||||
|
||||
|
||||
def _measure_block(st: _PptxState, block) -> float:
|
||||
kind = getattr(block, "kind", "")
|
||||
try:
|
||||
@@ -639,9 +688,10 @@ def _measure_block(st: _PptxState, block) -> float:
|
||||
lines = tl.wrap(getattr(block, "text", ""),
|
||||
tl.chars_per_line(_USABLE_W, _FS_NOTE))
|
||||
return tl.line_height_in(_FS_NOTE) * len(lines) + 0.05 + _GAP
|
||||
if kind in ("kv_table", "data_table"):
|
||||
rows = getattr(block, "rows", []) or []
|
||||
return (tl.line_height_in(_FS_CELL) + 0.10) * (len(rows) + 1) + _GAP
|
||||
if kind == "kv_table":
|
||||
return _measure_kv_table(block)
|
||||
if kind == "data_table":
|
||||
return _measure_data_table(block)
|
||||
if kind == "group":
|
||||
return sum(_measure_block(st, b)
|
||||
for b in (getattr(block, "blocks", []) or []))
|
||||
@@ -664,10 +714,14 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
|
||||
if getattr(b, "kind", "") not in ("figure", "image"))
|
||||
fig_overhead = tl.line_height_in(_FS_NOTE) + 0.05 + 0.05 + _GAP
|
||||
budget = avail_full - nonfig_h - 0.10 * len(fig_blocks)
|
||||
if budget <= 1.0:
|
||||
# Low thresholds: a 16:9 slide is short, so a content-heavy column (cardinality
|
||||
# table + top-k + chart) only fits if the chart is allowed to shrink small.
|
||||
# Prefer a small-but-present chart on the SAME slide over splitting the column
|
||||
# across slides (matches the PDF renderer's keep-together philosophy).
|
||||
if budget <= 0.6:
|
||||
return # not enough room to keep together; let it flow (degrade).
|
||||
per = budget / len(fig_blocks) - fig_overhead
|
||||
if per <= 0.8:
|
||||
if per <= 0.35:
|
||||
return
|
||||
for fb in fig_blocks:
|
||||
cur = getattr(fb, "height_in", None)
|
||||
@@ -675,12 +729,90 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
|
||||
if isinstance(cur, (int, float)) and cur > 0 else per)
|
||||
|
||||
|
||||
# Minimum height (inches) reserved for a figure inside a keep-together group on
|
||||
# the short 16:9 slide. When a high-cardinality column's table(s) would otherwise
|
||||
# leave no room, the data table is trimmed (with an honest note) so the chart
|
||||
# stays on the SAME slide next to its table instead of spilling to the next one.
|
||||
_GROUP_MIN_FIG_H = 1.3
|
||||
|
||||
|
||||
def _trim_data_table_to_budget(block, budget: float):
|
||||
"""Return a copy of a DataTable whose rows fit within ``budget`` inches.
|
||||
|
||||
Keeps the title, header, as many leading rows as fit (at least one) and an
|
||||
honest note reporting how many of the original rows are shown. NEVER mutates
|
||||
the original block — the same Chapter blocks are rendered by the PDF renderer,
|
||||
which keeps the full table (an A5 page fits it)."""
|
||||
header = list(getattr(block, "header", []) or [])
|
||||
rows = list(getattr(block, "rows", []) or [])
|
||||
title = getattr(block, "title", None)
|
||||
fs = _FS_CELL
|
||||
widths = _col_widths(header, rows)
|
||||
fixed = 0.0
|
||||
if title:
|
||||
fixed += _measure_heading_text(title, 2)
|
||||
if header:
|
||||
fixed += _row_height_in(header, widths, fs)
|
||||
note_h = tl.line_height_in(_FS_NOTE) + 0.05
|
||||
avail_rows = budget - fixed - note_h - _GAP
|
||||
kept = []
|
||||
used = 0.0
|
||||
for r in rows:
|
||||
rh = _row_height_in(r, widths, fs)
|
||||
if used + rh > avail_rows and kept:
|
||||
break
|
||||
kept.append(r)
|
||||
used += rh
|
||||
if len(kept) >= len(rows):
|
||||
return block # already fits; keep the original (with its own note).
|
||||
note = (f"top {len(kept)} de {len(rows)} categorías mostradas "
|
||||
"(recortado para caber en el slide; el PDF muestra más)")
|
||||
return model.DataTable(header=header, rows=kept, title=title, note=note)
|
||||
|
||||
|
||||
def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list:
|
||||
"""Return a slide-fitting copy of a keep-together group's blocks.
|
||||
|
||||
On the short 16:9 slide a high-cardinality column's top-k table plus its
|
||||
chart can overflow. Reserve ``_GROUP_MIN_FIG_H`` for the (later shrunk) figure
|
||||
and trim the data table(s) to what is left, so every column keeps its chart
|
||||
next to its table on ONE slide. No-op when the group has no figure+table pair
|
||||
(e.g. id-like columns already drop the top-k upstream, or it already fits)."""
|
||||
has_fig = any(getattr(b, "kind", "") in ("figure", "image") for b in blocks)
|
||||
tbls = [b for b in blocks if getattr(b, "kind", "") == "data_table"]
|
||||
if not (has_fig and tbls):
|
||||
return blocks
|
||||
fixed_h = sum(_measure_block(st, b) for b in blocks
|
||||
if getattr(b, "kind", "") not in ("figure", "image",
|
||||
"data_table"))
|
||||
tables_h = sum(_measure_block(st, b) for b in tbls)
|
||||
budget_tables = avail_full - fixed_h - _GROUP_MIN_FIG_H
|
||||
if tables_h <= budget_tables:
|
||||
return blocks # already fits next to a min-height figure; leave intact.
|
||||
out = []
|
||||
for b in blocks:
|
||||
if getattr(b, "kind", "") != "data_table":
|
||||
out.append(b)
|
||||
continue
|
||||
trimmed = _trim_data_table_to_budget(b, max(budget_tables, 0.8))
|
||||
out.append(trimmed)
|
||||
budget_tables -= _measure_data_table(trimmed)
|
||||
return out
|
||||
|
||||
|
||||
def _place_group(st: _PptxState, block) -> None:
|
||||
"""Render a keep-together Group: move it whole to the next slide if needed."""
|
||||
blocks = getattr(block, "blocks", []) or []
|
||||
if not blocks:
|
||||
return
|
||||
# Opt-in slide break: start this group on a fresh slide unless the current one
|
||||
# is still empty (so a chapter can give each unit its own slide).
|
||||
if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
|
||||
_new_slide(st, cont=True)
|
||||
avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
|
||||
# Trim oversized tables first (keeps the chart on the same slide), then shrink
|
||||
# the figure to share the remaining room.
|
||||
blocks = _fit_group_blocks(st, blocks, avail_full)
|
||||
_shrink_group_figures(st, blocks, avail_full)
|
||||
total = sum(_measure_block(st, b) for b in blocks)
|
||||
if total <= avail_full:
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
---
|
||||
name: classify_relationship_type
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def classify_relationship_type(xs: list, ys: list) -> dict"
|
||||
description: "Clasifica el TIPO de relacion entre dos variables numericas pareadas por indice para el EDA automatico del grupo eda. Limpia los pares de forma defensiva (descarta None/bool/NaN/inf), reusa pearson y spearman_corr del registry y ajusta polinomios de grado 2 y 3 con numpy.polyfit (R^2 manual), y a partir de esas senales etiqueta la forma: 'lineal', 'polinomica (grado 2/3)', 'monotona no-lineal' o 'debil/sin forma'. Orden de decision: debil -> monotona -> polinomica -> lineal (la primera que matchea gana), con umbrales calibrados para datos reales discretos/ruidosos. Devuelve ademas los coeficientes del mejor modelo en orden de numpy.polyval para pintar la curva de ajuste sobre el scatter. Funcion pura no-throw: ante datos insuficientes (menos de 5 pares validos o varianza ~0) o cualquier fallo devuelve el dict canonico con tipo='debil/sin forma' y el resto a None."
|
||||
tags: [eda, correlation, relationship, classification, polyfit, datascience, pure]
|
||||
params:
|
||||
- name: xs
|
||||
desc: "Lista (o tupla) de valores numericos de la primera variable, pareada por indice con ys. Cada par xs[i],ys[i] se descarta si cualquiera de los dos es None, bool, NaN o inf. Lectura defensiva."
|
||||
- name: ys
|
||||
desc: "Lista (o tupla) de valores numericos de la segunda variable, pareada por indice con xs. Mismas reglas de limpieza que xs."
|
||||
output: "Dict con SIEMPRE las mismas 8 claves: tipo (str: 'lineal' | 'polinómica (grado 2)' | 'polinómica (grado 3)' | 'monótona no-lineal' | 'débil/sin forma'); pearson (float|None: coeficiente de Pearson r); r2_linear (float|None: r**2 del ajuste lineal); spearman (float|None: rho de Spearman); r2_poly2 (float|None: R^2 del ajuste polinomico de grado 2); r2_poly3 (float|None: R^2 del ajuste de grado 3); best_degree (int|None: grado del modelo elegido — 1 lineal, 2/3 polinomico, None si monotona/debil); coeffs (list|None: coeficientes del mejor modelo en orden de numpy.polyval para pintar la curva, o None). Ante datos insuficientes o error: tipo='débil/sin forma' y el resto de claves a None."
|
||||
uses_functions: [pearson_py_datascience, spearman_corr_py_datascience]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [numpy]
|
||||
tested: true
|
||||
tests: ["test_lineal", "test_polinomica_cuadratica", "test_monotona_no_lineal", "test_monotona_exponencial", "test_debil_sin_forma", "test_lista_vacia_no_lanza", "test_longitudes_distintas_no_lanza", "test_todos_none_no_lanza", "test_entradas_none_no_lanza", "test_constante_no_lanza", "test_filtra_nan_inf_bool"]
|
||||
test_file_path: "python/functions/datascience/classify_relationship_type_test.py"
|
||||
file_path: "python/functions/datascience/classify_relationship_type.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.classify_relationship_type import classify_relationship_type
|
||||
import numpy as np
|
||||
|
||||
# Relacion claramente cuadratica (forma de parabola) sobre dominio simetrico.
|
||||
x = list(np.linspace(-10, 10, 60))
|
||||
y = [v * v for v in x]
|
||||
|
||||
res = classify_relationship_type(x, y)
|
||||
print(res["tipo"]) # 'polinómica (grado 2)'
|
||||
print(res["best_degree"]) # 2
|
||||
print(res["r2_linear"]) # 0.0 -> el Pearson lineal no ve la parabola
|
||||
print(res["r2_poly2"]) # 1.0
|
||||
print(res["coeffs"]) # [1.0, -0.0, -0.0] -> numpy.polyval(coeffs, x) ~ x**2
|
||||
|
||||
# El capitulo pinta la curva de ajuste cuando coeffs no es None:
|
||||
# if res["coeffs"] is not None:
|
||||
# xs_fit = np.linspace(min(x), max(x), 200)
|
||||
# ys_fit = np.polyval(res["coeffs"], xs_fit)
|
||||
# ax.plot(xs_fit, ys_fit) # curva sobre el ax.scatter(x, y)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
- Usala en el capitulo de relaciones/correlaciones del EDA automatico, despues de detectar dos columnas numericas con alguna asociacion, para decidir QUE curva de ajuste pintar sobre el scatter (recta, parabola, cubica o ninguna) y poner una etiqueta legible al tipo de relacion.
|
||||
- Cuando un Pearson bajo no signifique "sin relacion": esta funcion cruza Pearson con Spearman y con ajustes polinomicos para distinguir una relacion lineal debil de una monotona no-lineal (que el rango si capta) o de una curva polinomica.
|
||||
- Cuando necesites un punto de entrada determinista y no-throw que, con los mismos datos, devuelva siempre el mismo `tipo` y los mismos `coeffs` listos para `numpy.polyval` sin tener que ajustar modelos a mano en el capitulo.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Funcion pura, deterministica y no-throw: ante menos de 5 pares validos, varianza ~0 (xs o ys constante) o cualquier excepcion interna devuelve el dict canonico `tipo="débil/sin forma"` con el resto de claves a `None`. El dict SIEMPRE trae las 8 claves: nunca compruebes existencia, comprueba `None`.
|
||||
- El orden de decision importa: `débil -> monótona -> polinómica -> lineal` (la primera que matchee gana). La monotonia se evalua ANTES que el ajuste polinomico, asi que una curva monotona suave (exp, log, potencias) sale `monótona no-lineal` aunque un cubico tambien la ajuste — la dominancia del rango (Spearman >> Pearson) es la senal mas interpretable. Solo cae en `polinómica` una forma curva NO monotona (p.ej. una parabola, Spearman ~0 pero R^2 polinomico alto).
|
||||
- Umbrales fijos (calibrados para EDA con datos discretos/ruidosos, no para inferencia formal): `débil/sin forma` si las tres senales son bajas a la vez (`abs(pearson) < 0.3` y `abs(spearman) < 0.3` y `mejor_poly < 0.3`); `monótona no-lineal` si `abs(spearman) - abs(pearson) >= 0.1` y `abs(spearman) >= 0.4`; `polinómica (grado N)` si el mejor polinomico mejora `>= 0.1` sobre el lineal y su R^2 `>= 0.3`; en cualquier otro caso con senal (no debil) `lineal`. El suelo de 0.3 evita llamar "debil" a relaciones reales pero discretas (conteos, escalas ordinales) con R^2 bajo pero direccion clara.
|
||||
- `coeffs` va en orden de `numpy.polyval` (grado descendente). Para `lineal` es `[pendiente, intercepto]` (grado 1); para `polinómica` los del grado elegido; para `monótona no-lineal` y `débil/sin forma` es `None` (el scatter pintara una curva suavizada o nada — lo decide el capitulo, no esta funcion).
|
||||
- `best_degree` prefiere el grado 2 sobre el 3 cuando empatan dentro de 0.02 de R^2 (parsimonia): no esperes grado 3 salvo que mejore claramente.
|
||||
- Los pares con `None`, `bool`, `NaN` o `inf` se descartan por indice en silencio; `bool` cuenta como no-numerico (un `True` no es `1`). El dominio de los datos afecta al resultado: una parabola sobre un dominio simetrico da Pearson ~0 (sale `polinómica`), pero sobre un dominio asimetrico el Pearson sube y puede salir `lineal`.
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Clasifica el TIPO de relacion entre dos variables numericas pareadas.
|
||||
|
||||
Funcion pura del grupo eda. Dadas dos listas numericas pareadas por indice,
|
||||
limpia los pares de forma defensiva, calcula correlaciones lineal (Pearson) y de
|
||||
rangos (Spearman) y ajustes polinomicos de grado 2 y 3, y a partir de esas
|
||||
senales etiqueta la forma de la relacion para el EDA automatico:
|
||||
|
||||
"lineal" | "polinómica (grado 2)" | "polinómica (grado 3)" |
|
||||
"monótona no-lineal" | "débil/sin forma"
|
||||
|
||||
Ademas devuelve los coeficientes del mejor modelo (en orden de numpy.polyval)
|
||||
para que el capitulo pinte la curva de ajuste sobre el scatter. Reusa las
|
||||
funciones del registry `pearson` y `spearman_corr` en vez de reimplementarlas.
|
||||
|
||||
NUNCA lanza: ante cualquier fallo o dato insuficiente devuelve el dict canonico
|
||||
con tipo="débil/sin forma" y el resto de claves a None.
|
||||
"""
|
||||
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from datascience.datascience import pearson
|
||||
from datascience.spearman_corr import spearman_corr
|
||||
|
||||
# Forma canonica de la respuesta cuando no se puede clasificar (datos
|
||||
# insuficientes, varianza nula o error interno). Siempre las mismas claves.
|
||||
_WEAK = {
|
||||
"tipo": "débil/sin forma",
|
||||
"pearson": None,
|
||||
"r2_linear": None,
|
||||
"spearman": None,
|
||||
"r2_poly2": None,
|
||||
"r2_poly3": None,
|
||||
"best_degree": None,
|
||||
"coeffs": None,
|
||||
}
|
||||
|
||||
|
||||
def _is_num(v) -> bool:
|
||||
"""True si v es un numero real finito (int/float, no bool, no NaN, no inf)."""
|
||||
return (
|
||||
isinstance(v, (int, float))
|
||||
and not isinstance(v, bool)
|
||||
and not (isinstance(v, float) and (math.isnan(v) or math.isinf(v)))
|
||||
)
|
||||
|
||||
|
||||
def _poly_r2(coeffs, x_arr, y_arr, ss_tot: float) -> float:
|
||||
"""R^2 de un ajuste polinomico: 1 - SS_res/SS_tot. 0 si SS_tot==0."""
|
||||
if ss_tot == 0.0:
|
||||
return 0.0
|
||||
pred = np.polyval(coeffs, x_arr)
|
||||
ss_res = float(np.sum((y_arr - pred) ** 2))
|
||||
return 1.0 - ss_res / ss_tot
|
||||
|
||||
|
||||
def classify_relationship_type(xs: list, ys: list) -> dict:
|
||||
"""Clasifica el tipo de relacion entre dos variables numericas pareadas.
|
||||
|
||||
Empareja xs[i],ys[i] por indice y descarta el par si cualquiera de los dos
|
||||
es None, bool, NaN o inf. Sobre los pares limpios calcula Pearson r
|
||||
(r2_linear = r**2), Spearman rho y los R^2 de ajustes polinomicos de grado 2
|
||||
y 3 (con numpy.polyfit + R^2 manual). Con esas senales decide la etiqueta.
|
||||
|
||||
Orden de evaluacion de la etiqueta (la primera que matchee gana). Los
|
||||
umbrales estan calibrados para datos reales, a menudo discretos y ruidosos
|
||||
(conteos, escalas ordinales): una relacion con |r| >= 0.3, |rho| >= 0.3 o un
|
||||
polinomio con R^2 >= 0.3 ya tiene FORMA y no debe etiquetarse como "debil".
|
||||
1. "débil/sin forma" — todas las senales bajas a la vez:
|
||||
abs(pearson) < 0.3 y abs(spearman) < 0.3 y mejor_poly < 0.3.
|
||||
2. "monótona no-lineal" — el rango (Spearman) capta una monotonia que el
|
||||
Pearson lineal no: abs(spearman) - abs(pearson) >= 0.1 y
|
||||
abs(spearman) >= 0.4. No se fuerza un polinomio (coeffs/best_degree =
|
||||
None); el capitulo dibuja la tendencia ordenada sobre el scatter.
|
||||
3. "polinómica (grado N)" — el mejor polinomico mejora claramente sobre
|
||||
el lineal (mejor_poly - r2_linear >= 0.1) y mejor_poly >= 0.3. N es el
|
||||
grado (2 o 3) con mejor R^2, prefiriendo el 2 si empatan dentro de 0.02
|
||||
(parsimonia).
|
||||
4. "lineal" — el resto: hay senal (no es debil) y la forma que existe es
|
||||
esencialmente lineal. best_degree=1, coeffs del ajuste de grado 1.
|
||||
|
||||
Si hay menos de 5 pares validos, o la varianza de xs o de ys es ~0
|
||||
(constante), devuelve directamente "débil/sin forma".
|
||||
|
||||
Args:
|
||||
xs: lista (o tupla) de valores numericos de la primera variable,
|
||||
pareada por indice con ys. Pares con None/bool/NaN/inf se descartan.
|
||||
ys: lista (o tupla) de valores numericos de la segunda variable,
|
||||
pareada por indice con xs.
|
||||
|
||||
Returns:
|
||||
dict con SIEMPRE las mismas claves:
|
||||
tipo (str), pearson (float|None), r2_linear (float|None),
|
||||
spearman (float|None), r2_poly2 (float|None), r2_poly3 (float|None),
|
||||
best_degree (int|None: 1, 2, 3 o None),
|
||||
coeffs (list|None: coeficientes en orden de numpy.polyval, o None).
|
||||
Nunca lanza: ante fallo o datos insuficientes devuelve el dict debil.
|
||||
"""
|
||||
try:
|
||||
if xs is None or ys is None:
|
||||
return dict(_WEAK)
|
||||
|
||||
pairs = [
|
||||
(float(x), float(y))
|
||||
for x, y in zip(xs, ys)
|
||||
if _is_num(x) and _is_num(y)
|
||||
]
|
||||
|
||||
# Datos insuficientes para hablar de forma de la relacion.
|
||||
if len(pairs) < 5:
|
||||
return dict(_WEAK)
|
||||
|
||||
clean_x = [p[0] for p in pairs]
|
||||
clean_y = [p[1] for p in pairs]
|
||||
|
||||
# Varianza ~0 en cualquiera de las series => relacion indefinida.
|
||||
if len(set(clean_x)) < 2 or len(set(clean_y)) < 2:
|
||||
return dict(_WEAK)
|
||||
x_arr = np.asarray(clean_x, dtype=float)
|
||||
y_arr = np.asarray(clean_y, dtype=float)
|
||||
if float(np.var(x_arr)) < 1e-15 or float(np.var(y_arr)) < 1e-15:
|
||||
return dict(_WEAK)
|
||||
|
||||
# Correlaciones reutilizando las funciones del registry.
|
||||
r = pearson(clean_x, clean_y)
|
||||
spearman = spearman_corr(clean_x, clean_y)
|
||||
r2_linear = r ** 2
|
||||
|
||||
# Ajustes polinomicos grado 2 y 3 con R^2 manual.
|
||||
ss_tot = float(np.sum((y_arr - float(np.mean(y_arr))) ** 2))
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
c1 = np.polyfit(x_arr, y_arr, 1)
|
||||
c2 = np.polyfit(x_arr, y_arr, 2)
|
||||
c3 = np.polyfit(x_arr, y_arr, 3)
|
||||
r2_poly2 = _poly_r2(c2, x_arr, y_arr, ss_tot)
|
||||
r2_poly3 = _poly_r2(c3, x_arr, y_arr, ss_tot)
|
||||
|
||||
mejor_poly = max(r2_poly2, r2_poly3)
|
||||
# Grado del mejor polinomico, con preferencia por la parsimonia: solo se
|
||||
# elige el grado 3 si supera al grado 2 por mas de 0.02.
|
||||
best_poly_degree = 3 if (r2_poly3 - r2_poly2) > 0.02 else 2
|
||||
|
||||
abs_s = abs(spearman)
|
||||
abs_p = abs(r)
|
||||
|
||||
# Decision en orden: debil-temprano -> monotona -> polinomica -> lineal.
|
||||
if abs_p < 0.3 and abs_s < 0.3 and mejor_poly < 0.3:
|
||||
# Ninguna senal supera el suelo de forma: relacion debil/sin forma.
|
||||
tipo = "débil/sin forma"
|
||||
best_degree = None
|
||||
coeffs = None
|
||||
elif (abs_s - abs_p) >= 0.1 and abs_s >= 0.4:
|
||||
# Spearman (rango) capta una monotonia que el Pearson lineal no:
|
||||
# relacion monotona no-lineal. No se fuerza un polinomio que tal vez
|
||||
# no ajusta bien; el capitulo dibuja la tendencia ordenada.
|
||||
tipo = "monótona no-lineal"
|
||||
best_degree = None
|
||||
coeffs = None
|
||||
elif (mejor_poly - r2_linear) >= 0.1 and mejor_poly >= 0.3:
|
||||
tipo = "polinómica (grado {})".format(best_poly_degree)
|
||||
best_degree = best_poly_degree
|
||||
best_coeffs = c2 if best_poly_degree == 2 else c3
|
||||
coeffs = [float(c) for c in best_coeffs]
|
||||
else:
|
||||
# Hay senal (no es debil) y no es ni monotona-pura ni polinomica:
|
||||
# la correlacion que existe es esencialmente lineal.
|
||||
tipo = "lineal"
|
||||
best_degree = 1
|
||||
coeffs = [float(c) for c in c1]
|
||||
|
||||
return {
|
||||
"tipo": tipo,
|
||||
"pearson": round(float(r), 6),
|
||||
"r2_linear": round(float(r2_linear), 6),
|
||||
"spearman": round(float(spearman), 6),
|
||||
"r2_poly2": round(float(r2_poly2), 6),
|
||||
"r2_poly3": round(float(r2_poly3), 6),
|
||||
"best_degree": best_degree,
|
||||
"coeffs": (
|
||||
[round(c, 8) for c in coeffs] if coeffs is not None else None
|
||||
),
|
||||
}
|
||||
except Exception:
|
||||
return dict(_WEAK)
|
||||
@@ -0,0 +1,174 @@
|
||||
"""Tests para classify_relationship_type."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from classify_relationship_type import classify_relationship_type
|
||||
|
||||
# Claves que el dict de salida debe contener SIEMPRE.
|
||||
_EXPECTED_KEYS = {
|
||||
"tipo", "pearson", "r2_linear", "spearman",
|
||||
"r2_poly2", "r2_poly3", "best_degree", "coeffs",
|
||||
}
|
||||
|
||||
|
||||
def _assert_shape(r):
|
||||
"""Toda salida tiene exactamente las 8 claves canonicas."""
|
||||
assert isinstance(r, dict)
|
||||
assert set(r.keys()) == _EXPECTED_KEYS
|
||||
|
||||
|
||||
def test_lineal():
|
||||
"""Golden: y = 2x + 1 con ruido pequeno -> 'lineal', best_degree=1."""
|
||||
rng = np.random.default_rng(42)
|
||||
x = np.linspace(0.0, 10.0, 50)
|
||||
y = 2.0 * x + 1.0 + rng.normal(0.0, 0.3, 50)
|
||||
|
||||
r = classify_relationship_type(list(x), list(y))
|
||||
_assert_shape(r)
|
||||
|
||||
assert r["tipo"] == "lineal"
|
||||
assert r["best_degree"] == 1
|
||||
assert r["r2_linear"] >= 0.5
|
||||
# coeffs ~ [pendiente, intercepto] del ajuste de grado 1.
|
||||
assert r["coeffs"] is not None and len(r["coeffs"]) == 2
|
||||
assert abs(r["coeffs"][0] - 2.0) < 0.1 # pendiente ~2
|
||||
assert abs(r["coeffs"][1] - 1.0) < 0.3 # intercepto ~1
|
||||
|
||||
|
||||
def test_polinomica_cuadratica():
|
||||
"""Golden: y = x**2 sobre [-10, 10] -> 'polinómica', best_degree in (2, 3)."""
|
||||
x = np.linspace(-10.0, 10.0, 60)
|
||||
y = x ** 2
|
||||
|
||||
r = classify_relationship_type(list(x), list(y))
|
||||
_assert_shape(r)
|
||||
|
||||
assert r["tipo"].startswith("polinómica")
|
||||
assert r["best_degree"] in (2, 3)
|
||||
# Una parabola perfecta queda capturada por el grado 2 (parsimonia).
|
||||
assert r["best_degree"] == 2
|
||||
assert r["r2_poly2"] > 0.99
|
||||
assert r["coeffs"] is not None and len(r["coeffs"]) == r["best_degree"] + 1
|
||||
|
||||
|
||||
def test_monotona_no_lineal():
|
||||
"""Golden: monotona convexa de cola pesada -> 'monótona no-lineal'.
|
||||
|
||||
y = 1/(N+1-i)**2 es estrictamente creciente (Spearman ~ 1) pero su cola
|
||||
explosiva hace que ni la recta ni un polinomio de grado 2/3 la ajusten
|
||||
(R^2 polinomico < 0.5), de modo que el Pearson lineal NO capta la relacion
|
||||
que el rango (Spearman) si ve. Construccion deterministica (sin azar).
|
||||
"""
|
||||
n = 200
|
||||
i = np.arange(n, dtype=float)
|
||||
y = 1.0 / (n + 1 - i) ** 2
|
||||
|
||||
r = classify_relationship_type(list(i), list(y))
|
||||
_assert_shape(r)
|
||||
|
||||
assert r["tipo"] == "monótona no-lineal"
|
||||
assert r["best_degree"] is None
|
||||
assert r["coeffs"] is None
|
||||
# Spearman fuerte y claramente por encima del Pearson.
|
||||
assert abs(r["spearman"]) >= 0.5
|
||||
assert abs(r["spearman"]) - abs(r["pearson"]) >= 0.15
|
||||
|
||||
|
||||
def test_monotona_exponencial():
|
||||
"""DoD literal: y = exp(x) (monotona no-lineal) -> 'monótona no-lineal'.
|
||||
|
||||
exp es estrictamente creciente (Spearman = 1) pero el Pearson lineal queda
|
||||
claramente por debajo (~0.86), así que la dominancia del rango la marca como
|
||||
monótona no-lineal en vez de lineal o polinómica.
|
||||
"""
|
||||
x = np.linspace(0.0, 5.0, 80)
|
||||
y = np.exp(x)
|
||||
|
||||
r = classify_relationship_type(list(x), list(y))
|
||||
_assert_shape(r)
|
||||
|
||||
assert r["tipo"] == "monótona no-lineal"
|
||||
assert r["best_degree"] is None and r["coeffs"] is None
|
||||
assert abs(r["spearman"]) >= 0.9
|
||||
assert abs(r["spearman"]) - abs(r["pearson"]) >= 0.1
|
||||
|
||||
|
||||
def test_debil_sin_forma():
|
||||
"""Golden: x e y independientes (semilla fija) -> 'débil/sin forma'."""
|
||||
rng = np.random.default_rng(0)
|
||||
x = rng.normal(0.0, 1.0, 200)
|
||||
y = rng.normal(0.0, 1.0, 200)
|
||||
|
||||
r = classify_relationship_type(list(x), list(y))
|
||||
_assert_shape(r)
|
||||
|
||||
assert r["tipo"] == "débil/sin forma"
|
||||
assert r["best_degree"] is None
|
||||
assert r["coeffs"] is None
|
||||
# Todas las senales son bajas.
|
||||
assert abs(r["pearson"]) < 0.3
|
||||
assert r["r2_linear"] < 0.1
|
||||
|
||||
|
||||
def test_lista_vacia_no_lanza():
|
||||
"""Edge: listas vacias -> dict debil canonico, sin lanzar."""
|
||||
r = classify_relationship_type([], [])
|
||||
_assert_shape(r)
|
||||
assert r["tipo"] == "débil/sin forma"
|
||||
assert r["pearson"] is None
|
||||
assert r["r2_linear"] is None
|
||||
assert r["spearman"] is None
|
||||
assert r["r2_poly2"] is None
|
||||
assert r["r2_poly3"] is None
|
||||
assert r["best_degree"] is None
|
||||
assert r["coeffs"] is None
|
||||
|
||||
|
||||
def test_longitudes_distintas_no_lanza():
|
||||
"""Edge: listas de distinta longitud -> empareja por indice, no lanza."""
|
||||
# zip trunca a la longitud minima: solo 3 pares (< 5) -> debil.
|
||||
r = classify_relationship_type([1, 2, 3, 4, 5, 6, 7, 8], [1.0, 2.0, 3.0])
|
||||
_assert_shape(r)
|
||||
assert r["tipo"] == "débil/sin forma"
|
||||
assert r["best_degree"] is None
|
||||
|
||||
|
||||
def test_todos_none_no_lanza():
|
||||
"""Edge: todos los valores None -> ningun par valido -> debil, no lanza."""
|
||||
r = classify_relationship_type([None, None, None, None, None, None],
|
||||
[None, None, None, None, None, None])
|
||||
_assert_shape(r)
|
||||
assert r["tipo"] == "débil/sin forma"
|
||||
assert r["coeffs"] is None
|
||||
|
||||
|
||||
def test_entradas_none_no_lanza():
|
||||
"""Edge: xs/ys None directamente -> debil, no lanza."""
|
||||
assert classify_relationship_type(None, None)["tipo"] == "débil/sin forma"
|
||||
assert classify_relationship_type([1.0, 2.0], None)["tipo"] == "débil/sin forma"
|
||||
|
||||
|
||||
def test_constante_no_lanza():
|
||||
"""Edge: ys constante (varianza ~0) -> debil, no lanza."""
|
||||
r = classify_relationship_type([1, 2, 3, 4, 5, 6, 7], [5, 5, 5, 5, 5, 5, 5])
|
||||
_assert_shape(r)
|
||||
assert r["tipo"] == "débil/sin forma"
|
||||
|
||||
|
||||
def test_filtra_nan_inf_bool():
|
||||
"""Edge: pares con NaN/inf/bool/None se descartan por indice."""
|
||||
nan = float("nan")
|
||||
inf = float("inf")
|
||||
# Solo i=0,1,2,3,4 quedan validos (5 pares) y forman una recta perfecta.
|
||||
xs = [0.0, 1.0, 2.0, 3.0, 4.0, nan, inf, True, None]
|
||||
ys = [1.0, 3.0, 5.0, 7.0, 9.0, 1.0, 2.0, 3.0, 4.0]
|
||||
r = classify_relationship_type(xs, ys)
|
||||
_assert_shape(r)
|
||||
# Los 5 pares validos son y = 2x + 1 exacto -> lineal.
|
||||
assert r["tipo"] == "lineal"
|
||||
assert r["best_degree"] == 1
|
||||
@@ -0,0 +1,122 @@
|
||||
---
|
||||
id: relationship_scatter_figure_py_datascience
|
||||
name: relationship_scatter_figure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def relationship_scatter_figure(xs: list, ys: list, x_label: str = \"\", y_label: str = \"\", classification: dict = None, max_points: int = 2000) -> \"matplotlib.figure.Figure\""
|
||||
description: "Construye una figura matplotlib scatter de un par de variables numéricas con su curva/recta de ajuste y una anotación del tipo de relación (lineal, polinómica grado 2/3, monótona no-lineal, etc.) más sus métricas (r, ρ, R²lin, R²poly). Consume el dict de classify_relationship_type; si es None lo calcula internamente reusando esa función. Devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA (PDF/PPTX). Backend Agg sin pyplot global; downsample determinista de los puntos dibujados; defensivo ante vacío/None."
|
||||
tags: [eda, correlation, scatter, relationship, matplotlib, figure, visualization, datascience, impure]
|
||||
uses_functions: [classify_relationship_type_py_datascience]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [matplotlib, numpy]
|
||||
example: |
|
||||
from relationship_scatter_figure import relationship_scatter_figure
|
||||
xs = [float(i) for i in range(100)]
|
||||
ys = [0.5 * x * x - x + 3 for x in xs]
|
||||
classification = {
|
||||
"tipo": "polinómica (grado 2)", "pearson": 0.97, "spearman": 0.99,
|
||||
"r2_linear": 0.92, "r2_poly2": 0.999, "r2_poly3": 0.999,
|
||||
"best_degree": 2, "coeffs": [0.5, -1.0, 3.0],
|
||||
}
|
||||
fig = relationship_scatter_figure(xs, ys, x_label="dosis", y_label="efecto", classification=classification)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_returns_figure"
|
||||
- "test_downsample_determinista"
|
||||
- "test_empty_no_lanza"
|
||||
- "test_classification_none"
|
||||
test_file_path: "python/functions/datascience/relationship_scatter_figure_test.py"
|
||||
file_path: "python/functions/datascience/relationship_scatter_figure.py"
|
||||
params:
|
||||
- name: xs
|
||||
desc: "Lista (o tupla) de valores x. Se emparejan por índice con ys. Valores None, bool, NaN o inf descartan ese par (lectura defensiva)."
|
||||
- name: ys
|
||||
desc: "Lista (o tupla) de valores y, paralela a xs. Mismas reglas defensivas que xs."
|
||||
- name: x_label
|
||||
desc: "Etiqueta del eje/título para la variable x. Default \"\" (en el título cae a \"x\")."
|
||||
- name: y_label
|
||||
desc: "Etiqueta del eje/título para la variable y. Default \"\" (en el título cae a \"y\")."
|
||||
- name: classification
|
||||
desc: "Opcional. Dict de classify_relationship_type con claves tipo, pearson, r2_linear, spearman, r2_poly2, r2_poly3, best_degree, coeffs. Si es None se calcula internamente importando y llamando a classify_relationship_type sobre los pares limpios (self-contained). Si el módulo hermano no está disponible, se dibuja el scatter sin curva de ajuste ni anotación. Default None."
|
||||
- name: max_points
|
||||
desc: "Tope del nº de puntos DIBUJADOS. Si los pares limpios superan el tope, la nube se submuestrea por paso fijo ceil(n/max_points) tomando pairs[::step] — DETERMINISTA, no aleatorio, reproducible. La clasificación/ajuste usa SIEMPRE todos los pares limpios; el downsample solo adelgaza el dibujo. Valor no-positivo o no-int desactiva el downsample. Default 2000."
|
||||
output: "Un matplotlib.figure.Figure (figsize 6.4x4.0, dpi 150) con un Axes scatter (puntos semitransparentes alpha 0.5, color #4C72B0), la curva/recta de ajuste (numpy.polyval sobre coeffs, color #C44E52) cuando hay un ajuste polinómico disponible, título \"{x_label} ↔ {y_label}\", labels de ejes y una caja de anotación en la esquina superior izquierda con el tipo de relación y las métricas disponibles (r, ρ, R²lin, R²poly; se omiten las None). Si tras la limpieza hay menos de 2 pares válidos, devuelve igualmente una Figure con un texto centrado \"Sin datos suficientes para el scatter\" (nunca lanza). El caller rasteriza/cierra la figura; la función no la muestra ni la guarda."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from relationship_scatter_figure import relationship_scatter_figure
|
||||
|
||||
# Par numérico con relación cuadrática y su clasificación (de
|
||||
# classify_relationship_type). Pasándola explícita evitas recomputarla.
|
||||
xs = [float(i) for i in range(100)]
|
||||
ys = [0.5 * x * x - x + 3 for x in xs]
|
||||
classification = {
|
||||
"tipo": "polinómica (grado 2)",
|
||||
"pearson": 0.97,
|
||||
"spearman": 0.99,
|
||||
"r2_linear": 0.92,
|
||||
"r2_poly2": 0.999,
|
||||
"r2_poly3": 0.999,
|
||||
"best_degree": 2,
|
||||
"coeffs": [0.5, -1.0, 3.0],
|
||||
}
|
||||
|
||||
fig = relationship_scatter_figure(
|
||||
xs, ys, x_label="dosis", y_label="efecto", classification=classification
|
||||
)
|
||||
|
||||
# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
|
||||
fig.savefig("/tmp/scatter_dosis_efecto.png")
|
||||
|
||||
# Con classification=None la función la calcula internamente (self-contained):
|
||||
fig2 = relationship_scatter_figure(xs, ys, x_label="dosis", y_label="efecto")
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala dentro del informe EDA automático cuando quieras visualizar de un vistazo
|
||||
la relación entre dos variables numéricas: la nube de puntos, la curva que mejor
|
||||
la ajusta y una etiqueta legible del tipo de relación con sus métricas. Es la
|
||||
pareja "vista humana" de `classify_relationship_type`: esa función decide el
|
||||
tipo y los coeficientes; esta los pinta en una `Figure` que el renderer del
|
||||
informe rasteriza a PDF/PPTX. Pásale el dict de clasificación si ya lo tienes
|
||||
calculado (evitas recomputar el ajuste); si no, déjalo en `None` y la función lo
|
||||
resuelve sola sobre los pares limpios. Pensada para móvil: anotación pequeña
|
||||
(fontsize 8) y nube adelgazada por `max_points` para que el PDF no pese.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
|
||||
y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
|
||||
para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
|
||||
es thread-safe; esta función lo evita construyendo el `Figure` directamente,
|
||||
así que es segura de llamar en bucle desde el renderer.
|
||||
- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo
|
||||
guarda. Quien la consume debe rasterizarla y luego liberarla
|
||||
(`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes de
|
||||
pares de columnas.
|
||||
- **Downsample determinista, solo del dibujo.** Cuando los pares limpios superan
|
||||
`max_points`, la nube DIBUJADA se adelgaza por paso fijo `pairs[::step]`
|
||||
(reproducible, no aleatorio). La clasificación y el ajuste usan SIEMPRE todos
|
||||
los pares limpios; el downsample no altera las métricas ni la curva.
|
||||
- **`classification=None` ⇒ se calcula sola.** Importa y llama a
|
||||
`classify_relationship_type` sobre los pares limpios. Si ese módulo hermano no
|
||||
está disponible (entorno incompleto), NO lanza: dibuja el scatter sin curva de
|
||||
ajuste ni anotación. Pasar la clasificación explícita es más barato (no
|
||||
recomputa el ajuste).
|
||||
- **Sin curva para `monótona no-lineal`.** Cuando `coeffs` es `None` o
|
||||
`best_degree` es `None` (p.ej. tipo "monótona no-lineal"), no se pinta recta
|
||||
polinómica — solo la nube y la anotación. Tampoco se dibuja la curva si el
|
||||
rango de x es nulo (todos los x iguales). Nunca falla por esto.
|
||||
- **Defensiva, nunca lanza.** `xs=[]`, `ys=[]`, menos de 2 pares válidos, ends
|
||||
`None`/`bool`/`NaN`/`inf` o `coeffs` malformado se manejan sin error: en el
|
||||
peor caso devuelve una `Figure` con "Sin datos suficientes para el scatter".
|
||||
No envuelvas la llamada en try/except por miedo a un raise — no lo hay.
|
||||
@@ -0,0 +1,322 @@
|
||||
"""Impure EDA helper: scatter figure of a numeric pair with its fit (`eda` group).
|
||||
|
||||
Builds a matplotlib scatter of two numeric variables, overlays the fitted
|
||||
curve/line implied by the relationship classification (linear, polynomial of
|
||||
degree 2/3, etc.) and annotates the relationship type with its available
|
||||
metrics. Returns a ready-to-rasterize ``matplotlib.figure.Figure``; it never
|
||||
shows nor saves it.
|
||||
|
||||
Impure because it touches matplotlib's rendering machinery. It uses the headless
|
||||
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
|
||||
global state and is safe to call repeatedly from a report renderer.
|
||||
|
||||
To keep the rendered PDF/PPTX light on phones, when the number of valid pairs
|
||||
exceeds ``max_points`` the *plotted* points are down-sampled DETERMINISTICALLY by
|
||||
a fixed step (``pairs[::step]``), never randomly, so the output is reproducible.
|
||||
The classification/fit always uses every clean pair; the down-sample only thins
|
||||
the drawn cloud.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import numpy as np # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
# Sober blue for the scatter cloud and red for the fitted curve (Tufte: the
|
||||
# data points are the primary ink, the fit is the secondary highlight).
|
||||
_POINT_COLOR = "#4C72B0"
|
||||
_FIT_COLOR = "#C44E52"
|
||||
# Muted gray for the no-data fallback message.
|
||||
_MUTED_TEXT = "#5f6b7a"
|
||||
|
||||
|
||||
def _finite(value):
|
||||
"""Coerce ``value`` to a finite float, or return None when not usable.
|
||||
|
||||
bool is a subclass of int, but a real numeric measurement is never a bool,
|
||||
so True/False are treated as missing instead of coercing to 1.0/0.0. NaN and
|
||||
+/-infinity are never valid either.
|
||||
"""
|
||||
if value is None or isinstance(value, bool):
|
||||
return None
|
||||
try:
|
||||
f = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if math.isnan(f) or math.isinf(f):
|
||||
return None
|
||||
return f
|
||||
|
||||
|
||||
def _clean_pairs(xs, ys):
|
||||
"""Pair ``xs[i], ys[i]`` by index, dropping any pair with a non-finite end."""
|
||||
pairs = []
|
||||
if isinstance(xs, (list, tuple)) and isinstance(ys, (list, tuple)):
|
||||
n = min(len(xs), len(ys))
|
||||
for i in range(n):
|
||||
x = _finite(xs[i])
|
||||
y = _finite(ys[i])
|
||||
if x is None or y is None:
|
||||
continue
|
||||
pairs.append((x, y))
|
||||
return pairs
|
||||
|
||||
|
||||
def _ordered_trend(xs_clean, ys_clean, n_bins: int = 12):
|
||||
"""Return (x_trend, y_trend): the ordered trend of y over x for a monotonic
|
||||
relationship that has no polynomial fit.
|
||||
|
||||
When x has few distinct values (an ordinal/discrete scale) the trend is the
|
||||
mean of y per distinct x value. Otherwise x is split into ``n_bins`` ordered
|
||||
quantile bins and each point is (mean x, mean y) of the bin. Returns
|
||||
``(None, None)`` when there is nothing meaningful to draw.
|
||||
"""
|
||||
x_arr = np.asarray(xs_clean, dtype=float)
|
||||
y_arr = np.asarray(ys_clean, dtype=float)
|
||||
if x_arr.size < 2:
|
||||
return None, None
|
||||
uniq = np.unique(x_arr)
|
||||
if uniq.size <= max(2, n_bins):
|
||||
# Discrete x: one trend point per distinct value (mean y).
|
||||
xt = uniq
|
||||
yt = np.array([float(np.mean(y_arr[x_arr == ux])) for ux in uniq])
|
||||
return xt, yt
|
||||
# Continuous x: ordered quantile bins, (mean x, mean y) per bin.
|
||||
order = np.argsort(x_arr, kind="stable")
|
||||
x_sorted = x_arr[order]
|
||||
y_sorted = y_arr[order]
|
||||
chunks_x = np.array_split(x_sorted, n_bins)
|
||||
chunks_y = np.array_split(y_sorted, n_bins)
|
||||
xt = np.array([float(np.mean(cx)) for cx in chunks_x if cx.size])
|
||||
yt = np.array([float(np.mean(cy)) for cy in chunks_y if cy.size])
|
||||
return xt, yt
|
||||
|
||||
|
||||
def _no_data_figure(message: str) -> "matplotlib.figure.Figure":
|
||||
"""A bare Figure carrying a centered muted message (defensive fallback)."""
|
||||
fig = Figure(figsize=(6.4, 4.0), dpi=150)
|
||||
ax = fig.add_subplot(111)
|
||||
ax.axis("off")
|
||||
ax.text(
|
||||
0.5,
|
||||
0.5,
|
||||
message,
|
||||
ha="center",
|
||||
va="center",
|
||||
fontsize=12,
|
||||
color=_MUTED_TEXT,
|
||||
transform=ax.transAxes,
|
||||
)
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
|
||||
|
||||
def _metrics_caption(classification: dict) -> str:
|
||||
"""Format the available metrics of a classification dict into one line.
|
||||
|
||||
Omits the metrics that are None. Keys consumed (any may be absent/None):
|
||||
``pearson`` (r), ``spearman`` (rho), ``r2_linear`` (R²lin) and the best
|
||||
polynomial R² (``r2_poly3`` if a cubic was the best fit, else ``r2_poly2``).
|
||||
"""
|
||||
parts = []
|
||||
r = _finite(classification.get("pearson"))
|
||||
if r is not None:
|
||||
parts.append(f"r={r:.2f}")
|
||||
rho = _finite(classification.get("spearman"))
|
||||
if rho is not None:
|
||||
parts.append(f"ρ={rho:.2f}")
|
||||
r2_lin = _finite(classification.get("r2_linear"))
|
||||
if r2_lin is not None:
|
||||
parts.append(f"R²lin={r2_lin:.2f}")
|
||||
# Prefer the R² of the best polynomial degree when it is a poly fit.
|
||||
best_degree = classification.get("best_degree")
|
||||
r2_poly = None
|
||||
if best_degree == 3:
|
||||
r2_poly = _finite(classification.get("r2_poly3"))
|
||||
elif best_degree == 2:
|
||||
r2_poly = _finite(classification.get("r2_poly2"))
|
||||
if r2_poly is None:
|
||||
# Fall back to whichever poly R² is present (cubic first).
|
||||
r2_poly = _finite(classification.get("r2_poly3"))
|
||||
if r2_poly is None:
|
||||
r2_poly = _finite(classification.get("r2_poly2"))
|
||||
if r2_poly is not None:
|
||||
parts.append(f"R²poly={r2_poly:.2f}")
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
def relationship_scatter_figure(
|
||||
xs: list,
|
||||
ys: list,
|
||||
x_label: str = "",
|
||||
y_label: str = "",
|
||||
classification: dict = None,
|
||||
max_points: int = 2000,
|
||||
) -> "matplotlib.figure.Figure":
|
||||
"""Build a scatter figure of a numeric pair with its fit and a type label.
|
||||
|
||||
Cleans the pairs defensively (drops any pair with a None/bool/NaN/inf end),
|
||||
plots a semi-transparent scatter cloud (down-sampled deterministically when
|
||||
it exceeds ``max_points``), overlays the polynomial fit implied by
|
||||
``classification`` and annotates the relationship type plus its available
|
||||
metrics in a corner box.
|
||||
|
||||
The fit and classification always use every clean pair; only the drawn cloud
|
||||
is thinned by the down-sample. When ``classification`` is None it is computed
|
||||
internally by reusing ``classify_relationship_type`` over the clean pairs, so
|
||||
the function is self-contained.
|
||||
|
||||
The function is fully defensive: empty input, fewer than 2 clean pairs, a
|
||||
missing/None ``coeffs`` or a missing sibling classifier never raise. When
|
||||
there is nothing valid to draw it still returns a ``Figure`` carrying a
|
||||
centered "Sin datos suficientes para el scatter" message.
|
||||
|
||||
Args:
|
||||
xs: List (or tuple) of x values. Paired by index with ``ys``. Values that
|
||||
are None, bool, NaN or infinite discard that pair. Read defensively.
|
||||
ys: List (or tuple) of y values, parallel to ``xs``. Same defensive rules.
|
||||
x_label: Axis/title label for the x variable. Default "" (falls back to
|
||||
"x" in the title).
|
||||
y_label: Axis/title label for the y variable. Default "" (falls back to
|
||||
"y" in the title).
|
||||
classification: Optional dict from ``classify_relationship_type`` with
|
||||
keys ``tipo, pearson, r2_linear, spearman, r2_poly2, r2_poly3,
|
||||
best_degree, coeffs``. When None, it is computed internally by
|
||||
importing and calling ``classify_relationship_type`` over the clean
|
||||
pairs. When that sibling module is unavailable, the scatter is still
|
||||
drawn (no fit curve, no annotation).
|
||||
max_points: Cap on the number of *plotted* points. When the number of
|
||||
clean pairs exceeds this cap, the drawn cloud is down-sampled by a
|
||||
fixed step ``ceil(n/max_points)`` taking ``pairs[::step]`` —
|
||||
DETERMINISTIC, not random, so the figure is reproducible. A
|
||||
non-positive or non-int value disables down-sampling. Default 2000.
|
||||
|
||||
Returns:
|
||||
A ``matplotlib.figure.Figure`` (figsize 6.4x4.0, dpi 150) with a single
|
||||
scatter Axes, the fitted curve (when a polynomial fit is available) and a
|
||||
corner annotation with the relationship type and metrics. When there are
|
||||
fewer than 2 clean pairs it returns a Figure with a centered "Sin datos
|
||||
suficientes para el scatter" message. The caller rasterizes/closes it.
|
||||
"""
|
||||
pairs = _clean_pairs(xs, ys)
|
||||
if len(pairs) < 2:
|
||||
return _no_data_figure("Sin datos suficientes para el scatter")
|
||||
|
||||
# Full clean coordinates feed the classification/fit; the plotted cloud is
|
||||
# what gets thinned.
|
||||
xs_clean = [p[0] for p in pairs]
|
||||
ys_clean = [p[1] for p in pairs]
|
||||
|
||||
# Resolve the classification. If not provided, reuse the sibling classifier
|
||||
# over ALL clean pairs (self-contained). Missing module => no fit/annotation.
|
||||
cls = classification
|
||||
if cls is None:
|
||||
try:
|
||||
from classify_relationship_type import classify_relationship_type
|
||||
|
||||
cls = classify_relationship_type(xs_clean, ys_clean)
|
||||
except Exception:
|
||||
cls = None
|
||||
if not isinstance(cls, dict):
|
||||
cls = {}
|
||||
|
||||
# --- Deterministic down-sampling of the DRAWN points only.
|
||||
n_total = len(pairs)
|
||||
if (
|
||||
isinstance(max_points, int)
|
||||
and not isinstance(max_points, bool)
|
||||
and max_points > 0
|
||||
and n_total > max_points
|
||||
):
|
||||
step = math.ceil(n_total / max_points)
|
||||
sampled = pairs[::step]
|
||||
else:
|
||||
sampled = pairs
|
||||
|
||||
x_plot = [p[0] for p in sampled]
|
||||
y_plot = [p[1] for p in sampled]
|
||||
|
||||
fig = Figure(figsize=(6.4, 4.0), dpi=150)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
ax.scatter(
|
||||
x_plot,
|
||||
y_plot,
|
||||
s=12,
|
||||
alpha=0.5,
|
||||
color=_POINT_COLOR,
|
||||
edgecolors="none",
|
||||
rasterized=True,
|
||||
)
|
||||
|
||||
# --- Fitted curve/line over the full clean x range.
|
||||
coeffs = cls.get("coeffs")
|
||||
best_degree = cls.get("best_degree")
|
||||
tipo = cls.get("tipo")
|
||||
x_min, x_max = min(xs_clean), max(xs_clean)
|
||||
drew_fit = False
|
||||
if coeffs is not None and best_degree is not None and x_max > x_min:
|
||||
try:
|
||||
coeff_arr = np.asarray(coeffs, dtype=float)
|
||||
if coeff_arr.ndim == 1 and coeff_arr.size > 0 and np.all(np.isfinite(coeff_arr)):
|
||||
x_line = np.linspace(x_min, x_max, 200)
|
||||
y_line = np.polyval(coeff_arr, x_line)
|
||||
if np.all(np.isfinite(y_line)):
|
||||
ax.plot(x_line, y_line, color=_FIT_COLOR, linewidth=2)
|
||||
drew_fit = True
|
||||
except Exception:
|
||||
# Never fail the figure because of a malformed coeffs array.
|
||||
pass
|
||||
|
||||
# A monotonic non-linear relationship has no fitted polynomial (coeffs is
|
||||
# None by design — a low-degree polynomial would mislead). Draw instead the
|
||||
# ordered trend of y over x so the reader still sees the shape: y averaged
|
||||
# within ordered x-bins (or per distinct x value when x is discrete with few
|
||||
# levels, e.g. an ordinal scale). Defensive: any failure leaves the cloud.
|
||||
if (not drew_fit and isinstance(tipo, str) and "monóton" in tipo.lower()
|
||||
and x_max > x_min):
|
||||
try:
|
||||
xt, yt = _ordered_trend(xs_clean, ys_clean)
|
||||
if xt is not None and len(xt) >= 2:
|
||||
ax.plot(xt, yt, color=_FIT_COLOR, linewidth=2, marker="o",
|
||||
markersize=3)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- Labels and title.
|
||||
tx = x_label if x_label else "x"
|
||||
ty = y_label if y_label else "y"
|
||||
ax.set_title(f"{tx} ↔ {ty}", fontsize=12, loc="left", pad=8)
|
||||
ax.set_xlabel(x_label)
|
||||
ax.set_ylabel(y_label)
|
||||
|
||||
# --- Corner annotation: relationship type + available metrics.
|
||||
caption_lines = []
|
||||
if tipo:
|
||||
caption_lines.append(str(tipo))
|
||||
metrics_line = _metrics_caption(cls)
|
||||
if metrics_line:
|
||||
caption_lines.append(metrics_line)
|
||||
if caption_lines:
|
||||
ax.text(
|
||||
0.03,
|
||||
0.97,
|
||||
"\n".join(caption_lines),
|
||||
transform=ax.transAxes,
|
||||
ha="left",
|
||||
va="top",
|
||||
fontsize=8,
|
||||
bbox=dict(
|
||||
boxstyle="round,pad=0.35",
|
||||
facecolor="white",
|
||||
edgecolor="#cccccc",
|
||||
alpha=0.85,
|
||||
),
|
||||
)
|
||||
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
@@ -0,0 +1,100 @@
|
||||
"""Tests para relationship_scatter_figure (scatter de un par numérico, grupo eda).
|
||||
|
||||
Usa el backend Agg sin pyplot global; no muestra ni guarda figuras. Cada test
|
||||
cierra explícitamente la Figure construida (matplotlib.pyplot.close) para no
|
||||
acumular estado entre tests.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
from matplotlib.collections import PathCollection # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
from relationship_scatter_figure import relationship_scatter_figure
|
||||
|
||||
|
||||
def _scatter_offsets(fig):
|
||||
"""Return the plotted points of the first PathCollection (scatter) found."""
|
||||
for ax in fig.axes:
|
||||
for coll in ax.collections:
|
||||
if isinstance(coll, PathCollection):
|
||||
return coll.get_offsets()
|
||||
return None
|
||||
|
||||
|
||||
def test_returns_figure():
|
||||
xs = [float(i) for i in range(20)]
|
||||
ys = [2.0 * x + 1.0 for x in xs] # y = 2x + 1
|
||||
classification = {
|
||||
"tipo": "lineal",
|
||||
"pearson": 1.0,
|
||||
"r2_linear": 1.0,
|
||||
"spearman": 1.0,
|
||||
"r2_poly2": 1.0,
|
||||
"r2_poly3": 1.0,
|
||||
"best_degree": 1,
|
||||
"coeffs": [2.0, 1.0],
|
||||
}
|
||||
fig = relationship_scatter_figure(
|
||||
xs, ys, x_label="a", y_label="b", classification=classification
|
||||
)
|
||||
assert hasattr(fig, "savefig")
|
||||
assert len(fig.axes) >= 1
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
def test_downsample_determinista():
|
||||
n = 5000
|
||||
xs = [float(i) for i in range(n)]
|
||||
ys = [0.5 * x for x in xs]
|
||||
classification = {
|
||||
"tipo": "lineal",
|
||||
"pearson": 1.0,
|
||||
"r2_linear": 1.0,
|
||||
"spearman": 1.0,
|
||||
"r2_poly2": 1.0,
|
||||
"r2_poly3": 1.0,
|
||||
"best_degree": 1,
|
||||
"coeffs": [0.5, 0.0],
|
||||
}
|
||||
fig = relationship_scatter_figure(
|
||||
xs, ys, x_label="x", y_label="y", classification=classification, max_points=1000
|
||||
)
|
||||
assert isinstance(fig, Figure)
|
||||
offsets = _scatter_offsets(fig)
|
||||
assert offsets is not None
|
||||
# El nº de puntos dibujados no debe exceder el cap.
|
||||
assert len(offsets) <= 1000
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
def test_empty_no_lanza():
|
||||
fig = relationship_scatter_figure([], [], x_label="x", y_label="y")
|
||||
assert isinstance(fig, Figure)
|
||||
plt.close(fig)
|
||||
|
||||
|
||||
def test_classification_none():
|
||||
# Solo se ejecuta si el módulo hermano classify_relationship_type existe.
|
||||
try:
|
||||
import classify_relationship_type # noqa: F401
|
||||
except Exception:
|
||||
import pytest
|
||||
|
||||
pytest.skip("classify_relationship_type aún no disponible")
|
||||
xs = [float(i) for i in range(30)]
|
||||
ys = [3.0 * x - 2.0 for x in xs]
|
||||
fig = relationship_scatter_figure(
|
||||
xs, ys, x_label="a", y_label="b", classification=None
|
||||
)
|
||||
assert isinstance(fig, Figure)
|
||||
assert len(fig.axes) >= 1
|
||||
plt.close(fig)
|
||||
@@ -0,0 +1,89 @@
|
||||
---
|
||||
name: render_automatic_eda_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def render_automatic_eda_markdown(chapters_or_profile, out_path: str, meta: dict = None) -> dict"
|
||||
description: "Renderiza un documento AutomaticEDA por CAPÍTULOS (modelo de bloques independiente del formato) en un único MARKDOWN autocontenido pensado para PEGAR A UN LLM. Acepta una lista de capítulos del modelo o directamente un TableProfile del grupo eda (construye los capítulos canónicos con build_document). Prioriza TEXTO + DATOS sobre lo visual: las tablas se vuelcan como tablas markdown con TODAS las filas (sin paginar — no hay páginas que cortar), una figura matplotlib se reduce a su caption más la tabla de datos subyacente (Desde/Hasta/Frecuencia de las barras del histograma) porque un LLM no ve la imagen, y los marcadores de glosario se eliminan conservando el **negrita**. Lleva cabecera (# título), bloque de metadatos en blockquote e índice numerado con anclas GitHub. Espejo de render_automatic_eda_pdf/render_automatic_eda_pptx pero SIN manifest (KISS, el markdown es un único artefacto de texto). dict-no-throw: nunca lanza, devuelve {path, n_chars, chapters, note}; en error fatal path es None y note explica la causa. Flag opcional meta['embed_figures'] exporta PNGs junto al .md (off por defecto)."
|
||||
tags: [eda, markdown, render, report, llm, automatic-eda, chapters, versioned, no-cut, text, datascience, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [os, re, matplotlib, "datascience.automatic_eda"]
|
||||
params:
|
||||
- name: chapters_or_profile
|
||||
desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Bloques soportados: heading, markdown, kv_table, data_table, figure, image, caption, note, group, glossary_entry. Lectura defensiva: lo no reconocido se degrada a Note, nunca lanza."
|
||||
- name: out_path
|
||||
desc: "ruta del archivo .md de salida. Los directorios padre se crean si faltan. Directorio no escribible → {path:None, note:<causa>} sin lanzar."
|
||||
- name: meta
|
||||
desc: "dict opcional. Claves: title (título del documento), ctx (dict con dataset_name→Dataset, source_origin→Fuente, storage→Almacenamiento, n_rows/n_cols→Dimensiones; también lo consumen los builders de capítulo cuando se da un profile), generated_at (timestamp; si falta se genera ISO UTC), embed_figures (True para exportar PNGs <basename>_figN.png junto al .md; por defecto False y el markdown queda autocontenido)."
|
||||
output: "dict (nunca lanza): {path: str|None, n_chars: int, chapters: list[{id,version}], note: str}. En error fatal (p.ej. directorio no escribible) path es None y note explica la causa. Un documento sin capítulos aplicables produce un markdown mínimo válido con 'documento vacío' y chapters=[]."
|
||||
tested: true
|
||||
tests: ["test_golden_bloques_sinteticos_serializa_todo_a_markdown", "test_edge_documento_vacio_no_revienta", "test_profile_path_construye_capitulos_y_escribe"]
|
||||
test_file_path: "python/functions/datascience/render_automatic_eda_markdown_test.py"
|
||||
file_path: "python/functions/datascience/render_automatic_eda_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience import render_automatic_eda_markdown
|
||||
|
||||
# Desde un TableProfile del grupo eda (mismo modelo que los renderers PDF/PPTX).
|
||||
profile = {
|
||||
"table": "ventas", "source": "/data/ventas.csv",
|
||||
"n_rows": 1000, "n_cols": 2, "quality_score": 92.5,
|
||||
"columns": [
|
||||
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.01,
|
||||
"numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0,
|
||||
"std": 12.3}},
|
||||
{"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0,
|
||||
"categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
|
||||
],
|
||||
}
|
||||
res = render_automatic_eda_markdown(
|
||||
profile, "reports/ventas_aeda.md",
|
||||
{"title": "EDA — ventas",
|
||||
"ctx": {"dataset_name": "Ventas", "source_origin": "ERP export",
|
||||
"n_rows": 1000, "n_cols": 2}})
|
||||
print(res["path"], res["n_chars"], res["chapters"])
|
||||
# -> reports/ventas_aeda.md 4123 [{'id':'portada','version':'1.0.0'}, ...]
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando quieras **pegar el EDA a un LLM** (ChatGPT, Claude, ...) o tenerlo en texto
|
||||
plano versionable: mismo documento por capítulos que el PDF/PPTX, pero serializado a
|
||||
Markdown sin binarios. Úsala como tercera salida junto a `render_automatic_eda_pdf`
|
||||
(móvil) y `render_automatic_eda_pptx` (compartir) desde el MISMO modelo de capítulos.
|
||||
A diferencia de esas dos, no hay páginas ni slides: todas las filas de cada tabla se
|
||||
vuelcan (nada se corta) y cada figura se reduce a su caption + la tabla de datos
|
||||
subyacente, que es lo que un LLM puede leer. Para añadir capítulos al documento, ver
|
||||
`docs/capabilities/automatic_eda.md`.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura**: escribe el `.md` en `out_path` (crea los directorios padre). Con
|
||||
`meta['embed_figures']=True` además exporta un PNG `<basename>_figN.png` por figura
|
||||
junto al `.md`; por defecto NO exporta nada y el markdown queda autocontenido.
|
||||
- **Nunca lanza** (dict-no-throw): un bloque que falle se degrada a una nota y se anota
|
||||
en `note`; el documento se escribe igual. Un profile/lista vacíos producen un markdown
|
||||
mínimo válido con `*(documento vacío …)*` y `chapters=[]`.
|
||||
- **Figuras = datos, no imagen**: un bloque `figure` se serializa como `*Figura: caption*`
|
||||
más, si la figura matplotlib trae barras (histograma / barras), una tabla
|
||||
`| Desde | Hasta | Frecuencia |` extraída de los `Rectangle` patches (máx 100 filas;
|
||||
el resto se trunca con `*… (N filas más)*`). Si no hay barras o algo falla, solo sale
|
||||
el caption. La figura se cierra (`plt.close`) tras leerla.
|
||||
- **Glosario vs negrita**: se eliminan SOLO los marcadores de glosario
|
||||
`[[term:key]]visible[[/term]]` (queda `visible`); el `**negrita**` markdown SE
|
||||
CONSERVA (es válido). No se usa `strip_inline_md` aquí porque ese también quita el bold.
|
||||
- **Anclas del índice**: el `## Índice` enlaza cada capítulo con un ancla estilo GitHub
|
||||
del encabezado `## N. Título` (minúsculas, espacios→`-`, sin signos). Si dos capítulos
|
||||
comparten título exacto sus anclas colisionan (caso raro; los capítulos canónicos tienen
|
||||
títulos únicos).
|
||||
- **Tablas**: las celdas escapan `|` (→ `\|`) y pliegan saltos de línea a `<br>` para no
|
||||
romper la columna. No hay reparto por ancho — un LLM no lo necesita.
|
||||
@@ -0,0 +1,55 @@
|
||||
"""render_automatic_eda_markdown — chapter-based EDA report as one Markdown file.
|
||||
|
||||
Public ``eda``-group entry point that serializes an AutomaticEDA document (a list
|
||||
of chapters, or an ``eda`` TableProfile from which the canonical chapters are
|
||||
built) into a single self-contained Markdown file optimised to be **pasted into
|
||||
an LLM**: plain text, Markdown tables (every row dumped — there are no pages to
|
||||
cut), figures reduced to caption + underlying data, no binaries. It mirrors
|
||||
``render_automatic_eda_pdf`` / ``render_automatic_eda_pptx`` but for text output;
|
||||
unlike those it writes no manifest (KISS — Markdown is a single text artefact).
|
||||
|
||||
dict-no-throw: never raises. Returns ``{path, n_chars, chapters, note}``; on a
|
||||
fatal error ``path`` is None and ``note`` explains why.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datascience.automatic_eda import build_document, render_md
|
||||
from datascience.automatic_eda.model import as_chapter, as_chapters
|
||||
|
||||
|
||||
def _coerce_chapters(chapters_or_profile, meta: dict) -> list:
|
||||
"""Accept chapters OR an eda profile and return a list of Chapter."""
|
||||
arg = chapters_or_profile
|
||||
if isinstance(arg, (list, tuple)):
|
||||
return as_chapters(list(arg))
|
||||
if isinstance(arg, dict):
|
||||
if "blocks" in arg and "columns" not in arg:
|
||||
ch = as_chapter(arg)
|
||||
return [ch] if ch is not None else []
|
||||
return build_document(arg, (meta or {}).get("ctx"))
|
||||
return []
|
||||
|
||||
|
||||
def render_automatic_eda_markdown(chapters_or_profile, out_path: str,
|
||||
meta: dict = None) -> dict:
|
||||
"""Render an AutomaticEDA document into a single self-contained Markdown file.
|
||||
|
||||
Args:
|
||||
chapters_or_profile: a list of chapters (``Chapter`` dataclasses or
|
||||
dicts) or an ``eda`` TableProfile dict (chapters built via
|
||||
``build_document(profile, meta['ctx'])``).
|
||||
out_path: filesystem path for the ``.md`` (parent dirs are created).
|
||||
meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
|
||||
``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
|
||||
``generated_at``, ``embed_figures`` (export PNGs beside the .md,
|
||||
default False — off keeps the Markdown self-contained).
|
||||
|
||||
Returns:
|
||||
dict (never raises): ``{path: str|None, n_chars: int,
|
||||
chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is
|
||||
None and ``note`` explains the cause.
|
||||
"""
|
||||
meta = dict(meta or {})
|
||||
chapters = _coerce_chapters(chapters_or_profile, meta)
|
||||
return render_md(chapters, out_path, meta)
|
||||
@@ -0,0 +1,168 @@
|
||||
"""Tests for render_automatic_eda_markdown — DoD: golden + edge + profile path.
|
||||
|
||||
Self-contained synthetic blocks (no DuckDB). Verifies every block kind serializes
|
||||
to Markdown (heading, markdown with glossary+bold, kv/data tables, a figure whose
|
||||
histogram bars become a data table, caption, note, group, glossary entry), that a
|
||||
leading level-1 heading equal to the chapter title is omitted, that an empty
|
||||
document degrades to a valid minimal Markdown without raising, and that passing a
|
||||
minimal TableProfile builds chapters and writes the file.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from datascience.render_automatic_eda_markdown import render_automatic_eda_markdown
|
||||
from datascience.automatic_eda.model import (
|
||||
Caption, Chapter, DataTable, Figure, GlossaryEntry, Group, Heading, KVTable,
|
||||
Markdown, Note,
|
||||
)
|
||||
|
||||
|
||||
def _hist_fig():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
fig, ax = plt.subplots()
|
||||
ax.hist([1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5], bins=5)
|
||||
return fig
|
||||
|
||||
|
||||
def _chapters() -> list:
|
||||
blocks = [
|
||||
Heading("Demo", 1), # == chapter title -> omitted.
|
||||
Heading("Seccion dos", 2), # -> ####
|
||||
Markdown("Texto con [[term:ent]]entropia[[/term]] y **bold** aqui."),
|
||||
KVTable(rows=[("Filas", 1000), ("Columnas", 5)], title="Resumen"),
|
||||
DataTable(header=["col", "valor"],
|
||||
rows=[["alpha", "111"], ["beta", "222"], ["gamma", "333"]],
|
||||
title="Datos", note="nota inferior"),
|
||||
Figure(make=_hist_fig, caption="Histograma demo"),
|
||||
Caption("pie de figura"),
|
||||
Note("una nota aparte"),
|
||||
Group(title="Grupo X", blocks=[Markdown("dentro del grupo")]),
|
||||
GlossaryEntry(key="ent", label="Entropia",
|
||||
definition="Medida de incertidumbre."),
|
||||
]
|
||||
return [Chapter(id="demo", title="Demo", version="1.0.0", blocks=blocks)]
|
||||
|
||||
|
||||
def _read(path: str) -> str:
|
||||
with open(path, "r", encoding="utf-8") as fh:
|
||||
return fh.read()
|
||||
|
||||
|
||||
def test_golden_bloques_sinteticos_serializa_todo_a_markdown():
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
out = os.path.join(d, "demo.md")
|
||||
res = render_automatic_eda_markdown(
|
||||
_chapters(), out,
|
||||
{"title": "EDA Demo",
|
||||
"ctx": {"dataset_name": "Demo", "n_rows": 12, "n_cols": 2}})
|
||||
assert res["path"] == out
|
||||
assert os.path.exists(out)
|
||||
assert res["n_chars"] > 0
|
||||
assert res["chapters"] == [{"id": "demo", "version": "1.0.0"}]
|
||||
|
||||
content = _read(out)
|
||||
# Document structure.
|
||||
assert content.startswith("# ")
|
||||
assert "## Índice" in content
|
||||
# A Markdown table is present (header + separator row).
|
||||
assert "| " in content and "| --- " in content
|
||||
# DataTable values are all dumped.
|
||||
for v in ("alpha", "111", "beta", "222", "gamma", "333"):
|
||||
assert v in content
|
||||
# Glossary markers stripped, bold kept.
|
||||
assert "[[term" not in content
|
||||
assert "[[/term]]" not in content
|
||||
assert "**bold**" in content
|
||||
assert "entropia" in content # visible glossary text preserved.
|
||||
# Figure histogram bars became a data table.
|
||||
assert "| Desde | Hasta | Frecuencia |" in content
|
||||
# Glossary entry rendered as a level-3 heading.
|
||||
assert "### Entropia" in content
|
||||
# Level-2 heading -> ####.
|
||||
assert "#### Seccion dos" in content
|
||||
# Leading level-1 heading equal to the title was omitted.
|
||||
assert "### Demo" not in content
|
||||
# Group title rendered.
|
||||
assert "### Grupo X" in content
|
||||
|
||||
|
||||
def _hist_fig_with_span():
|
||||
"""Histogram with a wide ``axvspan`` (±1σ band) over it.
|
||||
|
||||
Reproduces the num_distr figure shape: matplotlib keeps the span as a lone
|
||||
Rectangle in ``ax.patches`` alongside the bin bars; it must NOT leak into the
|
||||
extracted bins table as a fake bin (it is ~5x wider than a bin)."""
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pyplot as plt
|
||||
fig, ax = plt.subplots()
|
||||
data = [1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5]
|
||||
ax.hist(data, bins=5)
|
||||
ax.axvspan(2.0, 4.0, alpha=0.2) # mean±σ band — a wide stray rectangle.
|
||||
return fig
|
||||
|
||||
|
||||
def test_figura_descarta_axvspan_de_la_tabla_de_bins():
|
||||
"""The ±1σ band rectangle must not appear as a row in the bins table."""
|
||||
blocks = [Figure(make=_hist_fig_with_span, caption="Hist con banda")]
|
||||
chapters = [Chapter(id="f", title="Fig", version="1.0.0", blocks=blocks)]
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
out = os.path.join(d, "fig.md")
|
||||
render_automatic_eda_markdown(chapters, out, {"title": "T"})
|
||||
content = _read(out)
|
||||
assert "| Desde | Hasta | Frecuencia |" in content
|
||||
# Extract the rows of the bins table: lines between the header/separator
|
||||
# and the next blank line.
|
||||
lines = content.splitlines()
|
||||
hi = next(i for i, ln in enumerate(lines)
|
||||
if ln.startswith("| Desde | Hasta | Frecuencia |"))
|
||||
rows = []
|
||||
for ln in lines[hi + 2:]: # skip header + separator
|
||||
if not ln.startswith("|"):
|
||||
break
|
||||
rows.append(ln)
|
||||
# 5 histogram bins, no extra wide span row.
|
||||
assert len(rows) == 5, rows
|
||||
# No row spans a width of ~2.0 (the axvspan from x=2 to x=4).
|
||||
for ln in rows:
|
||||
cells = [c.strip() for c in ln.strip("|").split("|")]
|
||||
lo, hi_v = float(cells[0]), float(cells[1])
|
||||
assert (hi_v - lo) < 1.5, f"wide span leaked: {ln}"
|
||||
|
||||
|
||||
def test_edge_documento_vacio_no_revienta():
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
out = os.path.join(d, "empty.md")
|
||||
res = render_automatic_eda_markdown([], out, {})
|
||||
assert res["path"] == out
|
||||
assert os.path.exists(out)
|
||||
assert res["chapters"] == []
|
||||
content = _read(out)
|
||||
assert "documento vacío" in content
|
||||
assert content.startswith("# ")
|
||||
|
||||
|
||||
def test_profile_path_construye_capitulos_y_escribe():
|
||||
profile = {
|
||||
"table": "mini",
|
||||
"source": "/data/mini.csv",
|
||||
"n_rows": 10,
|
||||
"n_cols": 1,
|
||||
"quality_score": 88.0,
|
||||
"columns": [
|
||||
{"name": "x", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"null_count": 0,
|
||||
"numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
|
||||
"std": 0.5}},
|
||||
],
|
||||
}
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
out = os.path.join(d, "mini.md")
|
||||
res = render_automatic_eda_markdown(
|
||||
profile, out, {"title": "Mini", "ctx": {"dataset_name": "Mini"}})
|
||||
assert res["path"] == out # not None — no exception, file written.
|
||||
assert os.path.exists(out)
|
||||
assert res["n_chars"] > 0
|
||||
@@ -1,9 +1,10 @@
|
||||
"""render_automatic_eda — EDA completo one-shot: perfil → ctx → PDF + PPTX.
|
||||
"""render_automatic_eda — EDA completo one-shot: perfil → ctx → PDF + PPTX + MD.
|
||||
|
||||
Pipeline impuro del grupo de capacidad `eda`. Dada UNA tabla DuckDB (o
|
||||
PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus dos formatos a la
|
||||
vez (PDF móvil A5 + PPTX 16:9) con los 11 capítulos POBLADOS, en una sola
|
||||
llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry:
|
||||
PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus tres formatos a la
|
||||
vez (PDF móvil A5 + PPTX 16:9 + Markdown autocontenido para pegar a un LLM) con
|
||||
los capítulos POBLADOS, en una sola llamada. Compone, sin reimplementar su
|
||||
lógica, varias funciones del registry:
|
||||
|
||||
- profile_table : perfila la tabla end-to-end (TableProfile agregado),
|
||||
opcionalmente con modelos baratos y análisis de serie.
|
||||
@@ -12,8 +13,11 @@ llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry:
|
||||
modelos/geo, timeseries_raw para series, geo_points
|
||||
para el mapa, db_path/table para la agregación
|
||||
push-down). Sin él, esos capítulos degradan.
|
||||
- render_automatic_eda_pdf : renderiza el documento por capítulos a PDF.
|
||||
- render_automatic_eda_pptx : renderiza el mismo documento a PPTX.
|
||||
- render_automatic_eda_pdf : renderiza el documento por capítulos a PDF.
|
||||
- render_automatic_eda_pptx : renderiza el mismo documento a PPTX.
|
||||
- render_automatic_eda_markdown : serializa el mismo documento a Markdown
|
||||
autocontenido (texto + tablas markdown, sin
|
||||
binarios) para incorporar a un LLM.
|
||||
|
||||
El TableProfile agregado basta para portada/overview/distribuciones/calidad/
|
||||
correlación, pero los capítulos `modelos`, `timeseries`, `geospatial` y
|
||||
@@ -32,6 +36,7 @@ from datetime import datetime, timezone
|
||||
|
||||
from datascience import (
|
||||
build_eda_render_ctx,
|
||||
render_automatic_eda_markdown,
|
||||
render_automatic_eda_pdf,
|
||||
render_automatic_eda_pptx,
|
||||
run_eda_models,
|
||||
@@ -93,6 +98,7 @@ def render_automatic_eda(
|
||||
out_dir: str = "reports",
|
||||
basename: str = None,
|
||||
ctx_extra: dict = None,
|
||||
emit_md: bool = True,
|
||||
) -> dict:
|
||||
"""Perfila una tabla y emite el informe AutomaticEDA completo (PDF + PPTX).
|
||||
|
||||
@@ -140,13 +146,19 @@ def render_automatic_eda(
|
||||
ctx_extra: dict opcional con claves de presentación/contexto extra que se
|
||||
mezclan en el ctx (p.ej. dataset_name, description, source_origin).
|
||||
No pisan las claves de datos calculadas por build_eda_render_ctx.
|
||||
emit_md: además del PDF y el PPTX, emite un Markdown autocontenido del
|
||||
MISMO documento por capítulos (texto plano + tablas markdown, sin
|
||||
binarios), pensado para pegar a un LLM. Default True. La ruta sale en
|
||||
la clave de retorno ``aeda_md_path``. No altera las demás salidas.
|
||||
|
||||
Returns:
|
||||
dict (nunca lanza). En éxito::
|
||||
|
||||
{"status": "ok", "pdf_path": str, "pptx_path": str,
|
||||
"manifest_path": str|None, "n_pages": int, "n_slides": int,
|
||||
"pdf_note": str, "pptx_note": str, "profile": <TableProfile>}
|
||||
"aeda_md_path": str|None, "manifest_path": str|None,
|
||||
"n_pages": int, "n_slides": int, "md_chars": int|None,
|
||||
"pdf_note": str, "pptx_note": str, "md_note": str|None,
|
||||
"profile": <TableProfile>}
|
||||
|
||||
En error: {"status": "error", "error": str}.
|
||||
"""
|
||||
@@ -243,15 +255,26 @@ def render_automatic_eda(
|
||||
rpdf = render_automatic_eda_pdf(prof, pdf_path, meta) or {}
|
||||
rpptx = render_automatic_eda_pptx(prof, pptx_path, meta) or {}
|
||||
|
||||
# Salida Markdown autocontenida (mismo documento por capítulos) para
|
||||
# pegar a un LLM. Aditiva: no afecta a PDF/PPTX/manifest. dict-no-throw.
|
||||
rmd = {}
|
||||
md_path = None
|
||||
if emit_md:
|
||||
md_path = os.path.join(out_dir, base + ".md")
|
||||
rmd = render_automatic_eda_markdown(prof, md_path, meta) or {}
|
||||
|
||||
return {
|
||||
"status": "ok",
|
||||
"pdf_path": rpdf.get("path"),
|
||||
"pptx_path": rpptx.get("path"),
|
||||
"aeda_md_path": rmd.get("path"),
|
||||
"manifest_path": rpdf.get("manifest_path"),
|
||||
"n_pages": rpdf.get("n_pages"),
|
||||
"n_slides": rpptx.get("n_slides"),
|
||||
"md_chars": rmd.get("n_chars"),
|
||||
"pdf_note": rpdf.get("note"),
|
||||
"pptx_note": rpptx.get("note"),
|
||||
"md_note": rmd.get("note"),
|
||||
"profile": prof,
|
||||
}
|
||||
except Exception as e: # noqa: BLE001 — dict-no-throw: degradar, nunca lanzar.
|
||||
|
||||
Reference in New Issue
Block a user