feat(eda): el Markdown del AutomaticEDA vuelca TODOS los datos del profile

El .md del grupo `eda` es la salida pensada para pegar a un LLM, así que debe contener todo lo que el motor computó, aunque el PDF/PPTX (vista humana) resuman. La evaluación 2053 detectó 6 datos que el .md perdía respecto al profile. Se cierran de forma aditiva (el .md tiene MÁS que el PDF/PPTX, sin tocar esos renderers ni los capítulos). render_automatic_eda.py pasa el profile al serializador Markdown vía meta['profile'] (un meta propio del MD; el de PDF/PPTX queda intacto). render_md_impl.py añade un "Apéndice — Datos completos del perfil" al final del documento, emitido solo cuando hay profile y degradando limpio cuando falta una sección (lite sin modelos, profile sin correlaciones). El apéndice no se acopla a los ids de capítulo (que editan otros agentes en paralelo). Pérdidas cerradas: 1. Matriz de asociación COMPLETA: los N pares de correlations.pairs (no solo el top-17), incluidos correlation_ratio (num↔cat) y cramers_v (cat↔cat). 2. Numéricas: describe completo por columna — mean/median/mode/std/variance/cv, skew y kurtosis para TODAS (no solo las asimétricas), p1/p5/p25/p50/p75/p95/p99, iqr, min/max, outliers, distribution_type. 3. Re-expresión: nombra la transformación concreta (log1p/sqrt/yeo-johnson) con potencia, razón y alternativas, no un vago "considerar re-expresión". 4. KMeans: tabla scores_by_k (silhouette + inercia por k) marcando el k elegido. 5. Normalidad: el estadístico (stat) de cada test junto al p-value. 6. Encabezados de figuras de barras/scree dejan de heredar "Desde/Hasta/Frecuencia" del histograma; usan "Inicio/Fin/Valor" cuando el caption no es un histograma. Test nuevo md_completeness_test.py: profile sintético, asserta los N pares de correlación, skew/kurtosis de cada numérica, percentiles extendidos, log1p, scores_by_k, stat de normalidad, headers de barras y los edges (sin modelos / sin correlaciones / sin profile, defensivo). Verificado con titanic (profile_level=full): 28 pares en la tabla (incl. 
Sex↔Embarked cramers_v), 7 numéricas con skew+kurtosis, p5/p95/p99, scores_by_k y JB/D'Agostino/Shapiro stat presentes. PDF/PPTX/manifest siguen saliendo. Suite automatic_eda + render_automatic_eda_test: 134 passed. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
merge: 4c cat_distr una hoja por columna (PDF+PPTX 1:1) + sin descripción entropía redundante + page_break motor (verificado met)
2026-06-30 20:27:30 +02:00 · 2026-06-30 19:53:57 +02:00 · 2026-06-30 19:45:09 +02:00 · 2026-06-30 19:26:33 +02:00 · 2026-06-30 19:24:22 +02:00 · 2026-06-30 19:15:24 +02:00
44 changed files with 4980 additions and 513 deletions
@@ -34,6 +34,7 @@ from .theils_u import theils_u
 from .correlation_ratio import correlation_ratio
 from .mutual_info_columns import mutual_info_columns
 from .infer_fk_containment_duckdb import infer_fk_containment_duckdb
+from .detect_declared_keys_duckdb import detect_declared_keys_duckdb
 from .build_join_graph import build_join_graph
 from .association_matrix import association_matrix
 from .correlation_matrix_duckdb import correlation_matrix_duckdb
@@ -63,14 +64,17 @@ from .exploratory_caveats import exploratory_caveats
 from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational
 from .render_automatic_eda_pdf import render_automatic_eda_pdf
 from .render_automatic_eda_pptx import render_automatic_eda_pptx
+from .render_automatic_eda_markdown import render_automatic_eda_markdown
 from .detect_time_column import detect_time_column
 from .extract_timeseries_raw import extract_timeseries_raw
 from .build_eda_render_ctx import build_eda_render_ctx
 from .profile_datetime import profile_datetime
 from .resample_timeseries import resample_timeseries
 from .add_pdf_internal_links import add_pdf_internal_links
+from .suggest_intratable_fk_candidates import suggest_intratable_fk_candidates

 __all__ = [
+    "suggest_intratable_fk_candidates",
    "detect_time_column",
    "extract_timeseries_raw",
    "build_eda_render_ctx",
@@ -79,6 +83,7 @@ __all__ = [
    "resample_timeseries",
    "render_automatic_eda_pdf",
    "render_automatic_eda_pptx",
+    "render_automatic_eda_markdown",
    "decode_qr_image",
    "adf_kpss_stationarity",
    "acf_pacf",
@@ -97,6 +102,7 @@ __all__ = [
    "correlation_ratio",
    "mutual_info_columns",
    "infer_fk_containment_duckdb",
+    "detect_declared_keys_duckdb",
    "build_join_graph",
    "association_matrix",
    "correlation_matrix_duckdb",
@@ -36,6 +36,7 @@ from .model import (  # noqa: F401
 from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document  # noqa: F401
 from .render_pdf_impl import render_pdf  # noqa: F401
 from .render_pptx_impl import render_pptx  # noqa: F401
+from .render_md_impl import render_md  # noqa: F401

 __all__ = [
    "ENGINE_NAME",
@@ -60,4 +61,5 @@ __all__ = [
    "build_document",
    "render_pdf",
    "render_pptx",
+    "render_md",
 ]
@@ -89,6 +89,35 @@ _DEF_MAX_CARD = 20
 _DEF_MAX_MEASURES = 4
 _DEF_TOP_N = 12

+# Glossary terms this chapter explains. Both appear in the always-rendered intro,
+# so they are registered and marked clickable whenever a collector is in ctx —
+# the canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
+# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
+# block. Mapping key -> (label, definition).
+_TERM_DEFS = {
+    "groupby": (
+        "Agrupación (split-apply-combine)",
+        "Operación de agrupación (group by): parte la tabla en grupos según los "
+        "valores de una columna categórica, aplica un cálculo (conteo, media, "
+        "mediana…) dentro de cada grupo y combina los resultados en una tabla "
+        "resumen. Es el patrón split-apply-combine."),
+    "pivot_table": (
+        "Tabla dinámica (pivot)",
+        "Tabla dinámica que cruza dos variables categóricas — una en las filas y "
+        "otra en las columnas — y rellena cada celda con un agregado (media, "
+        "suma…) de una medida numérica. Resume de un vistazo cómo interactúan las "
+        "dos categóricas sobre esa medida."),
+}
+
+
+def _term(mark: bool, key: str, text: str) -> str:
+    """Wrap ``text`` as a clickable glossary span when ``mark`` is True.
+
+    The visible text is identical with or without the marker (the renderers strip
+    it), so wrapping never changes line layout — it only adds the link.
+    """
+    return f"[[term:{key}]]{text}[[/term]]" if mark else text
+

 # --------------------------------------------------------------------------- #
 # Formatting helpers (mirror the other chapters' defensive style).
@@ -525,15 +554,18 @@ def _sections_live(profile: dict, ctx: dict, candidates: dict) -> list:
 # --------------------------------------------------------------------------- #
 # Entry point.
 # --------------------------------------------------------------------------- #
-def _intro_blocks() -> list:
+def _intro_blocks(gloss=None, mark_term: bool = False) -> list:
+    if gloss is not None:
+        for key, (label, definition) in _TERM_DEFS.items():
+            gloss.add(key, label, definition)
+    t_groupby = _term(mark_term, "groupby", "**por grupos** (split-apply-combine)")
+    t_pivot = _term(mark_term, "pivot_table", "**tablas dinámicas** (pivot)")
    text = (
-        "Este capítulo analiza la tabla **por grupos** (split-apply-combine): "
-        "elige las columnas categóricas más informativas — por su cardinalidad "
-        "y relevancia, no todas contra todas, para no inflar comparaciones "
-        "espurias — y resume las variables numéricas dentro de cada grupo "
-        "(conteo, media, mediana, desviación). Las **tablas dinámicas** (pivot) "
-        "cruzan dos categóricas sobre una medida, y los **gráficos de barras** "
-        "(siempre desde cero) comparan los grupos de un vistazo."
+        f"Este capítulo analiza la tabla {t_groupby}: elige las columnas "
+        "categóricas más informativas (por cardinalidad y relevancia, no todas "
+        "contra todas) y resume las variables numéricas dentro de cada grupo "
+        f"(conteo, media, mediana, desviación). Se añaden {t_pivot} y "
+        "**gráficos de barras** (siempre desde cero) para comparar los grupos."
    )
    return [model.Heading(text=CHAPTER_TITLE, level=1),
            model.Markdown(text=text)]
@@ -556,13 +588,21 @@ def build_agregacion(profile: dict, ctx: dict):
    if not isinstance(profile, dict):
        return None

+    # Shared glossary collector: groupby + pivot_table live in the always-present
+    # intro, so they are registered + marked there. Degrades silently (mark_term
+    # False) when no collector is in ctx (standalone render).
+    glossary = ctx.get("glossary")
+    gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
+    mark_term = gloss is not None
+
    # Pre-computed results take precedence (offline / tests / forward-compat).
    pre = ctx.get("aggregations")
    if _is_dict(pre) and (pre.get("groupby") or pre.get("pivots")):
        sections = _sections_from_precomputed(pre)
        if not sections:
            return None
-        blocks = _intro_blocks() + sections + _insights_section(ctx)
+        blocks = (_intro_blocks(gloss, mark_term) + sections
+                  + _insights_section(ctx))
        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                             version=CHAPTER_VERSION, blocks=blocks)

@@ -583,10 +623,11 @@ def build_agregacion(profile: dict, ctx: dict):
            "crudos. Pasa ctx['db_path'] + ctx['table'] (para el cálculo "
            "push-down en DuckDB) o ctx['aggregations'] ya precalculado. "
            f"Columnas categóricas candidatas: {keys or '—'}.")
-        blocks = _intro_blocks() + [note] + _insights_section(ctx)
+        blocks = (_intro_blocks(gloss, mark_term) + [note]
+                  + _insights_section(ctx))
        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                             version=CHAPTER_VERSION, blocks=blocks)

-    blocks = _intro_blocks() + sections + _insights_section(ctx)
+    blocks = _intro_blocks(gloss, mark_term) + sections + _insights_section(ctx)
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -254,3 +254,25 @@ def test_anti_corte_muchos_grupos_y_texto_largo():
        # First, middle and last words of the long paragraph all present.
        for i in (0, 60, 119):
            assert f"palabra{i}" in txt
+
+
+def test_glosario_engancha_groupby_y_pivot():
+    """Mejora 4b: la agrupación (split-apply-combine) y la tabla dinámica (pivot)
+    se registran en el colector compartido y se marcan clicables en el cuerpo.
+    Sin colector en ctx, el capítulo degrada y no marca nada."""
+    from datascience.automatic_eda.model import GlossaryCollector
+
+    g = GlossaryCollector()
+    ctx = dict(_ctx_precomputed())
+    ctx["glossary"] = g
+    ch = build_agregacion(_profile(), ctx)
+    assert ch is not None
+    keys = {t["key"] for t in g.terms()}
+    assert {"groupby", "pivot_table"} <= keys
+    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
+    assert "[[term:groupby]]" in body and "[[term:pivot_table]]" in body
+
+    # Sin colector: degrada limpio (ningún marcador en el cuerpo).
+    ch2 = build_agregacion(_profile(), _ctx_precomputed())
+    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
+    assert "[[term:" not in body2
@@ -42,7 +42,11 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables
+# no longer carry a ``title`` (the section Heading labels them once, per the
+# OVERVIEW pattern in the contract). The data-dictionary column already reads
+# "Significado de negocio".
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "analisis_llm"
 CHAPTER_TITLE = "Análisis LLM"

@@ -118,6 +122,11 @@ def _dictionary_block(llm: dict):
    Columns: Columna / Descripción / Significado de negocio / Unidad. The
    paginator splits this by rows repeating the header and wraps long cells, so a
    long dictionary (many columns) never gets cut.
+
+    The block carries **no** ``title``: the section is labelled once by the
+    ``Heading`` that ``build_analisis_llm`` appends right before it (the canonical
+    OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would
+    print "Diccionario de datos" twice in a row.
    """
    entries = llm.get("dictionary")
    if not isinstance(entries, (list, tuple)) or not entries:
@@ -137,7 +146,7 @@ def _dictionary_block(llm: dict):
        ])
    if not rows:
        return None
-    return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
+    return model.DataTable(header=header, rows=rows)


 def _analyses_blocks(llm: dict) -> list:
@@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list:


 def _pii_block(llm: dict):
-    """DataTable for PII/GDPR findings, or None if absent/empty."""
+    """DataTable for PII/GDPR findings, or None if absent/empty.
+
+    Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in
+    ``build_analisis_llm`` labels the section once); it keeps its ``note`` with
+    the orientative-detection caveat, which the renderers print under the table.
+    """
    entries = llm.get("pii")
    if not isinstance(entries, (list, tuple)) or not entries:
        return None
@@ -176,7 +190,7 @@ def _pii_block(llm: dict):
    if not rows:
        return None
    return model.DataTable(
-        header=header, rows=rows, title="Datos personales (PII / RGPD)",
+        header=header, rows=rows,
        note="detección automática orientativa — revisar antes de tratar los datos")


@@ -24,7 +24,7 @@ from pptx import Presentation
 from datascience.automatic_eda.chapters.analisis_llm import (
    build_analisis_llm, CHAPTER_VERSION)
 from datascience.automatic_eda.chapters_registry import build_document
-from datascience.automatic_eda.model import Chapter, DataTable
+from datascience.automatic_eda.model import Chapter, DataTable, Heading
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx

@@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx():
        assert "DESCTOKEN" in ptx


+def test_sin_rotulos_duplicados_y_significado_de_negocio():
+    """The dictionary / PII sections must be labelled ONCE.
+
+    Regression for the duplicated 'Diccionario de datos' and 'Datos personales
+    (PII / RGPD)' headings (each section used to print its label twice: a Heading
+    plus the DataTable's own title). The fix drops the DataTable title and keeps
+    a single Heading — the OVERVIEW pattern. The data-dictionary column header is
+    also pinned to the exact text 'Significado de negocio'.
+    """
+    ch = build_analisis_llm(_profile(), {})
+    assert ch is not None
+
+    # Structure: section labels come from Headings; tables carry no title.
+    headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
+    assert headings.count("Diccionario de datos") == 1
+    assert headings.count("Datos personales (PII / RGPD)") == 1
+    for b in ch.blocks:
+        if isinstance(b, DataTable):
+            assert not b.title, f"DataTable should not duplicate the label: {b.title!r}"
+
+    # The data dictionary's third column reads exactly 'Significado de negocio'.
+    dicts = [b for b in ch.blocks if isinstance(b, DataTable) and "Descripción" in b.header]
+    assert dicts, "expected the data-dictionary DataTable"
+    assert dicts[0].header == ["Columna", "Descripción", "Significado de negocio", "Unidad"]
+
+    # The PII table keeps its orientative-detection note.
+    pii = [b for b in ch.blocks if isinstance(b, DataTable) and b.header == ["Columna", "Tipo", "Severidad"]]
+    assert pii and pii[0].note and "orientativa" in pii[0].note
+
+    # Render: each label appears exactly once across the whole document (the only
+    # 'Diccionario de datos' / 'Datos personales' producer is this chapter).
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "eda.pdf")
+        render_automatic_eda_pdf(_profile(), out_pdf, {"title": "EDA — ventas"})
+        txt = _pdf_text(out_pdf)
+        assert txt.count("Diccionario de datos") == 1
+        assert txt.count("Datos personales") == 1
+
+
 def test_orden_capitulo_junto_a_overview():
    chapters = build_document(_profile(), {})
    ids = [c.id for c in chapters]
@@ -1,22 +1,27 @@
 """Data-quality chapter (CALIDAD) for AutomaticEDA.

 Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
-chapter answers, in Spanish and as tables, the three things the user asked for:
+chapter implements the quality model of report 2046:

-1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
-   their weights (completeness, validity, consistency) before any number, plus a
-   table-level summary (global score and aggregates).
+1. **En qué se basa la calidad** — a concise intro naming the two scored
+   dimensions and their weights (completitud 60%, validez 40%) plus the
+   table-level row uniqueness, BEFORE any number, and stating that outliers are
+   reported as observations and do **not** lower the score. The criteria terms
+   (calidad de datos, completitud, validez, unicidad de registro) are hooked
+   into the shared glossary as clickable jumps; their full definitions live in
+   the GLOSARIO chapter, not inline here.
 2. **Scores por columna** — a table with, per column, the total quality score and
-   its breakdown into completeness / validity / consistency.
-3. **Problemas en español** — a second table listing, per column, the readable
-   issues in Spanish (kept separate from the type ``flags``).
+   its breakdown into completeness / validity (no consistency dimension).
+3. **Problemas de calidad** — a table listing ONLY real quality defects
+   (nulls, empty cells, values not conforming to their type/semantics).
+4. **Observaciones analíticas** — a SEPARATE table for outliers, constant
+   columns, high-cardinality ids and strong skew, with an explicit note that
+   these do not affect the score.

-The breakdown and the issues are NOT recomputed here: they come from the registry
-function ``column_quality_score`` (group ``eda``), which already derives
-``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
-This chapter is render-only — it consumes that function and lays the result out
-as model blocks; the renderers paginate tables (splitting by rows, repeating the
-header) and wrap long cells so nothing is ever cut.
+The breakdown, issues and observations are NOT recomputed here: they come from
+the registry function ``column_quality_score`` (group ``eda``), which derives
+``{score, completeness, validity, dimensions, applicable, issues,
+observations}`` from the ColumnProfile. This chapter is render-only.

 Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
 """
@@ -33,28 +38,47 @@ try:  # pragma: no cover - import wiring
 except Exception:  # noqa: BLE001 - never let an import error abort the document.
    _column_quality_score = None

-CHAPTER_VERSION = "1.0.0"
+CHAPTER_VERSION = "2.0.0"
 CHAPTER_ID = "calidad"
 CHAPTER_TITLE = "Calidad"

-# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
-# consistency 0.2. Kept here only to render the human explanation; the actual
-# numbers always come from the function so the two never drift in computation.
-_CRITERIA_INTRO = (
-    "La calidad de cada columna es un score de 0 a 100 que combina tres "
-    "criterios, cada uno con un peso:\n\n"
-    "- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
-    "ni vacíos). Una columna con muchos nulos baja de score.\n"
-    "- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
-    "esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
-    "- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
-    "columnas constantes o identificadores de cardinalidad muy alta).\n\n"
-    "Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
-    "Los problemas detectados por columna se listan en español más abajo."
-)
+# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
+# collector and marked clickable on their first appearance (contract §11.1).
+_TERMS = {
+    "calidad_datos": (
+        "Calidad de datos (score 0-100)",
+        "Mide hasta qué punto los datos están presentes y son utilizables tal "
+        "cual, no si son «buenos para el análisis». Se compone solo de "
+        "dimensiones medibles automáticamente desde el perfil de la tabla, sin "
+        "fuente externa de verdad: completitud (60%), validez (40%, cuando es "
+        "medible) y, a nivel de tabla, unicidad de registro. Los valores "
+        "atípicos NO bajan la calidad: se listan aparte como observaciones.",
+    ),
+    "completitud": (
+        "Completitud",
+        "Proporción de valores realmente presentes en una columna (1 − % de "
+        "nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
+        "nulos y vacíos bajan el score porque falta información que debería "
+        "estar. Pesa el 60% del score de columna.",
+    ),
+    "validez": (
+        "Validez",
+        "Proporción de valores que encajan con su tipo o formato esperado: un "
+        "número que parsea, una fecha legible, un email con forma de email. Los "
+        "valores que no parsean a su tipo bajan el score. Si la columna es texto "
+        "libre sin formato esperado, la validez no se puede medir y el score se "
+        "basa solo en la completitud. Pesa el 40% del score cuando es medible.",
+    ),
+    "unicidad_registro": (
+        "Unicidad de registro",
+        "A nivel de tabla, las filas duplicadas restan calidad al conjunto "
+        "(1 − % de filas duplicadas). Es distinta de que una columna no-clave "
+        "repita valores, que no es un defecto de calidad.",
+    ),
+}

-# Cap for the joined issues cell so a single row never grows taller than a page;
-# the remainder is summarized as "(+N más)" instead of being silently dropped.
+# Cap for the joined cell so a single row never grows taller than a page; the
+# remainder is summarized as "(+N más)" instead of being silently dropped.
 _ISSUES_MAXLEN = 160


@@ -82,12 +106,19 @@ def _fmt_unit_pct(value) -> str:
        return str(value)


+def _fmt_validity(value) -> str:
+    """Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
+    if value is None:
+        return "n/a"
+    return _fmt_unit_pct(value)
+
+
 def _quality_of(col: dict) -> dict:
-    """Return ``{score, completeness, validity, consistency, issues}`` for a column.
+    """Return the quality dict for a column.

    Uses the registry ``column_quality_score`` when available; otherwise falls
    back to the per-column ``quality_score`` already in the profile (number only,
-    empty breakdown/issues). Never raises.
+    empty breakdown/issues/observations). Never raises.
    """
    if not isinstance(col, dict):
        col = {}
@@ -98,26 +129,25 @@ def _quality_of(col: dict) -> dict:
                return res
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            pass
-    # Fallback: only the final score is available pre-computed in the profile.
    return {
        "score": col.get("quality_score"),
        "completeness": None,
        "validity": None,
-        "consistency": None,
        "issues": [],
+        "observations": [],
    }


-def _join_issues(issues) -> str:
-    """Join Spanish issue strings into one cell, truncating overly long lists.
+def _join_cells(items) -> str:
+    """Join Spanish strings into one cell, truncating overly long lists.

-    The renderer wraps cell text, but a column with many long issues could make a
-    single row taller than a whole page; cap the length and append ``(+N más)``
-    so the count of hidden issues is honest rather than silently lost.
+    The renderer wraps cell text, but a column with many long entries could make
+    a single row taller than a whole page; cap the length and append ``(+N más)``
+    so the count of hidden entries is honest rather than silently lost.
    """
-    if not isinstance(issues, (list, tuple)) or not issues:
+    if not isinstance(items, (list, tuple)) or not items:
        return ""
-    parts = [model._safe_str(i).strip() for i in issues]
+    parts = [model._safe_str(i).strip() for i in items]
    parts = [p for p in parts if p]
    if not parts:
        return ""
@@ -142,6 +172,33 @@ def _columns_with_quality(profile: dict):
            yield c, _quality_of(c)


+def _fmt_unit_pct_or_pct(value) -> str:
+    """Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
+    try:
+        num = float(value)
+    except (TypeError, ValueError):
+        return model._safe_str(value)
+    if num != num:  # NaN
+        return "—"
+    pct = num * 100 if num <= 1.0 else num
+    text = f"{pct:.1f}".rstrip("0").rstrip(".")
+    return f"{text}%"
+
+
+def _row_uniqueness(profile: dict):
+    """Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown."""
+    dup = profile.get("duplicate_pct")
+    if dup is None:
+        return None
+    try:
+        d = float(dup)
+    except (TypeError, ValueError):
+        return None
+    if d > 1.0:  # tolerate a 0-100 scale
+        d = d / 100.0
+    return max(0.0, min(1.0, 1.0 - d))
+
+
 def _summary_block(profile: dict, evaluated: list):
    """Table-level KVTable: global score and quality aggregates."""
    rows = []
@@ -153,14 +210,15 @@ def _summary_block(profile: dict, evaluated: list):
             if isinstance(q.get("completeness"), (int, float))]
    vals = [q.get("validity") for _, q in evaluated
            if isinstance(q.get("validity"), (int, float))]
-    cons = [q.get("consistency") for _, q in evaluated
-            if isinstance(q.get("consistency"), (int, float))]
    if comps:
        rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
    if vals:
-        rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals))))
-    if cons:
-        rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons))))
+        rows.append(("Validez media (donde aplica)",
+                     _fmt_unit_pct(sum(vals) / len(vals))))
+
+    ru = _row_uniqueness(profile)
+    if ru is not None:
+        rows.append(("Unicidad de registro", _fmt_unit_pct(ru)))

    n_problem = sum(1 for _, q in evaluated if q.get("issues"))
    rows.append(("Columnas con problemas", str(n_problem)))
@@ -182,22 +240,9 @@ def _summary_block(profile: dict, evaluated: list):
    return model.KVTable(rows=rows, title="Resumen de calidad")


-def _fmt_unit_pct_or_pct(value) -> str:
-    """Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
-    try:
-        num = float(value)
-    except (TypeError, ValueError):
-        return model._safe_str(value)
-    if num != num:  # NaN
-        return "—"
-    pct = num * 100 if num <= 1.0 else num
-    text = f"{pct:.1f}".rstrip("0").rstrip(".")
-    return f"{text}%"
-
-
 def _scores_block(evaluated: list):
-    """DataTable with per-column score and its three-criteria breakdown."""
-    header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
+    """DataTable with per-column score and its completeness/validity breakdown."""
+    header = ["Columna", "Calidad", "Completitud", "Validez"]
    rows = []
    # Worst columns first so the reader sees the problems at the top.
    ordered = sorted(
@@ -210,22 +255,22 @@ def _scores_block(evaluated: list):
            col.get("name") or "(col)",
            _fmt_score(q.get("score")),
            _fmt_unit_pct(q.get("completeness")),
-            _fmt_unit_pct(q.get("validity")),
-            _fmt_unit_pct(q.get("consistency")),
+            _fmt_validity(q.get("validity")),
        ])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows,
                           title="Scores de calidad por columna",
-                           note="0 = peor, 100 = mejor; ordenado de peor a mejor")
+                           note="0 = peor, 100 = mejor; «n/a» = dimensión no "
+                                "medible; ordenado de peor a mejor")


 def _issues_block(evaluated: list):
-    """DataTable listing Spanish issues per column, or a Note when there are none."""
-    header = ["Columna", "Problemas detectados (español)"]
+    """DataTable listing ONLY real quality defects per column, or a Note."""
+    header = ["Columna", "Problemas de calidad (español)"]
    rows = []
    for col, q in evaluated:
-        joined = _join_issues(q.get("issues"))
+        joined = _join_cells(q.get("issues"))
        if joined:
            rows.append([col.get("name") or "(col)", joined])
    if not rows:
@@ -235,6 +280,55 @@ def _issues_block(evaluated: list):
                           title="Problemas de calidad por columna")


+def _observations_block(evaluated: list):
+    """DataTable listing analytical observations per column, or None.
+
+    Observations (outliers, constant columns, ids, strong skew) are NOT quality
+    defects: they do not affect the score. Returned as a separate table from the
+    issues so the report never presents a legitimate outlier as a problem.
+    """
+    header = ["Columna", "Observaciones analíticas"]
+    rows = []
+    for col, q in evaluated:
+        joined = _join_cells(q.get("observations"))
+        if joined:
+            rows.append([col.get("name") or "(col)", joined])
+    if not rows:
+        return None
+    return model.DataTable(
+        header=header, rows=rows,
+        title="Observaciones analíticas por columna",
+        note="No son defectos de calidad y NO afectan al score; orientan el "
+             "análisis (atípicos, columnas constantes, identificadores).")
+
+
+def _term(key: str, label: str, mark: bool) -> str:
+    """Render a term as a clickable glossary span when marking is enabled."""
+    if mark:
+        return f"[[term:{key}]]**{label}**[[/term]]"
+    return f"**{label}**"
+
+
+def _criteria_intro(mark: bool) -> str:
+    """Intro: how the score is composed, with every term marked clickable.
+
+    Concise on purpose: the definitions of each term (calidad de datos,
+    completitud, validez, unicidad de registro) now live in the GLOSARIO
+    chapter, so the body no longer repeats them — it only states how the score
+    is composed and keeps each term marked so it stays a clickable jump.
+    """
+    calidad = _term("calidad_datos", "calidad de datos", mark)
+    completitud = _term("completitud", "completitud", mark)
+    validez = _term("validez", "validez", mark)
+    unicidad = _term("unicidad_registro", "unicidad de registro", mark)
+    return (
+        f"La {calidad} de cada columna es un score de 0 a 100 que combina "
+        f"{completitud} (peso 60%) y {validez} (peso 40%, cuando es medible); "
+        f"a nivel de tabla se añade la {unicidad}. Los valores atípicos no "
+        "bajan el score: se listan aparte como **observaciones analíticas**."
+    )
+
+
 def build_calidad(profile: dict, ctx: dict):
    """Build the data-quality Chapter, or None if the profile has no columns.

@@ -250,17 +344,35 @@ def build_calidad(profile: dict, ctx: dict):
    if not evaluated:
        return None  # no columns to score -> chapter does not apply.

+    # Register the criteria terms in the shared glossary (if present) and mark
+    # their first appearance clickable. Contract §11.1.
+    glossary = ctx.get("glossary")
+    mark = False
+    if isinstance(glossary, model.GlossaryCollector):
+        for key, (label, definition) in _TERMS.items():
+            glossary.add(key, label, definition)
+        mark = True
+
    blocks = [
        model.Heading(text="Cómo se calcula la calidad", level=2),
-        model.Markdown(text=_CRITERIA_INTRO),
+        model.Markdown(text=_criteria_intro(mark)),
        _summary_block(profile, evaluated),
        model.Heading(text="Scores por columna", level=2),
    ]
    scores = _scores_block(evaluated)
    if scores is not None:
        blocks.append(scores)
-    blocks.append(model.Heading(text="Problemas detectados", level=2))
+
+    blocks.append(model.Heading(text="Problemas de calidad", level=2))
    blocks.append(_issues_block(evaluated))

+    observations = _observations_block(evaluated)
+    if observations is not None:
+        blocks.append(model.Heading(text="Observaciones analíticas", level=2))
+        blocks.append(model.Note(
+            "Las observaciones siguientes NO son defectos de calidad y no "
+            "afectan al score: son señales para orientar el análisis."))
+        blocks.append(observations)
+
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -1,11 +1,12 @@
-"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
+"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut + glossary.

 Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
-and deterministic. Verifies that the chapter explains the quality criteria, shows
-per-column scores with the completeness/validity/consistency breakdown, lists the
-issues in Spanish (separate from the type flags), returns None when it does not
-apply, and that a wide profile with long names renders to PDF and PPTX without
-cutting any cell text (long content wraps, it is never truncated).
+and deterministic. Verifies the report-2046 quality model: the chapter explains
+the two scored dimensions (completitud 60% / validez 40%), shows per-column
+scores without a consistency column, keeps quality DEFECTS (issues) separate
+from analytical OBSERVATIONS (outliers, constant, ids), hooks the criteria terms
+into the glossary, returns None when it does not apply, and renders a wide
+profile to PDF and PPTX without cutting any cell text.
 """

 import os
@@ -20,28 +21,30 @@ from datascience.automatic_eda.chapters.calidad import (
    CHAPTER_VERSION,
 )
 from datascience.automatic_eda import build_document, render_pdf, render_pptx
+from datascience.automatic_eda import model


 def _profile() -> dict:
    """A small profile with one column per quality problem (nulls, outliers,
-    constant, high-cardinality id) plus one clean column."""
+    constant, high-cardinality id) plus one clean column. ``outlier_pct`` is in
+    the 0-100 scale that describe_numeric actually emits."""
    return {
        "table": "demo",
-        "quality_score": 72.5,
+        "quality_score": 82.0,
        "duplicate_pct": 0.04,
        "null_cell_pct": 0.11,
        "constant_cols": ["flag_const"],
        "all_null_cols": [],
        "columns": [
-            {"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
-             "numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
-             "quality_score": 60},
+            {"name": "edad", "inferred_type": "numeric", "null_pct": 0.2,
+             "n_rows": 100, "unique_pct": 0.5,
+             "numeric": {"outlier_pct": 15.0, "min": 0, "max": 99}},
            {"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
-             "unique_pct": 0.98, "quality_score": 80},
+             "unique_pct": 0.98, "flags": ["possible_id"]},
            {"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
-             "flags": ["constant"], "quality_score": 50},
-            {"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
-             "numeric": {"outlier_pct": 0.0}, "quality_score": 100},
+             "unique_pct": 0.01, "flags": ["constant"]},
+            {"name": "limpia", "inferred_type": "numeric", "null_pct": 0.0,
+             "unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
        ],
    }

@@ -50,16 +53,9 @@ def _tables(chapter):
    return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]


-def _scores_table(chapter):
+def _table_by_title(chapter, needle):
    for t in _tables(chapter):
-        if "Scores" in (t.title or ""):
-            return t
-    return None
-
-
-def _issues_table(chapter):
-    for t in _tables(chapter):
-        if "Problemas" in (t.title or ""):
+        if needle in (t.title or ""):
            return t
    return None

@@ -73,41 +69,86 @@ def test_golden_chapter_estructura_y_version():
    assert ch.id == "calidad"
    assert ch.version == CHAPTER_VERSION
    kinds = [b.kind for b in ch.blocks]
-    # intro heading + markdown criteria + summary kv + scores table + issues table
    assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds


-def test_golden_intro_explica_criterios_y_pesos():
+def test_golden_intro_nombra_dos_dimensiones_y_pesos():
+    # La intro nombra las dos dimensiones, sus pesos y la unicidad, pero ya NO
+    # repite sus definiciones largas: estas viven ahora en el capítulo GLOSARIO.
    ch = build_calidad(_profile(), {})
    intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
-    for needle in ("Completitud", "Validez", "Consistencia",
-                   "50%", "30%", "20%"):
+    for needle in ("completitud", "validez", "60%", "40%",
+                   "unicidad de registro"):
        assert needle in intro, f"falta {needle!r} en la intro de criterios"
+    # El principio: los outliers NO bajan la calidad.
+    assert "atípicos" in intro and "no bajan" in intro
+    # Ya no se menciona la dimensión consistencia eliminada.
+    assert "20%" not in intro


-def test_golden_scores_incluyen_desglose_por_criterio():
+def test_golden_scores_sin_columna_consistencia():
    ch = build_calidad(_profile(), {})
-    scores = _scores_table(ch)
+    scores = _table_by_title(ch, "Scores")
    assert scores is not None
-    assert scores.header == ["Columna", "Calidad", "Completitud",
-                             "Validez", "Consistencia"]
-    # 4 columns scored, none dropped.
+    assert scores.header == ["Columna", "Calidad", "Completitud", "Validez"]
+    assert "Consistencia" not in scores.header
    assert len(scores.rows) == 4
    names = {r[0] for r in scores.rows}
    assert names == {"edad", "nombre", "flag_const", "limpia"}


-def test_golden_issues_en_espanol_separados_de_flags():
+def test_golden_outliers_en_observaciones_no_en_problemas():
    ch = build_calidad(_profile(), {})
-    issues = _issues_table(ch)
-    assert issues is not None
-    flat = " | ".join(" ".join(r) for r in issues.rows)
-    assert "nulos" in flat            # completeness issue (ES)
-    assert "outliers" in flat         # validity issue (ES)
-    assert "columna constante" in flat
-    assert "posible id de alta cardinalidad" in flat
-    # The raw type flag string must NOT leak as a "problem".
-    assert "constant" not in flat or "columna constante" in flat
+    problemas = _table_by_title(ch, "Problemas de calidad")
+    observaciones = _table_by_title(ch, "Observaciones")
+    assert problemas is not None
+    assert observaciones is not None
+
+    problemas_txt = " | ".join(" ".join(r) for r in problemas.rows)
+    observaciones_txt = " | ".join(" ".join(r) for r in observaciones.rows)
+
+    # Los nulos SÍ son problema de calidad.
+    assert "nulos" in problemas_txt
+    # Los outliers NO aparecen como problema...
+    assert "atípic" not in problemas_txt and "outlier" not in problemas_txt
+    # ...sino como observación analítica.
+    assert "atípic" in observaciones_txt
+    # Constante e id: observaciones, no problemas.
+    assert "constante" in observaciones_txt
+    assert "identificador" in observaciones_txt
+    assert "constante" not in problemas_txt
+
+
+def test_golden_score_columna_limpia_es_100():
+    """A null-free native numeric column scores 100 with or without outliers."""
+    ch = build_calidad(_profile(), {})
+    scores = _table_by_title(ch, "Scores")
+    by_name = {r[0]: r for r in scores.rows}
+    assert by_name["limpia"][1] == "100 / 100"
+    # edad: 20% nulls -> 100*(0.6*0.8 + 0.4*1.0) = 88; outliers cost nothing.
+    assert by_name["edad"][1] == "88 / 100"
+
+
+# --------------------------------------------------------------------------- #
+# Glosario (contrato §11.1)
+# --------------------------------------------------------------------------- #
+def test_glosario_registra_los_cuatro_terminos_y_marca_clicable():
+    glossary = model.GlossaryCollector()
+    ch = build_calidad(_profile(), {"glossary": glossary})
+    for key in ("calidad_datos", "completitud", "validez", "unicidad_registro"):
+        assert glossary.has(key), f"término {key!r} no registrado en el glosario"
+    intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
+    # With a collector in ctx, each criteria term is wrapped in a term span.
+    assert "[[term:completitud]]" in intro
+    assert "[[term:validez]]" in intro
+    assert "[[term:calidad_datos]]" in intro
+    assert "[[term:unicidad_registro]]" in intro
+
+
+def test_sin_glosario_no_marca_terminos():
+    ch = build_calidad(_profile(), {})  # ctx without a glossary collector
+    intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
+    assert "[[term:" not in intro


 # --------------------------------------------------------------------------- #
@@ -124,17 +165,17 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
    prof = {
        "quality_score": 100,
        "columns": [
-            {"name": "a", "inferred_type": "float", "null_pct": 0.0,
-             "numeric": {"outlier_pct": 0.0}},
-            {"name": "b", "inferred_type": "float", "null_pct": 0.0,
-             "numeric": {"outlier_pct": 0.0}},
+            {"name": "a", "inferred_type": "numeric", "null_pct": 0.0,
+             "unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
+            {"name": "b", "inferred_type": "numeric", "null_pct": 0.0,
+             "unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
        ],
    }
    ch = build_calidad(prof, {})
    assert ch is not None
-    assert _issues_table(ch) is None  # no issues table
+    assert _table_by_title(ch, "Problemas de calidad") is None  # no issues table
    notes = [b for b in ch.blocks if b.kind == "note"]
-    assert notes and "No se detectaron problemas" in notes[0].text
+    assert any("No se detectaron problemas" in n.text for n in notes)


 # --------------------------------------------------------------------------- #
@@ -143,44 +184,42 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
 def _wide_profile(ncols: int = 22) -> dict:
    cols = [
        {"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
-         "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
+         "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99,
+         "flags": ["possible_id"]},
        {"name": "columna_constante_sin_ninguna_variacion_de_valor",
-         "inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
+         "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.01,
+         "flags": ["constant"]},
    ]
    for k in range(ncols - 2):
        cols.append({
            "name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
-            "inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
-            "numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
+            "inferred_type": "numeric", "null_pct": 0.1 + (k % 3) * 0.05,
+            "unique_pct": 0.5,
+            "numeric": {"outlier_pct": 8.0, "min": 0, "max": 1000},
        })
-    return {"table": "ancha", "quality_score": 70.0, "columns": cols}
+    return {"table": "ancha", "quality_score": 70.0, "duplicate_pct": 0.0,
+            "columns": cols}


 def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
    prof = _wide_profile(22)
    full = build_document(prof, {"dataset_name": "ancha"})
    assert any(c.id == "calidad" for c in full)
-    # Render ONLY the calidad chapter so the anti-cut assertions are scoped to
-    # this chapter (other chapters, e.g. portada, legitimately contain '…').
    chapters = [c for c in full if c.id == "calidad"]
    long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "q.pdf")
        pptx = os.path.join(d, "q.pptx")
        rp = render_pdf(chapters, pdf, {"title": "EDA"})
-        rx = render_pptx(chapters, pptx, {"title": "EDA"})
+        render_pptx(chapters, pptx, {"title": "EDA"})
        assert os.path.exists(pdf) and os.path.exists(pptx)
-        # The wide table forces pagination across several pages/slides.
        assert (rp or {}).get("n_pages", 0) >= 2

-        # PDF: the long name survives whole once wraps (spaces/newlines) removed,
-        # and there is no truncation marker.
        pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
        assert "…" not in pdf_txt and "..." not in pdf_txt
        norm = re.sub(r"\s+", "", pdf_txt)
        assert long_name in norm, "el nombre largo se cortó en el PDF"

-        # PPTX: long name present in some cell, untruncated.
        allt = []
        for s in Presentation(pptx).slides:
            for sh in s.shapes:
@@ -1,19 +1,25 @@
 """Categorical distributions chapter (CAT DISTR).

-Third reference chapter for AutomaticEDA. For every categorical column it shows,
-fulfilling the user's request:
+Third reference chapter for AutomaticEDA. Each categorical column gets **its own
+page (PDF) / slide (PPTX)**: every column is wrapped in a keep-together
+``model.Group`` with ``page_break_before=True`` (except the first, which may share
+the intro's page), so its chart sits next to its tables and no column is split.

-1. A short opening explanation of **Shannon entropy** (what it measures, its 0
-   and log2(k) bounds, the normalized 0–1 version) and the dataset row total used
-   as a comparison baseline.
-2. Per column, a cardinality key/value table: distinct values, ``% distinct``
-   (distinct / total rows), total dataset rows, singleton values (frequency 1),
-   entropy with its theoretical maximum and the normalized ratio, mode, imbalance
-   and string-length stats.
-3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
+A short intro names the clickable ``[[term:entropia]]entropía[[/term]]`` term —
+the full definition lives in the GLOSARIO chapter, so it is NOT repeated inline
+here (one click jumps to the glossary entry). The intro also carries the dataset
+row total used as a comparison baseline.
+
+Per column the Group contains, in order:
+
+1. A cardinality key/value table: distinct values, ``% distinct`` (distinct /
+   total rows), total dataset rows, singleton values (frequency 1), entropy with
+   its theoretical maximum and the normalized ratio, mode, imbalance and
+   string-length stats.
+2. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
   single dominating category).
-4. A ``top-k`` table (value / count / %).
-5. A **donut pie chart** of the most common categories (top-k + an "Otros"
+3. A ``top-k`` table (value / count / %).
+4. A **donut pie chart** of the most common categories (top-k + an "Otros"
   bucket), drawn lazily so the renderers scale it to fit entirely.

 Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the
@@ -33,7 +39,7 @@ import math

 from .. import model

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "cat_distr"
 CHAPTER_TITLE = "Distribuciones categóricas"

@@ -53,11 +59,17 @@ _TERM_ENTROPIA_DEF = (
 # Cap the number of categorical columns rendered to keep the document bounded;
 # the rest are summarized in a closing note (no silent truncation).
 MAX_COLS = 40
-# Rows shown in each top-k table and explicit slices in the pie.
-TOP_TABLE_ROWS = 15
+# Rows shown in each top-k table and explicit slices in the pie. Kept moderate so
+# the whole column — cardinality table + top-k table + donut — fits on ONE
+# page/slide with the chart next to its tables; the table note still reports
+# "top N of M" so nothing is silently hidden. For id-like columns (≈100%
+# distinct) the top-k table is dropped entirely (it would be a list of unique
+# values — pure noise), which also frees the room the donut needs (see build).
+TOP_TABLE_ROWS = 8
 PIE_TOP_K = 6
-# Truncate very long category labels in tables (the renderer also wraps).
-LABEL_MAX = 48
+# Truncate very long category labels in tables (the renderer also wraps). Kept
+# tight so a column with long id-like values (names, tickets) still fits its page.
+LABEL_MAX = 28


 def _fmt_int(value) -> str:
@@ -267,45 +279,55 @@ def _normalize_card(card: dict) -> dict:


 def _cardinality_block(card: dict):
-    """KVTable with the cardinality / entropy metrics for one column."""
+    """KVTable with the cardinality / entropy metrics for one column.
+
+    Related metrics are grouped onto a single row each (distinct/%/unique;
+    entropy bits/max/normalized; length min/mean/max) so the whole column —
+    table + chart — fits one page/slide without dropping any datum; the short
+    16:9 PPTX slide does not fit one metric per row plus a chart otherwise."""
    n_singletons = card.get("n_singletons")
    if n_singletons is not None and card.get("n_singletons_partial"):
-        singletons = f"≥{_fmt_int(n_singletons)} (en top mostrado)"
+        singletons = f"≥{_fmt_int(n_singletons)}"
    elif n_singletons is not None:
        singletons = _fmt_int(n_singletons)
    else:
        singletons = "—"

-    entropy_ref = _fmt_num(card.get("entropy"))
-    emax = card.get("entropy_max")
-    if emax is not None:
-        entropy_ref = f"{entropy_ref} (máx {_fmt_num(emax)})"
+    # Distinct count · % distinct · unique (frequency 1) on one row.
+    distinct_combo = (f"{_fmt_int(card.get('n_distinct'))} · "
+                      f"{_fmt_pct_value(card.get('pct_distinct'))} · "
+                      f"{singletons} únicos")
+
+    # Entropy bits · theoretical max · normalized 0–1 on one row.
+    entropy_combo = (f"{_fmt_num(card.get('entropy'))} bits · "
+                     f"máx {_fmt_num(card.get('entropy_max'))} · "
+                     f"norm {_fmt_num(card.get('entropy_norm'))}")

    mode = card.get("mode")
    mode_pct = card.get("mode_pct")
-    mode_str = "—" if mode is None else model._safe_str(mode)
+    mode_str = "—" if mode is None else _truncate(mode, 32)
    if mode is not None and mode_pct is not None:
        mode_str = f"{mode_str} ({_fmt_pct_value(mode_pct)})"

    rows = [
-        ("Valores distintos", _fmt_int(card.get("n_distinct"))),
-        ("% distintos", _fmt_pct_value(card.get("pct_distinct"))),
+        ("Distintos · % · únicos", distinct_combo),
        ("Total filas (dataset)", _fmt_int(card.get("n_rows"))),
-        ("Valores únicos (frecuencia 1)", singletons),
-        ("Entropía (bits)", entropy_ref),
-        ("Entropía normalizada (0–1)", _fmt_num(card.get("entropy_norm"))),
+        ("Entropía (bits · máx · norm)", entropy_combo),
        ("Moda", mode_str),
    ]
    imbalance = card.get("imbalance")
-    if imbalance is not None:
-        rows.append(("Desbalance", _fmt_num(imbalance)))
    lm = card.get("len_min")
    lmean = card.get("len_mean")
    lmax = card.get("len_max")
+    # Imbalance and string length (both secondary) share one closing row.
+    extras = []
+    if imbalance is not None:
+        extras.append(f"desbalance {_fmt_num(imbalance)}")
    if any(v is not None for v in (lm, lmean, lmax)):
-        rows.append((
-            "Longitud (mín/media/máx)",
-            f"{_fmt_num(lm)} / {_fmt_num(lmean)} / {_fmt_num(lmax)}"))
+        extras.append(
+            f"long. {_fmt_num(lm)}/{_fmt_num(lmean)}/{_fmt_num(lmax)}")
+    if extras:
+        rows.append(("Desbalance · longitud", " · ".join(extras)))
    return model.KVTable(rows=rows, title="Cardinalidad")


@@ -315,7 +337,8 @@ def _flag_note(card: dict):
        return model.Note(
            "Casi todos los valores son distintos (≈100% distintos): la columna "
            "se comporta como un identificador y aporta poco para agrupar o "
-            "comparar categorías.")
+            "comparar categorías. No se lista el top de categorías (serían "
+            "valores casi todos únicos).")
    if card.get("dominated"):
        mp = card.get("mode_pct")
        mp_str = _fmt_pct_value(mp) if mp is not None else "muy alta"
@@ -335,7 +358,7 @@ def _topk_table(cat: dict):
        if not isinstance(t, dict):
            continue
        rows.append([
-            model._safe_str(t.get("value")),
+            _truncate(t.get("value")),
            _fmt_int(t.get("count")),
            _pct_from_maybe_fraction(t.get("pct")),
        ])
@@ -353,20 +376,16 @@ def _topk_table(cat: dict):
 def _intro_blocks(n_rows, mark_term: bool = False):
    total = _fmt_int(n_rows)
    # Mark the first appearance of the term as a clickable glossary jump when the
-    # term was registered (mark_term). The visible text is identical either way.
-    entropia = ("[[term:entropia]]**entropía de Shannon**[[/term]]" if mark_term
-                else "**entropía de Shannon**")
+    # term was registered (mark_term). The full definition of entropy lives in the
+    # GLOSARIO chapter, so the intro only names the clickable term here instead of
+    # repeating the long explanation (avoids the redundancy with the glossary).
+    entropia = ("[[term:entropia]]entropía[[/term]]" if mark_term
+                else "entropía")
    text = (
-        f"La {entropia} mide cómo de repartidos están los valores de "
-        "una columna categórica, en bits. Vale 0 cuando una sola categoría "
-        "concentra todas las filas (máxima previsibilidad) y alcanza su máximo, "
-        "log2(k) para k categorías distintas, cuando todas aparecen por igual "
-        "(máxima diversidad). La **entropía normalizada** (entropía dividida por "
-        "su máximo) la lleva al rango 0–1 para comparar columnas con distinto "
-        "número de categorías. Para cada columna se muestran los valores "
-        "distintos, el porcentaje que representan sobre el total de filas, los "
-        "valores únicos (que aparecen una sola vez), la tabla de las categorías "
-        "más frecuentes y un gráfico de tarta (donut) de las más comunes."
+        f"Cada columna categórica ocupa su propia página: sus métricas de "
+        f"cardinalidad —incluida la {entropia}—, una nota que señala cardinalidad "
+        "problemática, la tabla de las categorías más frecuentes y un gráfico de "
+        "tarta (donut) de las más comunes, todo junto."
    )
    if n_rows is not None:
        text += f" El dataset tiene {total} filas en total como referencia."
@@ -398,24 +417,37 @@ def build_cat_distr(profile: dict, ctx: dict):
    blocks = list(_intro_blocks(n_rows, mark_term=mark_term))

    rendered = cat_cols[:MAX_COLS]
-    for col in rendered:
+    for idx, col in enumerate(rendered):
        name = col.get("name") or "(columna)"
        cat = col.get("categorical") or {}
        card = _normalize_card(_cardinality(cat, n_rows))

-        blocks.append(model.Heading(text=str(name), level=2))
-        blocks.append(_cardinality_block(card))
+        # One Group per categorical column: heading + cardinality table + flag
+        # note + top-k table + donut figure are kept together and the renderer
+        # starts each on a fresh page/slide (page_break_before) so every column
+        # gets its own page with its chart next to its tables. The first column
+        # may share the intro's page (no forced break) to avoid a near-empty page.
+        col_blocks = [
+            model.Heading(text=str(name), level=2),
+            _cardinality_block(card),
+        ]
        note = _flag_note(card)
        if note is not None:
-            blocks.append(note)
-        topk = _topk_table(cat)
-        if topk is not None:
-            blocks.append(topk)
-        blocks.append(model.Figure(
+            col_blocks.append(note)
+        # For id-like columns (≈100% distinct) the top-k is a list of unique
+        # values — pure noise; skip it (the flag note already explains why) and
+        # let the donut take that room so the whole column fits one page/slide.
+        if not card.get("id_like"):
+            topk = _topk_table(cat)
+            if topk is not None:
+                col_blocks.append(topk)
+        col_blocks.append(model.Figure(
            make=_pie_make(cat.get("top") or [], card.get("n_distinct"),
                           str(name), n_rows),
            caption=(f"Categorías más comunes de «{_truncate(name, 32)}» "
                     "(donut: top-k + «Otros»)")))
+        blocks.append(model.Group(blocks=col_blocks,
+                                  page_break_before=(idx > 0)))

    if len(cat_cols) > len(rendered):
        omitted = len(cat_cols) - len(rendered)
@@ -2,11 +2,14 @@

 Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
 and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user
-asked for (entropy intro, distinct/total/%-distinct/unique metrics, top-k table
-and a donut figure), that the chapter renders inside the full document to both
-PDF and PPTX showing that content, that a profile with no categorical columns
-yields ``None`` without raising, and that long labels / many columns are never
-cut in either output.
+asked for (distinct/total/%-distinct/unique metrics, top-k table and a donut
+figure), that EACH categorical column is wrapped in its own keep-together
+``Group`` that starts on a fresh page/slide (one column per page, chart next to
+its tables), that the long entropy explanation is NOT repeated inline (it lives
+in the glossary — only the clickable term is kept), that the chapter renders
+inside the full document to both PDF and PPTX showing that content, that a
+profile with no categorical columns yields ``None`` without raising, and that
+long labels / many columns are never cut in either output.
 """

 import os
@@ -17,7 +20,8 @@ from pypdf import PdfReader
 from pptx import Presentation

 from datascience.automatic_eda.model import (
-    DataTable, Figure, Heading, KVTable, Note,
+    DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown,
+    Note,
 )
 from datascience.automatic_eda.chapters.cat_distr import (
    CHAPTER_ID, CHAPTER_VERSION, build_cat_distr,
@@ -81,8 +85,20 @@ def _pptx_text(path: str) -> str:
    return re.sub(r"\s+", " ", " ".join(parts))


-def _kinds(chapter):
-    return [b.kind for b in chapter.blocks]
+def _flatten(blocks):
+    """Return ``blocks`` as one flat list, replacing every block whose ``kind``
+    is ``"group"`` by its own (recursively flattened) ``blocks``."""
+    flat = []
+    for block in blocks:
+        if getattr(block, "kind", "") != "group":
+            flat.append(block)
+            continue
+        flat += _flatten(getattr(block, "blocks", []) or [])
+    return flat
+
+
+def _column_groups(chapter):
+    return [b for b in chapter.blocks if isinstance(b, Group)]


 def test_golden_build_cat_distr_emite_bloques_pedidos():
@@ -90,36 +106,101 @@ def test_golden_build_cat_distr_emite_bloques_pedidos():
    assert ch is not None
    assert ch.id == CHAPTER_ID
    assert ch.version == CHAPTER_VERSION
-    kinds = _kinds(ch)
-    # Entropy intro present.
+
+    # Entropy intro present, but the long explanation is gone (it lives in the
+    # glossary now): only the term is named, no log2/normalizada walkthrough.
    headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
    assert any("Entrop" in h for h in headings)
-    md = next(b for b in ch.blocks if b.kind == "markdown")
-    assert "entropía" in md.text.lower() and "log2" in md.text
-    # Cardinality metrics: distinct, total rows, %-distinct, unique values.
-    kv = next(b for b in ch.blocks if isinstance(b, KVTable))
+    md = next(b for b in ch.blocks if isinstance(b, Markdown))
+    assert "entropía" in md.text.lower()
+    assert "log2" not in md.text          # redundant explanation removed.
+    assert "máxima diversidad" not in md.text
+
+    # Per-column blocks are wrapped in keep-together Groups: flatten to inspect.
+    flat = _flatten(ch.blocks)
+    kv = next(b for b in flat if isinstance(b, KVTable))
    labels = [r[0] for r in kv.rows]
-    assert "Valores distintos" in labels
-    assert "% distintos" in labels
+    values = " ".join(str(r[1]) for r in kv.rows)
+    # Cardinality metrics: distinct count, %-distinct, unique values and total
+    # rows are present (grouped onto compact rows so the chart fits the page).
+    assert "Distintos · % · únicos" in labels
    assert "Total filas (dataset)" in labels
-    assert "Valores únicos (frecuencia 1)" in labels
    assert any("Entropía" in lbl for lbl in labels)
+    assert "únicos" in values and "%" in values
+    assert "bits" in values and "norm" in values   # entropy + max + normalized.
    # Top-k table + pie figure.
-    dt = next(b for b in ch.blocks if isinstance(b, DataTable))
+    dt = next(b for b in flat if isinstance(b, DataTable))
    assert dt.header == ["Valor", "Conteo", "%"]
    assert any("neumaticos" in str(cell) for row in dt.rows for cell in row)
-    assert any(isinstance(b, Figure) for b in ch.blocks)
-    # id-like column flagged with a Note.
-    assert any(isinstance(b, Note) and "identificador" in b.text
-               for b in ch.blocks)
+    assert any(isinstance(b, Figure) for b in flat)
+    # id-like column flagged with a Note that also explains the top-k is dropped.
+    idnote = next((b for b in flat
+                   if isinstance(b, Note) and "identificador" in b.text), None)
+    assert idnote is not None
+    assert "No se lista el top" in idnote.text


-def test_golden_render_pdf_muestra_categoricas():
+def test_golden_idlike_omite_topk_y_conserva_donut():
+    # The id-like column (uuid, 100% distinct) must NOT carry a top-k DataTable
+    # (it would be a list of unique values), but must still keep its donut Figure
+    # and its cardinality table so it stays a full per-column page.
+    ch = build_cat_distr(_profile(), {})
+    groups = _column_groups(ch)
+    uuid_group = next(g for g in groups
+                      if any(getattr(b, "text", "") == "uuid" for b in g.blocks))
+    kinds = [b.kind for b in uuid_group.blocks]
+    assert "data_table" not in kinds      # top-k of unique values dropped.
+    assert "kv_table" in kinds            # cardinality kept.
+    assert "figure" in kinds              # donut kept (chart per column).
+    # A non-id-like column keeps its top-k table.
+    cat_group = next(g for g in groups
+                     if any(getattr(b, "text", "") == "categoria"
+                            for b in g.blocks))
+    assert "data_table" in [b.kind for b in cat_group.blocks]
+
+
+def test_golden_una_pagina_por_columna_groups():
+    ch = build_cat_distr(_profile(), {})
+    groups = _column_groups(ch)
+    # Two categorical columns -> two column Groups (numeric column excluded).
+    assert len(groups) == 2
+    # Each Group carries one column: a heading + its cardinality table + figure.
+    for g in groups:
+        kinds = [b.kind for b in g.blocks]
+        assert kinds[0] == "heading"
+        assert "kv_table" in kinds
+        assert "figure" in kinds
+    # The first column may share the intro page (no forced break); every later
+    # column starts on a fresh page/slide so each column gets its own page.
+    assert groups[0].page_break_before is False
+    assert all(g.page_break_before is True for g in groups[1:])
+
+
+def test_golden_entropia_clicable_y_definicion_en_glosario():
+    # With a glossary collector the intro marks the clickable term and the FULL
+    # definition (the long explanation removed from the intro) lands in the
+    # glossary, not inline — no data lost, just relocated.
+    gc = GlossaryCollector()
+    ch = build_cat_distr(_profile(), {"glossary": gc})
+    md = next(b for b in ch.blocks if isinstance(b, Markdown))
+    assert "[[term:entropia]]entropía[[/term]]" in md.text
+    assert gc.has("entropia")
+    entry = gc.get("entropia")
+    assert entry is not None
+    # The definition kept in the glossary still carries the detail removed inline.
+    assert "log2" in entry["definition"]
+    assert "normalizada" in entry["definition"].lower()
+
+
+def test_golden_render_pdf_una_pagina_por_columna():
    with tempfile.TemporaryDirectory() as d:
        out = os.path.join(d, "eda.pdf")
        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
        assert res["path"] == out and os.path.exists(out)
-        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID)
+        # Two categorical columns, each on its own page -> >= 2 pages for the
+        # chapter (intro shares the first column's page).
+        assert cat_meta["n_pages"] >= 2
        txt = _pdf_text(out)
        assert "Entrop" in txt
        assert "distintos" in txt
@@ -133,13 +214,91 @@ def test_golden_render_pptx_muestra_categoricas():
        out = os.path.join(d, "eda.pptx")
        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
        assert res["path"] == out and os.path.exists(out)
-        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID)
+        assert cat_meta["n_slides"] >= 2  # one slide per categorical column.
        txt = _pptx_text(out)
        assert "Entrop" in txt
        assert "categoria" in txt and "neumaticos" in txt
        assert "distintos" in txt


+def _profile_high_card() -> dict:
+    """Profile with a high-cardinality NON-id-like categorical column whose top-k
+    table of long values would be split from its donut on a short 16:9 slide
+    unless the renderer trims the table — the exact case the adversarial check
+    flagged (Ticket / Cabin)."""
+    long_vals = [f"Valor largo de categoria numero {i:02d} con texto extra"
+                 for i in range(40)]
+    top = [{"value": v, "count": 60 - i, "pct": (60 - i) / 5000.0}
+           for i, v in enumerate(long_vals)]
+    return {
+        "table": "t", "source": "t.csv", "n_rows": 5000, "n_cols": 3,
+        "quality_score": 80.0,
+        "columns": [
+            {"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
+             "numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
+                         "std": 0.5}},
+            # 40 distinct over 5000 rows = 0.8% distinct -> NOT id-like, keeps
+            # its (long) top-k table; the tall table must not push the donut off.
+            {"name": "alta_card_col", "inferred_type": "categorical",
+             "null_pct": 0.0, "distinct_count": 40,
+             "categorical": {"top": top, "mode": long_vals[0], "n_distinct": 40,
+                             "entropy": 5.2, "imbalance": 1.2, "len_min": 40,
+                             "len_mean": 45, "len_max": 50}},
+            {"name": "baja_card_col", "inferred_type": "categorical",
+             "null_pct": 0.0, "distinct_count": 4,
+             "categorical": {
+                 "top": [{"value": "norte", "count": 2000, "pct": 0.4},
+                         {"value": "sur", "count": 1500, "pct": 0.3},
+                         {"value": "este", "count": 1000, "pct": 0.2},
+                         {"value": "oeste", "count": 500, "pct": 0.1}],
+                 "mode": "norte", "n_distinct": 4, "entropy": 1.8}},
+        ],
+    }
+
+
+def test_golden_pptx_una_slide_por_columna_con_su_grafico():
+    """Each categorical column occupies EXACTLY ONE cat_distr slide that carries
+    BOTH its cardinality table and its donut figure (picture) — i.e. the chart is
+    never separated from its table, even for a high-cardinality column."""
+    from pptx.enum.shapes import MSO_SHAPE_TYPE
+
+    prof = _profile_high_card()
+    cat_names = ["alta_card_col", "baja_card_col"]
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        prs = Presentation(out)
+
+        # Per column: the cat_distr slides whose text mentions it, and whether the
+        # owning slide also has the donut caption + an actual picture shape.
+        slides_with_col = {n: [] for n in cat_names}
+        owner_has_chart = {n: False for n in cat_names}
+        for i, sl in enumerate(prs.slides):
+            texts, has_pic = [], False
+            for sh in sl.shapes:
+                if sh.has_text_frame:
+                    texts.append(sh.text_frame.text)
+                if sh.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    has_pic = True
+            txt = re.sub(r"\s+", " ", " ".join(texts))
+            if "Distribuciones categ" not in txt:   # footer stamp of the chapter.
+                continue
+            for n in cat_names:
+                if n in txt:
+                    slides_with_col[n].append(i)
+                    has_table = "Cardinalidad" in txt or "distintos" in txt
+                    if has_pic and "donut" in txt and has_table:
+                        owner_has_chart[n] = True
+
+        for n in cat_names:
+            # Exactly one slide carries the column (not split across slides).
+            assert len(slides_with_col[n]) == 1, (n, slides_with_col[n])
+            # That single slide also holds its table AND its donut picture.
+            assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide")
+
+
 def test_edge_sin_categoricas_devuelve_none():
    only_numeric = {
        "n_rows": 10, "columns": [
@@ -170,11 +329,15 @@ def test_anti_corte_label_largo_y_muchas_columnas():

    ch = build_cat_distr(profile, {})
    assert ch is not None
+    # One Group per column, each forcing its own page (except the first).
+    groups = _column_groups(ch)
+    assert len(groups) == 30
+    assert sum(1 for g in groups if g.page_break_before) == 29
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "anti.pdf")
        res = render_automatic_eda_pdf(profile, pdf, {"write_manifest": False})
        assert res["path"] == pdf
-        assert res["n_pages"] > 1       # many columns spilled across pages, OK.
+        assert res["n_pages"] > 1       # one page per column, OK.
        txt = _pdf_text(pdf)
        # Long label wrapped (not truncated): every word survives.
        for word in ("Lorem", "incididunt", "reprehenderit", "voluptate"):
@@ -47,6 +47,53 @@ _MAX_MATRIX_LABELS = 16
 # How many pairs to show in each of the top-positive / top-negative tables.
 _TOP_N = 10

+# Glossary terms this chapter explains. Each is registered in the shared
+# collector (ctx['glossary']) and marked clickable on its first appearance in the
+# body — the canonical two-step pattern (see ``cat_distr`` for the reference
+# implementation): ``glossary.add(key, label, definition)`` + the inline span
+# ``[[term:KEY]]texto visible[[/term]]`` in a Markdown block. Mapping key ->
+# (label, definition). ``fdr`` is only registered when the FDR summary is present.
+_TERM_DEFS = {
+    "pearson": (
+        "Pearson (coeficiente r)",
+        "Coeficiente de correlación lineal de Pearson (r) entre dos variables "
+        "numéricas. Va de −1 (relación lineal inversa perfecta) a +1 (directa "
+        "perfecta); 0 indica ausencia de relación lineal. Sólo capta relaciones "
+        "lineales, por eso lleva signo."),
+    "spearman": (
+        "Spearman (correlación de rangos)",
+        "Correlación de rangos de Spearman: el coeficiente de Pearson calculado "
+        "sobre los puestos (rangos) de los valores en vez de sus magnitudes. Mide "
+        "relaciones monótonas (no necesariamente lineales), va de −1 a +1 y es "
+        "robusta frente a valores atípicos."),
+    "cramers_v": (
+        "Cramér's V",
+        "Medida de asociación entre dos variables categóricas, derivada del "
+        "estadístico chi-cuadrado y normalizada al rango 0–1 (0 = independientes, "
+        "1 = asociación total). No tiene signo: sólo mide la intensidad."),
+    "correlation_ratio": (
+        "Razón de correlación (η)",
+        "Razón de correlación (eta) entre una variable numérica y una "
+        "categórica: la fracción de la varianza de la numérica explicada por los "
+        "grupos de la categórica. Va de 0 (los grupos no explican nada) a 1 (la "
+        "explican toda); no tiene signo."),
+    "fdr": (
+        "Comparaciones múltiples (FDR)",
+        "Al evaluar muchos pares a la vez, algunos parecen significativos por "
+        "puro azar. La corrección por tasa de falsos descubrimientos (FDR, "
+        "Benjamini-Hochberg) ajusta los p-valores para controlar la proporción "
+        "esperada de falsos positivos entre los pares declarados significativos."),
+}
+
+
+def _term(mark: bool, key: str, text: str) -> str:
+    """Wrap ``text`` as a clickable glossary span when ``mark`` is True.
+
+    The visible text is identical with or without the marker (the renderers strip
+    the marker), so wrapping never changes line layout — it only adds the link.
+    """
+    return f"[[term:{key}]]{text}[[/term]]" if mark else text
+

 def _is_num(v) -> bool:
    """True for a real, finite int/float (not bool, not NaN/inf)."""
@@ -245,7 +292,7 @@ def _methods_block(corr: dict):
    return model.KVTable(rows=rows, title="Métodos de asociación")


-def _fdr_text(corr: dict) -> str | None:
+def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
    """One-line summary of the multiple-testing (FDR) correction, or None."""
    mt = corr.get("multiple_testing")
    if not isinstance(mt, dict) or not mt:
@@ -254,7 +301,8 @@ def _fdr_text(corr: dict) -> str | None:
    alpha = mt.get("alpha")
    n_tests = mt.get("n_tests")
    n_rej = mt.get("n_rejected")
-    parts = [f"Corrección por comparaciones múltiples ({method}"]
+    multi = _term(mark_term, "fdr", "comparaciones múltiples")
+    parts = [f"Corrección por {multi} ({method}"]
    if _is_num(alpha):
        parts[0] += f", α={float(alpha):g}"
    parts[0] += ")."
@@ -289,13 +337,30 @@ def build_correlacion(profile: dict, ctx: dict):

    blocks: list = []

-    # Intro: what this chapter shows and how to read the sign.
+    # Register the always-present method terms in the shared glossary and mark
+    # their first appearance clickable (the FDR term is registered lazily below,
+    # only when the FDR summary is actually emitted). Degrades silently when no
+    # collector is in ctx (standalone render) — mark_term stays False.
+    glossary = ctx.get("glossary")
+    gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
+    mark_term = gloss is not None
+    if gloss is not None:
+        for key in ("pearson", "spearman", "cramers_v", "correlation_ratio"):
+            label, definition = _TERM_DEFS[key]
+            gloss.add(key, label, definition)
+
+    # Intro: what this chapter shows and how to read the sign. Build the marked
+    # method names as locals first (avoids backslash-in-f-string for "Cramér's V").
+    t_pearson = _term(mark_term, "pearson", "Pearson")
+    t_spearman = _term(mark_term, "spearman", "Spearman")
+    t_cramers = _term(mark_term, "cramers_v", "Cramér's V")
+    t_corr_ratio = _term(mark_term, "correlation_ratio", "razón de correlación")
    blocks.append(model.Markdown(text=(
-        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
-        "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
-        "entre categóricas; razón de correlación num-categórica; información mutua "
-        "como medida común no lineal). Sólo las correlaciones **num-num** tienen "
-        "dirección: por eso los pares **negativos** son siempre num-num.")))
+        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada "
+        f"a sus tipos: {t_pearson}/{t_spearman} (numéricas), {t_cramers} "
+        f"(categóricas), {t_corr_ratio} (num-categórica) e información mutua. "
+        "Sólo las correlaciones **num-num** llevan **signo** (dirección): por "
+        "eso los pares **negativos** son siempre num-num.")))

    # 1) Association matrix (heatmap).
    labels, trimmed = _ordered_labels(pairs)
@@ -337,9 +402,13 @@ def build_correlacion(profile: dict, ctx: dict):
            "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
            "sobre los retornos/diferencias antes de interpretarlas.")))

-    # 4) FDR summary + methods legend.
-    fdr_text = _fdr_text(corr)
+    # 4) FDR summary + methods legend. Register the FDR term only when its
+    # summary is emitted, so the glossary never lists an unreferenced entry.
+    fdr_text = _fdr_text(corr, mark_term=mark_term)
    if fdr_text:
+        if gloss is not None:
+            label, definition = _TERM_DEFS["fdr"]
+            gloss.add("fdr", label, definition)
        blocks.append(model.Markdown(text=fdr_text))
    methods = _methods_block(corr)
    if methods is not None:
@@ -173,3 +173,25 @@ def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
        assert rx["path"] == pptx and os.path.exists(pptx) and rx["n_slides"] >= 1
        # A short, unbreakable fragment of the long label survives the wrap.
        assert "azufre" in _pdf_text(pdf)
+
+
+def test_glosario_engancha_metodos_y_fdr():
+    """Mejora 4b: los métodos de correlación (Pearson, Spearman, Cramér's V,
+    razón de correlación) y la corrección por comparaciones múltiples (FDR) se
+    registran en el colector compartido y se marcan clicables en el cuerpo. Sin
+    colector en ctx, el capítulo degrada y no marca nada."""
+    from datascience.automatic_eda.model import GlossaryCollector
+
+    g = GlossaryCollector()
+    ch = build_correlacion(_profile(), {"glossary": g})
+    assert ch is not None
+    keys = {t["key"] for t in g.terms()}
+    assert {"pearson", "spearman", "cramers_v", "correlation_ratio", "fdr"} <= keys
+    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
+    for k in ("pearson", "spearman", "cramers_v", "correlation_ratio", "fdr"):
+        assert f"[[term:{k}]]" in body, k
+
+    # Sin colector: degrada limpio (ningún marcador en el cuerpo).
+    ch2 = build_correlacion(_profile(), {})
+    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
+    assert "[[term:" not in body2
@@ -6,15 +6,16 @@ normality}``). It renders, as structured markdown/tables/figures that the core
 paginator never cuts:

 1. **Normalization note** — every multivariate model below standardizes the
-   columns with z-score first; the chapter explains why (different scales would
-   otherwise dominate distance/variance).
+   columns with z-score first (the term is marked clickable; its definition
+   lives in the GLOSARIO chapter, not inline).
 2. **PCA** — a scree plot (explained + cumulative variance, single Y axis) plus
   variance and top-loadings tables.
 3. **KMeans segments** — a PCA scatter **coloured by cluster** (its own
   page/slide), the cluster-size table, and a per-cluster LLM micro-analysis
   with a title for each segment.
-4. **Isolation Forest outliers** — a short explanation of how anomalous rows are
-   isolated multivariately and how the threshold is chosen, plus the counts.
+4. **Isolation Forest outliers** — the multivariate anomaly counts and decision
+   threshold (the method is marked clickable; its definition lives in the
+   GLOSARIO chapter, not inline).
 5. **Normality** — per-column Jarque-Bera / D'Agostino / Shapiro verdicts.

 The raw numeric data needed to colour the cluster scatter is **not** in the
@@ -55,6 +56,62 @@ _CLUSTER_COLORS = [
    "#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac",
 ]

+# Glossary terms this chapter explains. Each is registered in the shared
+# collector (ctx['glossary']) and marked clickable on its first appearance — the
+# canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
+# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
+# block. A term is registered only when its section is actually rendered, so the
+# glossary never lists an entry that nothing in the text points to.
+_TERM_DEFS = {
+    "zscore": (
+        "Estandarización z-score",
+        "Transformación que lleva cada columna numérica a media 0 y desviación "
+        "típica 1: a cada valor le resta la media de su columna y lo divide por "
+        "la desviación típica. Así variables con escalas muy distintas (euros "
+        "frente a un ratio 0–1) pesan por igual en las distancias y la varianza."),
+    "pca": (
+        "PCA (componentes principales)",
+        "El análisis de componentes principales resume muchas variables "
+        "numéricas correlacionadas en pocos ejes nuevos (componentes), "
+        "ortogonales entre sí y ordenados por la cantidad de varianza que "
+        "capturan. Permite ver la estructura de los datos en 2D y saber cuántas "
+        "dimensiones bastan para explicarlos."),
+    "kmeans": (
+        "KMeans (segmentación)",
+        "Algoritmo de agrupamiento no supervisado que reparte las filas en k "
+        "segmentos: asigna cada fila al centro (centroide) más cercano y recoloca "
+        "los centroides de forma iterativa hasta minimizar la distancia interna "
+        "de cada grupo. Aquí k se elige automáticamente."),
+    "silhouette": (
+        "Coeficiente de silueta (silhouette)",
+        "Métrica de calidad de un agrupamiento, en el rango −1 a 1: para cada "
+        "fila compara cómo de cerca está de su propio segmento frente al segmento "
+        "vecino más próximo. Cuanto más alto el promedio, más compactos y "
+        "separados están los segmentos."),
+    "isolation_forest": (
+        "Isolation Forest (anomalías)",
+        "Algoritmo de detección de anomalías multivariante: construye árboles que "
+        "parten el espacio con cortes aleatorios y mide cuántos cortes hacen "
+        "falta para aislar cada fila. Las filas raras se aíslan con muy pocos "
+        "cortes y se marcan como outliers según un umbral de contaminación."),
+}
+
+
+def _term(mark: bool, key: str, text: str) -> str:
+    """Wrap ``text`` as a clickable glossary span when ``mark`` is True.
+
+    The visible text is identical with or without the marker (the renderers strip
+    it), so wrapping never changes line layout — it only adds the link.
+    """
+    return f"[[term:{key}]]{text}[[/term]]" if mark else text
+
+
+def _register(gloss, key: str) -> None:
+    """Register term ``key`` in the collector (idempotent); no-op if gloss None."""
+    if gloss is not None:
+        label, definition = _TERM_DEFS[key]
+        gloss.add(key, label, definition)
+

 # --------------------------------------------------------------------------- #
 # Formatting helpers (mirror the overview chapter's defensive style).
@@ -252,34 +309,33 @@ def _make_cluster_scatter(projection: dict):
 # --------------------------------------------------------------------------- #
 # Section builders. Each returns a list of blocks (possibly empty).
 # --------------------------------------------------------------------------- #
-def _normalization_intro() -> list:
+def _normalization_intro(gloss=None, mark_term: bool = False) -> list:
+    _register(gloss, "zscore")
+    zscore = _term(mark_term, "zscore", "**estandarizan con z-score**")
    text = (
        "Estos modelos son **no supervisados**: buscan estructura latente sin "
        "una variable objetivo. Antes de aplicarlos, todas las columnas "
-        "numéricas se **estandarizan con z-score** (cada valor menos la media, "
-        "dividido por la desviación típica). Sin esta normalización, una "
-        "variable con escala grande (p.ej. ingresos en euros) dominaría las "
-        "distancias y la varianza frente a otra de escala pequeña (p.ej. un "
-        "ratio entre 0 y 1), sesgando tanto el PCA como el KMeans. Tras la "
-        "estandarización todas las variables pesan por igual."
+        f"numéricas se {zscore}, para que todas pesen por igual con "
+        "independencia de su escala."
    )
    return [model.Heading(text="Modelos no supervisados", level=1),
            model.Markdown(text=text)]


-def _pca_section(pca: dict) -> list:
+def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
    if not _is_dict(pca) or not pca.get("explained_variance_ratio"):
        return []
+    _register(gloss, "pca")
    blocks = [model.Heading(text="PCA — varianza explicada", level=2)]

    n_used = pca.get("n_rows_used")
    n_feat = pca.get("n_features")
    intro = (
-        f"El PCA resume {_fmt_num(n_feat)} variables numéricas en componentes "
-        f"ortogonales ordenados por la varianza que capturan "
-        f"({_fmt_num(n_used)} filas usadas tras eliminar nulos). El gráfico de "
-        "sedimentación (scree) muestra cuánta varianza aporta cada componente y "
-        "su acumulado: un codo marca cuántos componentes bastan."
+        f"El {_term(mark_term, 'pca', 'PCA')} se aplica sobre "
+        f"{_fmt_num(n_feat)} variables numéricas ({_fmt_num(n_used)} filas "
+        "usadas tras eliminar nulos). El gráfico de sedimentación (scree) "
+        "muestra cuánta varianza aporta cada componente y su acumulado: un "
+        "codo marca cuántos componentes bastan."
    )
    blocks.append(model.Markdown(text=intro))

@@ -325,11 +381,14 @@ def _pca_section(pca: dict) -> list:
    return blocks


-def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
+def _kmeans_section(kmeans: dict, projection: dict, titles,
+                    gloss=None, mark_term: bool = False) -> list:
    has_km = _is_dict(kmeans) and kmeans.get("best_k")
    has_proj = _is_dict(projection) and projection.get("points")
    if not has_km and not has_proj:
        return []
+    _register(gloss, "kmeans")
+    _register(gloss, "silhouette")

    blocks = [model.Heading(text="Segmentación (KMeans)", level=2)]

@@ -337,11 +396,12 @@ def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
    sil = (projection or {}).get("silhouette")
    if sil is None:
        sil = (kmeans or {}).get("silhouette")
+    t_kmeans = _term(mark_term, "kmeans", "KMeans")
+    t_sil = _term(mark_term, "silhouette", "*silhouette*")
    intro = (
-        f"KMeans agrupa las filas en **{_fmt_num(best_k)} segmentos** elegidos "
-        "automáticamente maximizando el coeficiente de *silhouette* "
-        f"(**{_fmt_num(sil)}**, rango −1 a 1: cuanto más alto, segmentos más "
-        "compactos y separados). Los segmentos se proyectan sobre el plano de "
+        f"{t_kmeans} agrupa las filas en **{_fmt_num(best_k)} segmentos** "
+        f"elegidos automáticamente por el coeficiente de {t_sil} "
+        f"(**{_fmt_num(sil)}**). Los segmentos se proyectan sobre el plano de "
        "los dos primeros componentes principales para visualizarlos."
    )
    blocks.append(model.Markdown(text=intro))
@@ -394,23 +454,21 @@ def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
    return blocks


-def _outliers_section(outliers: dict) -> list:
+def _outliers_section(outliers: dict, gloss=None, mark_term: bool = False) -> list:
    if not _is_dict(outliers) or outliers.get("n_outliers") is None:
        return []
    if outliers.get("note") and not outliers.get("n_rows_used"):
        # insufficient data — nothing meaningful to show.
        return []
+    _register(gloss, "isolation_forest")
    blocks = [model.Heading(text="Detección de anomalías (Isolation Forest)",
                            level=2)]
+    isof = _term(mark_term, "isolation_forest", "**Isolation Forest**")
    explain = (
-        "**Isolation Forest** detecta filas anómalas de forma *multivariante*: "
-        "construye árboles que parten el espacio con cortes aleatorios y mide "
-        "cuántos cortes hacen falta para aislar cada fila. Las filas raras "
-        "(combinaciones de valores poco frecuentes considerando **todas las "
-        "columnas a la vez**, no una sola) se aíslan con muy pocos cortes y "
-        "obtienen un score bajo. El **umbral** de decisión separa las filas "
-        "normales de las anómalas según la contaminación esperada del modelo: "
-        "una fila es outlier cuando su score queda por debajo de ese umbral."
+        f"{isof} marca filas anómalas de forma *multivariante*: combinaciones "
+        "de valores poco frecuentes considerando **todas las columnas a la "
+        "vez**, no una sola. La tabla resume cuántas se detectaron y el umbral "
+        "de decisión empleado."
    )
    blocks.append(model.Markdown(text=explain))
    blocks.append(model.KVTable(rows=[
@@ -484,15 +542,21 @@ def build_modelos(profile: dict, ctx: dict):
        (kmeans and kmeans.get("best_k")) or (projection and projection.get("points"))
    ) else None

+    # Shared glossary collector: terms are registered + marked clickable inside
+    # each section, only when that section actually renders (no orphan entries).
+    glossary = ctx.get("glossary")
+    gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
+    mark_term = gloss is not None
+
    sections = []
-    sections += _pca_section(pca) if pca else []
-    sections += _kmeans_section(kmeans, projection, titles)
-    sections += _outliers_section(outliers) if outliers else []
+    sections += _pca_section(pca, gloss, mark_term) if pca else []
+    sections += _kmeans_section(kmeans, projection, titles, gloss, mark_term)
+    sections += _outliers_section(outliers, gloss, mark_term) if outliers else []
    sections += _normality_section(normality) if normality else []

    if not sections:
        return None  # models block present but nothing renderable.

-    blocks = _normalization_intro() + sections
+    blocks = _normalization_intro(gloss, mark_term) + sections
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -257,3 +257,26 @@ def test_anticortes_tabla_normalidad_larga_no_corta():
        # Every column name survives (wrapped/split, never truncated).
        for i in (0, 19, 39):
            assert f"col_{i}" in txt
+
+
+def test_glosario_engancha_terminos_modelos():
+    """Mejora 4b: PCA, KMeans, silhouette, Isolation Forest y la estandarización
+    z-score se registran en el colector compartido y se marcan clicables en el
+    cuerpo. Sin colector en ctx, el capítulo degrada y no marca nada."""
+    from datascience.automatic_eda.model import GlossaryCollector
+
+    g = GlossaryCollector()
+    ctx = dict(_ctx_full())
+    ctx["glossary"] = g
+    ch = build_modelos(_profile(), ctx)
+    assert ch is not None
+    keys = {t["key"] for t in g.terms()}
+    assert {"zscore", "pca", "kmeans", "silhouette", "isolation_forest"} <= keys
+    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
+    for k in ("zscore", "pca", "kmeans", "silhouette", "isolation_forest"):
+        assert f"[[term:{k}]]" in body, k
+
+    # Sin colector: degrada limpio (ningún marcador en el cuerpo).
+    ch2 = build_modelos(_profile(), _ctx_full())
+    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
+    assert "[[term:" not in body2
@@ -1,9 +1,10 @@
 """Numeric distributions chapter (NUM DISTR) for AutomaticEDA.

 For every numeric column the chapter draws, as a single indivisible figure, a
-histogram with the **mean, median and ±1σ band drawn as reference lines** and a
-**Tukey boxplot right below it** sharing the same X axis — exactly the user
-requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
+histogram with the **mean, median and ±1σ band drawn as reference lines** (the
+legend reports the numeric value of the mean, the median **and the standard
+deviation σ**) and a **Tukey boxplot right below it** sharing the same X axis —
+exactly the user requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
 so the renderers rasterize and scale it to fit a whole page/slide and nothing is
 ever cut; columns with many numerics simply flow across pages as small
 multiples.
@@ -34,7 +35,7 @@ try:
 except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
    build_boxplot_stats = None  # type: ignore[assignment]

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "num_distr"
 CHAPTER_TITLE = "Distribuciones numéricas"

@@ -140,9 +141,11 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
    std = numeric.get("std")

    # ±1σ band first (behind the lines), then median (solid) and mean (dashed).
+    # The band's legend entry also reports the numeric value of the standard
+    # deviation, so the reader sees mean, median AND σ at a glance.
    if mean is not None and std is not None and std > 0:
        ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
-                     zorder=1, label="±1σ")
+                     zorder=1, label=f"±1σ (σ = {_fmt_num(std)})")
    if median is not None:
        ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
                     zorder=4, label=f"mediana = {_fmt_num(median)}")
@@ -152,7 +155,19 @@ def _make_hist_box(name: str, numeric: dict, box: dict):

    ax_h.set_ylabel("frecuencia", fontsize=8)
    ax_h.tick_params(labelsize=7)
-    ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
+    # Always surface σ in the legend: if the ±1σ band could not be drawn (no mean
+    # or std<=0) but σ is still known, add a label-only proxy handle so the value
+    # of the standard deviation is reported regardless of the band.
+    handles, labels = ax_h.get_legend_handles_labels()
+    if std is not None and not any("σ =" in lbl for lbl in labels):
+        from matplotlib.lines import Line2D
+        proxy = Line2D([], [], linestyle="none", marker="",
+                       label=f"σ = {_fmt_num(std)}")
+        handles.append(proxy)
+        labels.append(f"σ = {_fmt_num(std)}")
+    if handles:
+        ax_h.legend(handles, labels, fontsize=6.5, loc="upper right",
+                    framealpha=0.85)
    for spine in ("top", "right"):
        ax_h.spines[spine].set_visible(False)

@@ -159,6 +159,50 @@ def test_anti_corte_muchas_columnas_pdf_y_pptx():
        assert res_pptx["n_slides"] >= 8  # at least one slide per column figure.


+def _hist_legend_texts(numeric, box=None):
+    """Build the per-column figure and return its histogram-legend label texts."""
+    from datascience.automatic_eda.chapters.num_distr import _make_hist_box
+    import matplotlib.pyplot as plt
+    fig = _make_hist_box("col", numeric, box or {})
+    ax_h = fig.axes[0]  # the histogram is the top axis.
+    leg = ax_h.get_legend()
+    texts = [t.get_text() for t in leg.get_texts()] if leg else []
+    plt.close(fig)
+    return texts
+
+
+def test_golden_leyenda_histograma_reporta_valor_std():
+    # The histogram legend must report the numeric value of the standard
+    # deviation σ next to mean and median.
+    numeric = _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5)
+    texts = _hist_legend_texts(numeric)
+    joined = " ".join(texts)
+    assert any("σ =" in t for t in texts), f"σ value missing in legend: {texts}"
+    assert "12.3" in joined, f"std value 12.3 not in legend: {texts}"
+    assert any("media =" in t for t in texts)
+    assert any("mediana =" in t for t in texts)
+
+
+def test_edge_std_en_leyenda_aunque_no_haya_banda():
+    # When the ±1σ band cannot be drawn (no mean) but σ is known, the legend
+    # still surfaces the σ value via a label-only proxy handle.
+    numeric = _numeric_block(42.5, 40.0, 7.5, 1.0, 100.0, "right-skewed", 0)
+    numeric["mean"] = None  # forces the band off; σ must still appear.
+    texts = _hist_legend_texts(numeric)
+    assert any("σ = 7.5" in t for t in texts), f"σ proxy missing: {texts}"
+
+
+def test_edge_sin_std_no_revienta_la_figura():
+    # A numeric block without σ must not raise and simply omits the σ entry.
+    import matplotlib.pyplot as plt
+    numeric = _numeric_block(42.5, 40.0, 0.0, 1.0, 100.0, "discrete", 0)
+    numeric["std"] = None
+    texts = _hist_legend_texts(numeric)
+    assert not any("σ =" in t for t in texts)
+    # mean/median lines still produce their own legend entries.
+    assert any("media =" in t for t in texts)
+
+
 def test_distribution_gloss_cubre_todas_las_etiquetas():
    # Every label detect_distribution_type can emit has a Spanish gloss.
    for label in ("normal-ish", "right-skewed", "left-skewed", "heavy-tail",
@@ -2,8 +2,17 @@

 Builds the document cover from a TableProfile plus an optional ``ctx`` of
 presentation metadata. Reads everything defensively (``.get``) and degrades
-honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
-placeholder rather than invented, leaving a hook for the LLM layer to fill it.
+honestly.
+
+The dataset size (N rows x M columns) is always shown big, as a heading right
+under the dataset name (kept together in a ``Group``), not buried in the
+metadata table. The Description and Granularity are resolved through a cascade
+so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
+(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
+``row_meaning``; otherwise a short summary is derived from the profile itself
+(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
+key-candidate columns or the table shape. Nothing is invented: the derived
+fallbacks state that they come from the profile.

 Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
    build_<id>(profile: dict, ctx: dict) -> Chapter | None
@@ -17,10 +26,15 @@ from datetime import datetime, timezone

 from .. import model

-CHAPTER_VERSION = "1.1.0"
+CHAPTER_VERSION = "1.2.0"
 CHAPTER_ID = "portada"
 CHAPTER_TITLE = "Portada"

+# Key of the interpretive block produced by eda_llm_insights. ``_llm_block``
+# looks it up in the profile first and in ``ctx`` second; the cover then reads
+# ``summary`` (description) and ``row_meaning`` (granularity) from that block.
+_LLM_KEY = "llm"
+
 # Default human description of what the table quality score measures. Chapters
 # can override it via ctx["quality_criteria"].
 _DEFAULT_QUALITY_CRITERIA = (
@@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str:
        return s


+def _llm_block(profile: dict, ctx: dict) -> dict:
+    """Fetch the interpretive LLM block (``eda_llm_insights`` output), or {}.
+
+    ``profile['llm']`` is tried first and ``ctx['llm']`` second; the first one
+    that is a dict is returned as-is, otherwise the result is an empty dict.
+    """
+    for holder in (profile, ctx):
+        candidate = holder.get(_LLM_KEY)
+        if isinstance(candidate, dict):
+            return candidate
+    return {}
+
+
+def _count_column_types(profile: dict, ctx: dict):
+    """Return a best-effort ``(n_numeric, n_categorical)`` pair.
+
+    ``ctx['document_summary']`` wins when it is a dict carrying at least one of
+    the two counts. Otherwise the profile columns are counted: numeric by
+    ``inferred_type``; categorical = non-numeric with a non-empty ``top``.
+    """
+    doc_summary = ctx.get("document_summary")
+    if isinstance(doc_summary, dict):
+        pair = (doc_summary.get("n_numeric"), doc_summary.get("n_categorical"))
+        if any(count is not None for count in pair):
+            return pair
+    numeric = categorical = 0
+    for col in (profile.get("columns") or []):
+        if not isinstance(col, dict):
+            continue
+        if col.get("inferred_type") == "numeric":
+            numeric += 1
+        elif isinstance(col.get("categorical"), dict) and col["categorical"].get("top"):
+            categorical += 1
+    return numeric, categorical
+
+
+def _derive_description(profile: dict, ctx: dict) -> str:
+    """Compose a short, honest dataset description out of the profile alone.
+
+    Last step of the description cascade (no ``ctx['description']``, no LLM
+    ``summary``). States the shape, the non-zero numeric/categorical column
+    counts and the quality score when present, and closes by saying that the
+    text was derived from the profile, so it is never empty."""
+    numeric, categorical = _count_column_types(profile, ctx)
+    # Only the type counts that are non-zero get mentioned.
+    mix = [f"{_fmt_int(count)} {label}"
+           for count, label in ((numeric, "numéricas"), (categorical, "categóricas"))
+           if count]
+    sentence = (f"Conjunto de datos con {_fmt_int(profile.get('n_rows'))} filas "
+                f"y {_fmt_int(profile.get('n_cols'))} columnas")
+    if mix:
+        sentence = f"{sentence} ({', '.join(mix)})"
+    pieces = [sentence + "."]
+    quality = profile.get("quality_score")
+    if quality is not None:
+        pieces.append(f"Calidad media estimada: {quality}/100.")
+    # The closing sentence flags the text as profile-derived.
+    pieces.append(
+        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
+        "para una descripción de negocio más rica.")
+    return " ".join(pieces)
+
+
+def _derive_granularity(profile: dict, dataset_name: str) -> str:
+    """Build the ``Cada fila es…`` granularity sentence from the profile.
+
+    Names up to three key candidates; with none, falls back to the table shape."""
+    candidates = profile.get("key_candidates") or []
+    if not candidates:
+        total = profile.get("n_rows")
+        sentence = (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
+                    "columna identificadora única, así que la granularidad se infiere "
+                    "de la forma de la tabla.")
+        if total:
+            sentence += f" El dataset tiene {_fmt_int(total)} filas en total."
+        return sentence
+    listed = ", ".join(map(str, candidates[:3]))
+    suffix = f" (y {len(candidates) - 3} más)" if len(candidates) > 3 else ""
+    return (f"Cada fila es un registro identificado por {listed}{suffix}, "
+            "candidata(s) a clave por ser únicas y sin nulos.")
+
+
 def build_portada(profile: dict, ctx: dict):
    """Build the cover Chapter, or None if there is truly nothing to show."""
    profile = profile or {}
@@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict):
    quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
    quality_value = "—" if score is None else f"{score} / 100"

-    # Granularity: ctx wins; else derive from key candidates; else be honest.
+    llm = _llm_block(profile, ctx)
+
+    # Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
+    # candidates; finally a shape-based fallback. Always a real "Cada fila es…".
    granularity = ctx.get("granularity")
    if not granularity:
-        keys = profile.get("key_candidates") or []
-        if keys:
-            granularity = ("Cada fila parece identificada por "
-                           + ", ".join(str(k) for k in keys[:3]) + ".")
-        else:
-            granularity = ("Cada fila es… (granularidad no determinada — "
-                           "pendiente de la capa de cálculo/LLM).")
+        granularity = (llm.get("row_meaning") or "").strip() or None
+    if not granularity:
+        granularity = _derive_granularity(profile, str(dataset_name))

+    # Description: explicit ctx wins; then the LLM "summary"; finally a short
+    # profile-derived summary. Never the old empty placeholder.
    description = ctx.get("description")
    if not description:
-        description = ("Descripción no provista — pendiente de la capa LLM "
-                       "(`run_llm`) o de `ctx['description']`.")
+        description = (llm.get("summary") or "").strip() or None
+    if not description:
+        description = _derive_description(profile, ctx)

-    blocks = [
+    # Title + dataset size shown together and BIG (Heading) at the top, kept on
+    # the same page (Group). The size is no longer buried in the metadata table.
+    cover = [
        model.Heading(text=str(dataset_name), level=1),
        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
+        model.Heading(text=shape, level=2),
+    ]
+
+    blocks = [
+        model.Group(blocks=cover),
        model.KVTable(rows=[
            ("Fuente", source_origin),
            ("Almacenamiento", storage),
            ("Generado", when),
-            ("Tamaño", shape),
            ("Calidad", quality_value),
            ("Criterios de calidad", quality_criteria),
        ]),
@@ -0,0 +1,197 @@
+"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies the Fase 4b improvements:
+
+1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
+   heading kept together with the dataset name in a ``Group`` — and is no longer
+   a row of the metadata table.
+2. Description and Granularity are resolved through a real cascade and are never
+   the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
+   block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
+   short summary is derived from the profile and a "Cada fila es…" sentence from
+   the key-candidate columns or the table shape.
+3. The chapter degrades without raising on empty/None input.
+4. It renders inside the full document to both PDF and PPTX showing that content.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
+from datascience.automatic_eda.chapters.portada import (
+    CHAPTER_ID, CHAPTER_VERSION, build_portada,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
+    # Synthetic Titanic-like TableProfile: two numeric columns and one
+    # categorical; the key candidates and the LLM block are opt-out.
+    def _numeric(name, mean, lo, hi, std):
+        return {"name": name, "inferred_type": "numeric", "null_pct": 0.0,
+                "numeric": {"mean": mean, "min": lo, "max": hi, "std": std}}
+
+    sex_top = [{"value": "male", "count": 577, "pct": 0.65},
+               {"value": "female", "count": 314, "pct": 0.35}]
+    columns = [
+        _numeric("PassengerId", 446.0, 1.0, 891.0, 257.0),
+        _numeric("Survived", 0.38, 0.0, 1.0, 0.49),
+        {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
+         "categorical": {"top": sex_top, "mode": "male", "n_distinct": 2,
+                         "entropy": 0.93}},
+    ]
+    extras = {}
+    if with_keys:
+        extras["key_candidates"] = ["PassengerId"]
+    if with_llm:
+        extras["llm"] = {
+            "summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
+            "row_meaning": "Cada fila es un pasajero del Titanic.",
+            "dictionary": [], "pii": [], "cleaning": [], "analyses": [],
+        }
+    return {
+        "table": "titanic", "source": "/data/titanic.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 891, "n_cols": 12, "quality_score": 78.0,
+        "columns": columns, **extras,
+    }
+
+
+def _pdf_text(path: str) -> str:
+    pages = [page.extract_text() or "" for page in PdfReader(path).pages]
+    return re.sub(r"\s+", " ", "".join(pages))  # all pages, whitespace collapsed.
+
+
+def _pptx_text(path: str) -> str:
+    # Text of every text frame and table cell, whitespace collapsed.
+    chunks = []
+    for slide in Presentation(path).slides:
+        for shape in slide.shapes:
+            if shape.has_text_frame:
+                chunks.append(shape.text_frame.text)
+            if not shape.has_table:
+                continue
+            grid = shape.table
+            chunks.extend(grid.cell(r, c).text for r in range(len(grid.rows))
+                          for c in range(len(grid.columns)))
+    return re.sub(r"\s+", " ", " ".join(chunks))
+
+
+def _markdown_after(blocks, heading_text):
+    """Return the first Markdown block that follows a matching Heading, or None."""
+    start = next((i + 1 for i, b in enumerate(blocks)
+                  if isinstance(b, Heading) and heading_text.lower() in b.text.lower()),
+                 None)
+    if start is None:
+        return None
+    return next((b for b in blocks[start:] if isinstance(b, Markdown)), None)
+
+
+def test_golden_tamano_grande_y_textos_llm():
+    chapter = build_portada(_profile(), {})
+    assert chapter is not None
+    assert (chapter.id, chapter.version) == (CHAPTER_ID, CHAPTER_VERSION)
+
+    # 1) The name and the size travel together in a Group: a level-1 title
+    #    first and the size as a BIG level-2 heading.
+    cover = next(b for b in chapter.blocks if isinstance(b, Group)).blocks
+    title = cover[0]
+    assert isinstance(title, Heading) and title.level == 1
+    assert title.text == "titanic"
+    size = next(b for b in cover if isinstance(b, Heading) and b.level == 2)
+    for token in ("891", "12", "filas", "columnas"):
+        assert token in size.text
+
+    # 2) The metadata table no longer carries the size row.
+    table = next(b for b in chapter.blocks if isinstance(b, KVTable))
+    labels = {row[0] for row in table.rows}
+    assert "Tamaño" not in labels
+    assert {"Fuente", "Calidad"} <= labels
+
+    # 3) Both texts are taken from the LLM block.
+    description = _markdown_after(chapter.blocks, "Descripción")
+    granularity = _markdown_after(chapter.blocks, "Granularidad")
+    assert description is not None and "Titanic" in description.text
+    assert granularity is not None and granularity.text.startswith("Cada fila es")
+    assert "pasajero" in granularity.text.lower()
+
+
+def test_fallback_sin_llm_usa_keys_y_perfil():
+    # Without the LLM block the description is derived from the profile and
+    # the granularity from the key candidates.
+    chapter = build_portada(_profile(with_llm=False, with_keys=True), {})
+    description = _markdown_after(chapter.blocks, "Descripción").text
+    granularity = _markdown_after(chapter.blocks, "Granularidad").text
+    # Derived summary, never the old "pendiente" placeholder.
+    assert "pendiente" not in description.lower()
+    assert "891" in description and "columnas" in description
+    assert any(word in description for word in ("numéricas", "categóricas"))
+    # Starts with "Cada fila es", names the key, and has no "…" placeholder.
+    assert granularity.startswith("Cada fila es")
+    assert "PassengerId" in granularity and "…" not in granularity
+
+
+def test_fallback_sin_llm_sin_keys_usa_forma():
+    # No LLM block and no keys: the sentence falls back to the table shape.
+    chapter = build_portada(_profile(with_llm=False, with_keys=False), {})
+    sentence = _markdown_after(chapter.blocks, "Granularidad").text
+    assert sentence.startswith("Cada fila es")
+    assert "titanic" in sentence.lower() and "pendiente" not in sentence.lower()
+
+
+def test_ctx_explicito_gana_sobre_llm():
+    # Explicit ctx texts must beat the LLM block carried by the profile.
+    chapter = build_portada(_profile(), {
+        "description": "Descripción manual.",
+        "granularity": "Cada fila es una unidad manual."})
+    found = [_markdown_after(chapter.blocks, h).text
+             for h in ("Descripción", "Granularidad")]
+    assert found == ["Descripción manual.", "Cada fila es una unidad manual."]
+
+
+def test_edge_perfil_vacio_no_lanza():
+    # Empty dicts and None alike must not raise: the cover keeps its size
+    # heading and shows real (non-placeholder) texts.
+    for empty_profile, empty_ctx in [({}, {}), (None, None)]:
+        chapter = build_portada(empty_profile, empty_ctx)
+        assert chapter is not None
+        cover = next(b for b in chapter.blocks if isinstance(b, Group)).blocks
+        size = next(b for b in cover if isinstance(b, Heading) and b.level == 2)
+        assert all(word in size.text for word in ("filas", "columnas"))
+        description = _markdown_after(chapter.blocks, "Descripción").text
+        granularity = _markdown_after(chapter.blocks, "Granularidad").text
+        assert description and "pendiente" not in description.lower()
+        assert granularity.startswith("Cada fila es")
+
+
+def test_golden_render_pdf_muestra_portada():
+    # End-to-end: the rendered PDF must contain the cover content.
+    with tempfile.TemporaryDirectory() as workdir:
+        target = os.path.join(workdir, "eda.pdf")
+        result = render_automatic_eda_pdf(_profile(), target, {"title": "EDA"})
+        assert result["path"] == target and os.path.exists(target)
+        assert any(c["id"] == CHAPTER_ID for c in result["chapters"])
+        text = _pdf_text(target)
+        assert "titanic" in text.lower()
+        # Size, LLM summary (Description) and the granularity sentence.
+        for token in ("891", "filas", "columnas", "Titanic", "Cada fila es"):
+            assert token in text
+
+
+def test_golden_render_pptx_muestra_portada():
+    # End-to-end: the rendered PPTX must contain the cover content.
+    with tempfile.TemporaryDirectory() as workdir:
+        target = os.path.join(workdir, "eda.pptx")
+        result = render_automatic_eda_pptx(_profile(), target, {"title": "EDA"})
+        assert result["path"] == target and os.path.exists(target)
+        assert any(c["id"] == CHAPTER_ID for c in result["chapters"])
+        text = _pptx_text(target)
+        assert "titanic" in text.lower()
+        for token in ("891", "columnas", "Cada fila es"):
+            assert token in text
@@ -0,0 +1,499 @@
+"""Key-relations chapter (RELACIONES) — the keys / join structure of the data.
+
+This chapter is the *relational* section of an AutomaticEDA report. It answers a
+single question for the table (or the whole DuckDB source it lives in): **how do
+the keys relate?** It composes, without reimplementing them, the registry's
+relation primitives and degrades honestly when a layer does not apply.
+
+It renders, in order, only the layers that have something to say:
+
+1. **Declared keys** (real schema constraints) — when the DuckDB source declares
+   PRIMARY KEY / FOREIGN KEY / UNIQUE constraints, they are read verbatim via
+   ``detect_declared_keys_duckdb`` and shown as ground truth: which column is the
+   PK, which columns are FKs and the table/column they point to.
+2. **Primary-key candidates** — the ``key_candidates`` the TableProfile already
+   carries (columns whose cardinality equals the row count, with no nulls). These
+   are *candidates*: a column that could serve as the row identifier.
+3. **Foreign-key candidates** when none are declared:
+   - **Inter-table** (the DuckDB source has several tables): real FK candidates by
+     name signal + value containment via ``infer_fk_containment_duckdb``, plus the
+     join graph (roles + a pasteable Mermaid diagram) via ``build_join_graph``.
+   - **Intra-table** (a single table): columns that *look* like a foreign key by a
+     name+cardinality heuristic (``suggest_intratable_fk_candidates``). This is a
+     **suggestion**, explicitly flagged as a heuristic, never an assertion.
+
+``build_relaciones(profile, ctx) -> Chapter | None``: returns ``None`` when there
+is nothing to say (no declared key, no key candidates, and no FK candidate —
+inter- or intra-table). Reads everything defensively (``.get``) and never raises:
+anything missing degrades to a note or is omitted; a failing registry call drops
+its layer instead of aborting the chapter.
+
+ctx keys this chapter consumes (all optional):
+    db_path, table : str — the DuckDB file and table being profiled (set by
+        ``build_eda_render_ctx``). ``db_path`` is needed to read declared
+        constraints, to list the sibling tables, and to run the containment-based
+        FK inference. Without it, only the profile-derived layers (PK candidates,
+        intra-table FK heuristic) are available.
+    glossary : model.GlossaryCollector — shared glossary; the chapter registers
+        the relational terms (PK, FK, containment, cardinality) and marks their
+        first appearance clickable.
+
+Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
+"""
+
+from __future__ import annotations
+
+from .. import model
+
+# Pure/impure registry functions (group ``eda``) this chapter composes. Imported
+# defensively (module-leaf imports, like the AGREGACION chapter) so the chapter
+# still builds — degrading the affected layer to nothing — if a function is
+# somehow unavailable / not indexed yet.
+try:
+    from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb
+except Exception:  # noqa: BLE001 — keep the chapter importable no matter what.
+    detect_declared_keys_duckdb = None  # type: ignore[assignment]
+try:
+    from datascience.infer_fk_containment_duckdb import infer_fk_containment_duckdb
+except Exception:  # noqa: BLE001
+    infer_fk_containment_duckdb = None  # type: ignore[assignment]
+try:
+    from datascience.build_join_graph import build_join_graph
+except Exception:  # noqa: BLE001
+    build_join_graph = None  # type: ignore[assignment]
+try:
+    from datascience.suggest_intratable_fk_candidates import (
+        suggest_intratable_fk_candidates,
+    )
+except Exception:  # noqa: BLE001
+    suggest_intratable_fk_candidates = None  # type: ignore[assignment]
+try:
+    from infra import duckdb_list_tables
+except Exception:  # noqa: BLE001
+    duckdb_list_tables = None  # type: ignore[assignment]
+
+CHAPTER_VERSION = "1.0.0"
+CHAPTER_ID = "relaciones"
+CHAPTER_TITLE = "Relaciones de clave"
+
+# Upper bound on the rows of the inter-table FK table, so a wide schema does
+# not blow up the page; the table note says how many candidates are shown.
+MAX_FK_ROWS = 40
+
+# --------------------------------------------------------------------------- #
+# Glossary terms of this chapter, keyed by term id -> (label, definition);
+# registered in the shared collector, first appearance clickable (§11.1).
+# --------------------------------------------------------------------------- #
+_TERMS = {
+    "pk": (
+        "Clave primaria (PK)",
+        "Columna (o conjunto de columnas) que identifica de forma única cada fila "
+        "de una tabla: sus valores no se repiten y no son nulos. Una tabla tiene "
+        "como mucho una clave primaria; es el ancla por la que otras tablas la "
+        "referencian.",
+    ),
+    "fk": (
+        "Clave foránea (FK)",
+        "Columna de una tabla cuyos valores apuntan a la clave primaria de otra "
+        "tabla (o de la misma), creando una relación entre ambas. Una FK suele ser "
+        "N:1: muchas filas de la tabla origen comparten el mismo valor de la tabla "
+        "destino.",
+    ),
+    "containment": (
+        "Containment / inclusión",
+        "Señal con la que se infiere una clave foránea sin que la base la declare: "
+        "la fracción de valores distintos de una columna A que también aparecen "
+        "como valores de otra columna B. Si casi todos los valores de A están "
+        "contenidos en B (inclusión ≈ 1) y B parece una clave, A → B es una FK "
+        "candidata.",
+    ),
+    "cardinalidad": (
+        "Cardinalidad",
+        "Número de valores distintos de una columna. Cardinalidad igual al número "
+        "de filas (y sin nulos) señala un identificador (candidato a clave "
+        "primaria); cardinalidad alta pero menor que el número de filas, con "
+        "valores repetidos, es típica de una clave foránea.",
+    ),
+}
+
+
+def _register_terms(ctx: dict) -> bool:
+    """Push every ``_TERMS`` entry into ``ctx['glossary']`` if it is a
+    GlossaryCollector; the bool returned says whether that happened."""
+    collector = ctx.get("glossary")
+    usable = isinstance(collector, model.GlossaryCollector)
+    if usable:
+        for term_id, (title, text) in _TERMS.items():
+            collector.add(term_id, title, text)
+    return usable
+
+
+# --------------------------------------------------------------------------- #
+# Formatting helpers (mirror the other chapters' defensive style).
+# --------------------------------------------------------------------------- #
+def _fmt_int(value) -> str:
+    """Integer with "." as thousands separator; None -> "—"."""
+    try:
+        return "—" if value is None else f"{int(value):,}".replace(",", ".")
+    # OverflowError: int(float("inf")) must not break the never-raise contract.
+    except (TypeError, ValueError, OverflowError):
+        return model._safe_str(value)
+
+
+def _fmt_pct_fraction(value, decimals: int = 1) -> str:
+    """Render ``value`` as a percentage string (None -> "—"): values <= 1.0
+    are scaled by 100, larger ones are taken as already being percentages."""
+    if value is None:
+        return "—"
+    try:
+        number = float(value)
+    except (TypeError, ValueError):
+        return model._safe_str(value)
+    pct = number * 100.0 if number <= 1.0 else number
+    return f"{pct:.{decimals}f}%"
+
+
+def _fmt_ratio(value, decimals: int = 3) -> str:
+    """Format an already-0–1 ratio (inclusion) as a plain number; None -> "—"."""
+    try:
+        text = "—" if value is None else f"{float(value):.{decimals}f}"
+    except (TypeError, ValueError):
+        return model._safe_str(value)
+    # Trim only fractional zeros: with decimals=0, "10" must not become "1".
+    return text.rstrip("0").rstrip(".") if "." in text else text
+
+
+def _is_dict(v) -> bool:
+    return isinstance(v, dict)  # True for dict instances, subclasses included.
+
+
+def _columns_by_name(profile: dict) -> dict:
+    """Map name -> column dict over ``profile['columns']``, skipping entries
+    that are not dicts or have no name (a repeated name keeps the last one)."""
+    named = ((col.get("name"), col)
+             for col in (profile.get("columns") or [])
+             if _is_dict(col))
+    return {name: col for name, col in named if name is not None}
+
+
+# --------------------------------------------------------------------------- #
+# Layer 1 — declared keys (real schema constraints).
+# --------------------------------------------------------------------------- #
+def _declared_keys(db_path: str, table: str):
+    """Return the ``detect_declared_keys_duckdb`` result, or None when the
+    path/function is missing, the call raises, or its status is not "ok"."""
+    result = None
+    if db_path and detect_declared_keys_duckdb is not None:
+        try:
+            result = detect_declared_keys_duckdb(db_path, table)
+        except Exception:  # noqa: BLE001 — dict-no-throw: treat as unavailable.
+            result = None
+    # A non-dict reply counts as unavailable too.
+    return result if _is_dict(result) and result.get("status") == "ok" else None
+
+
+def _declared_section(declared: dict) -> list:
+    """Render the declared-keys layer; ``[]`` when nothing is declared.
+
+    Emits a heading and an intro paragraph, then one table per non-empty kind
+    of constraint, in this order: primary keys, foreign keys, UNIQUE.
+    """
+    def _entries(key: str) -> list:
+        return [e for e in (declared.get(key) or []) if _is_dict(e)]
+
+    def _joined(entry: dict, key: str) -> str:
+        return ", ".join(model._safe_str(c) for c in (entry.get(key) or []))
+
+    primary = _entries("primary_keys")
+    foreign = _entries("foreign_keys")
+    unique = _entries("unique")
+    if not (primary or foreign or unique):
+        return []
+
+    section = [
+        model.Heading(text="Claves declaradas en el esquema", level=2),
+        model.Markdown(text=(
+            "La base **declara** estas relaciones de clave como restricciones "
+            "reales del esquema (constraints). Son la verdad de referencia: no se "
+            "infieren, se leen tal cual de la definición de las tablas.")),
+    ]
+    if primary:
+        section.append(model.DataTable(
+            header=["Tabla", "Columna(s) PK"],
+            rows=[[model._safe_str(p.get("table")), _joined(p, "columns")]
+                  for p in primary],
+            title="Claves primarias declaradas",
+            note="Cada fila: la clave primaria declarada de una tabla."))
+    if foreign:
+        section.append(model.DataTable(
+            header=["Tabla origen", "Columna(s) FK", "→ Tabla destino",
+                    "Columna(s) destino"],
+            rows=[[model._safe_str(f.get("table")), _joined(f, "columns"),
+                   model._safe_str(f.get("referenced_table")),
+                   _joined(f, "referenced_columns")]
+                  for f in foreign],
+            title="Claves foráneas declaradas",
+            note="Cada fila: una FK declarada — origen → destino."))
+    if unique:
+        section.append(model.DataTable(
+            header=["Tabla", "Columna(s) UNIQUE"],
+            rows=[[model._safe_str(u.get("table")), _joined(u, "columns")]
+                  for u in unique],
+            title="Restricciones UNIQUE declaradas"))
+    return section
+
+
+# --------------------------------------------------------------------------- #
+# Layer 2 — primary-key candidates (from the profile).
+# --------------------------------------------------------------------------- #
+def _pk_candidates_section(profile: dict, mark: bool) -> list:
+    """Render the PK-candidates layer; ``[]`` when the profile lists none.
+    ``mark`` picks the intro variant whose glossary terms carry ``[[term:…]]``
+    tags; candidates absent from the profile columns show placeholder cells.
+    """
+    candidates = [k for k in (profile.get("key_candidates") or []) if k is not None]
+    if not candidates:
+        return []
+    if mark:
+        term = "[[term:pk]]**clave primaria**[[/term]]"
+        intro = (
+            f"Columnas **candidatas a {term}**: su "
+            "[[term:cardinalidad]]cardinalidad[[/term]] iguala al número de filas y "
+            "no tienen nulos. Son candidatas, no una clave declarada: la base no "
+            "las marca como tal.")
+    else:
+        intro = (
+            "Columnas **candidatas a clave primaria**: su cardinalidad iguala al "
+            "número de filas y no tienen nulos. Son candidatas, no una clave "
+            "declarada.")
+    columns = _columns_by_name(profile)
+    table_rows = []
+    for candidate in candidates:
+        info = columns.get(candidate) or {}
+        kind = info.get("inferred_type") or info.get("physical_type") or "—"
+        table_rows.append([
+            model._safe_str(candidate), _fmt_int(info.get("distinct_count")),
+            _fmt_pct_fraction(info.get("unique_pct")), model._safe_str(kind)])
+    return [
+        model.Heading(text="Candidatos a clave primaria", level=2),
+        model.Markdown(text=intro),
+        model.DataTable(
+            header=["Columna", "Valores distintos", "% único", "Tipo"],
+            rows=table_rows, title="Candidatas a clave primaria",
+            note=f"{_fmt_int(profile.get('n_rows'))} filas en total como referencia."),
+    ]
+
+
+# --------------------------------------------------------------------------- #
+# Layer 3a — inter-table FK candidates (containment) + join graph.
+# --------------------------------------------------------------------------- #
+def _list_source_tables(db_path: str) -> list:
+    """Table names from ``duckdb_list_tables(db_path)``; ``[]`` on any failure."""
+    listing = None
+    if db_path and duckdb_list_tables is not None:
+        try:
+            listing = duckdb_list_tables(db_path)
+        except Exception:  # noqa: BLE001
+            listing = None
+    ok = _is_dict(listing) and listing.get("status") == "ok"
+    names = (listing.get("tables") or []) if ok else []
+    return [name for name in names if isinstance(name, str)]
+
+
+def _inter_table_section(db_path: str, tables: list, mark: bool) -> list:
+    """Blocks for the inter-table FK layer (containment + join graph), or []."""
+    if infer_fk_containment_duckdb is None or len(tables) < 2:
+        return []
+    try:
+        fk = infer_fk_containment_duckdb(db_path, tables=tables)
+    except Exception:  # noqa: BLE001
+        return []
+    if not _is_dict(fk) or fk.get("status") != "ok":
+        return []
+    candidates = [c for c in (fk.get("fk_candidates") or []) if _is_dict(c)]
+    if not candidates:
+        return []
+
+    containment = ("[[term:containment]]containment (inclusión de valores)[[/term]]"
+                   if mark else "containment (inclusión de valores)")
+    fk_term = "[[term:fk]]**claves foráneas**[[/term]]" if mark else "**claves foráneas**"
+    blocks = [
+        model.Heading(text="Claves foráneas candidatas (inter-tabla)", level=2),
+        model.Markdown(text=(
+            f"La fuente tiene varias tablas. Estas {fk_term} candidatas se "
+            f"infieren por señal de nombre y por {containment}. No están "
+            "declaradas por la base; son la relación más probable según los "
+            "datos.")),
+    ]
+
+    shown = candidates[:MAX_FK_ROWS]
+    rows = []
+    for c in shown:
+        rows.append([
+            f"{model._safe_str(c.get('from_table'))}.{model._safe_str(c.get('from_col'))}",
+            f"{model._safe_str(c.get('to_table'))}.{model._safe_str(c.get('to_col'))}",
+            _fmt_ratio(c.get("inclusion")),
+            model._safe_str(c.get("cardinality") or "—"),
+            "sí" if c.get("name_match") else "no",
+        ])
+    note = "Ordenadas por señal de nombre e inclusión."
+    if len(candidates) > len(shown):
+        note += f" Se muestran {len(shown)} de {len(candidates)} candidatas."
+    blocks.append(model.DataTable(
+        header=["Origen", "→ Destino", "Inclusión", "Cardinalidad", "Coincide nombre"],
+        rows=rows, title="FK candidatas por containment", note=note))
+
+    # Join graph: node roles + a pasteable Mermaid diagram, kept together.
+    if build_join_graph is not None:
+        try:
+            graph = build_join_graph(candidates, tables=tables)
+        except Exception:  # noqa: BLE001
+            graph = None
+        if _is_dict(graph):
+            graph_blocks = [model.Heading(text="Grafo de relaciones", level=3)]
+            nodes = [n for n in (graph.get("nodes") or []) if _is_dict(n)]
+            if nodes:
+                node_rows = [[
+                    model._safe_str(n.get("table")),
+                    model._safe_str(n.get("role") or "—"),
+                    _fmt_int(n.get("out_degree")),
+                    _fmt_int(n.get("in_degree")),
+                ] for n in nodes]
+                graph_blocks.append(model.DataTable(
+                    header=["Tabla", "Rol", "FK salientes", "FK entrantes"],
+                    rows=node_rows, title="Tablas y su rol en el grafo",
+                    note="Rol: fact (apunta a otras), dimension (referenciada), "
+                         "bridge (ambas), standalone (aislada)."))
+            hubs = [h for h in (graph.get("hubs") or []) if h]
+            if hubs:
+                graph_blocks.append(model.Markdown(text=(
+                    "Tablas con más relaciones salientes (candidatas a tabla de "
+                    "hechos): " + ", ".join(model._safe_str(h) for h in hubs) + ".")))
+            mermaid = model._safe_str(graph.get("mermaid")).strip()
+            if mermaid:
+                graph_blocks.append(model.Markdown(text=(
+                    "Diagrama de las relaciones (pegable en un bloque Mermaid):")))
+                graph_blocks.append(model.Markdown(
+                    text="```mermaid\n" + mermaid + "\n```"))
+            if len(graph_blocks) > 1:
+                blocks.append(model.Group(blocks=graph_blocks,
+                                          title="Grafo de relaciones"))
+
+    skipped = [s for s in (fk.get("skipped") or []) if s]
+    if skipped:
+        blocks.append(model.Note(
+            "Algunos pares se omitieron por tamaño: "
+            + "; ".join(model._safe_str(s) for s in skipped) + "."))
+    return blocks
+
+
+# --------------------------------------------------------------------------- #
+# Layer 3b — intra-table FK candidates (name+cardinality heuristic).
+# --------------------------------------------------------------------------- #
+def _intra_table_section(profile: dict, mark: bool) -> list:
+    """Blocks for the intra-table FK heuristic, or [] when nothing is suggested.
+
+    Advisory layer: a missing or failing heuristic degrades to [], never raises.
+    """
+    if suggest_intratable_fk_candidates is None:
+        return []
+    try:
+        raw = suggest_intratable_fk_candidates(profile)
+    except Exception:  # noqa: BLE001
+        return []
+    suggestions = [item for item in (raw or []) if _is_dict(item)]
+    if not suggestions:
+        return []
+
+    fk_term = "[[term:fk]]**claves foráneas**[[/term]]" if mark else "**claves foráneas**"
+    rows = [[
+        model._safe_str(s.get("column")),
+        model._safe_str(s.get("ref_table_guess") or "—"),
+        _fmt_int(s.get("distinct_count")),
+        _fmt_pct_fraction(s.get("unique_pct")),
+        model._safe_str(s.get("inferred_type") or s.get("physical_type") or "—"),
+        model._safe_str(s.get("reason") or ""),
+    ] for s in suggestions]
+    heading = model.Heading(
+        text="Posibles claves foráneas (heurística de nombre)", level=2)
+    intro = model.Markdown(text=(
+        "No hay otras tablas que referenciar, pero algunas columnas **parecen** "
+        f"{fk_term} por su nombre (terminan en «id») y su cardinalidad (muchos "
+        "valores repetidos, N:1). Es una **sugerencia heurística**, no una "
+        "afirmación: el nombre de la tabla destino es una conjetura y no se "
+        "comprueba inclusión de valores contra ninguna tabla real."))
+    table = model.DataTable(
+        header=["Columna", "Posible tabla", "Valores distintos", "% único",
+                "Tipo", "Motivo"],
+        rows=rows, title="Posibles FK por nombre y cardinalidad",
+        note="Heurística: posibles falsos positivos/negativos. No confirma containment.")
+    caveat = model.Note(
+        "Estas sugerencias se basan solo en el nombre y la cardinalidad. Para "
+        "confirmarlas haría falta la tabla destino y comprobar la inclusión de "
+        "valores (containment).")
+    return [heading, intro, table, caveat]
+
+
+# --------------------------------------------------------------------------- #
+# Entry point.
+# --------------------------------------------------------------------------- #
+def _intro_blocks(mark: bool) -> list:
+    """Chapter heading plus its intro paragraph; ``mark`` adds glossary markers."""
+    fk = "[[term:fk]]clave foránea[[/term]]" if mark else "clave foránea"
+    pk = "[[term:pk]]clave primaria[[/term]]" if mark else "clave primaria"
+    return [model.Heading(text=CHAPTER_TITLE, level=1), model.Markdown(text=(
+        f"Este capítulo analiza las **relaciones de clave** de la tabla: cuál es "
+        f"la {pk} y cuáles son las {fk}. Cuando la base las **declara** como "
+        "restricciones del esquema, se muestran tal cual; cuando no, se proponen "
+        "las más probables a partir de los datos —por containment entre tablas o, "
+        "en una sola tabla, por una heurística de nombre y cardinalidad— siempre "
+        "marcadas como candidatas, nunca como hechos."))]
+
+
+def build_relaciones(profile: dict, ctx: dict):
+    """Assemble the RELACIONES chapter from whichever relation layers apply.
+
+    Layers, in document order: declared keys, primary-key candidates,
+    inter-table FK candidates and — only when neither a declared nor an
+    inter-table FK was found — the intra-table name/cardinality heuristic.
+
+    Args:
+        profile: the ``eda`` group TableProfile dict; a non-dict value is
+            treated as an empty profile.
+        ctx: presentation context; a non-dict value is treated as empty.
+            ``db_path`` and ``table`` are read from it for the declared-key
+            and inter-table layers, and the dict itself is handed to
+            ``_register_terms``, whose result switches term marking on/off.
+
+    Returns:
+        A ``model.Chapter`` (intro blocks followed by every non-empty layer),
+        or ``None`` when every layer came back empty.
+    """
+    profile = profile if isinstance(profile, dict) else {}
+    ctx = ctx if isinstance(ctx, dict) else {}
+    db_path, table = ctx.get("db_path"), ctx.get("table")
+    mark = _register_terms(ctx)
+
+    # Keys the schema itself declares; a falsy result means no declared layer.
+    declared = _declared_keys(db_path, table)
+    declared_blocks = _declared_section(declared) if declared else []
+    declared_has_fk = bool(declared and declared.get("foreign_keys"))
+
+    # Primary-key candidates coming from the profile.
+    pk_blocks = _pk_candidates_section(profile, mark)
+
+    # FK candidates across the tables of the same source.
+    tables = _list_source_tables(db_path)
+    inter_blocks = _inter_table_section(db_path, tables, mark)
+
+    # The heuristic is only a fallback: once a real FK exists (declared or
+    # inferred inter-table) it would just add noise, so it is skipped.
+    has_real_fk = declared_has_fk or bool(inter_blocks)
+    intra_blocks = [] if has_real_fk else _intra_table_section(profile, mark)
+
+    body = declared_blocks + pk_blocks + inter_blocks + intra_blocks
+    if body:
+        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
+                             version=CHAPTER_VERSION,
+                             blocks=_intro_blocks(mark) + body)
+    return None  # nothing to report about relations: the chapter does not apply
@@ -0,0 +1,273 @@
+"""Tests for the RELACIONES chapter — DoD: golden(s) + edges + no-cut render.
+
+Two goldens covering the two real paths of the chapter:
+
+- **Intra-table** (a single table, no db source for relations): the chapter shows
+  the primary-key candidates from the profile and the heuristic foreign-key
+  suggestions (name + cardinality), explicitly flagged as a heuristic. Renders to
+  PDF and PPTX with nothing cut.
+- **Inter-table** (a real DuckDB file with two related tables, customers/orders,
+  with a declared FK): the chapter shows the declared keys, the containment-based
+  FK candidates and the join graph (roles + a pasteable Mermaid diagram).
+
+Edges: a profile with no key candidate and no FK-looking column returns None;
+``None`` / ``{}`` profiles do not raise. The chapter registers its glossary terms.
+
+Layers that depend on the sibling registry functions delegated alongside this
+chapter (``detect_declared_keys_duckdb``, ``suggest_intratable_fk_candidates``)
+are asserted **conditionally on the function being importable**, so the chapter's
+honest-degradation contract is what is tested, never a hard dependency on import
+timing.
+"""
+
+import os
+import tempfile
+
+import duckdb
+from pptx import Presentation
+from pypdf import PdfReader
+
+from datascience.automatic_eda.chapters.relaciones import build_relaciones
+from datascience.automatic_eda.model import Chapter, Group, GlossaryCollector
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+# The optional sibling functions: their layers are asserted only when present.
+try:
+    from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb
+except Exception:  # noqa: BLE001
+    detect_declared_keys_duckdb = None
+try:
+    from datascience.suggest_intratable_fk_candidates import (
+        suggest_intratable_fk_candidates,
+    )
+except Exception:  # noqa: BLE001
+    suggest_intratable_fk_candidates = None
+
+
+# --------------------------------------------------------------------------- #
+# Helpers.
+# --------------------------------------------------------------------------- #
+def _flatten(blocks) -> list:
+    """Return the leaf blocks in order, expanding every Group recursively."""
+    leaves = []
+    for block in blocks:
+        if not isinstance(block, Group):
+            leaves.append(block)
+            continue
+        leaves += _flatten(block.blocks)  # a Group adds its children, not itself
+    return leaves
+
+
+def _text_of(chapter: Chapter) -> str:
+    """Gather every visible string of a chapter into one newline-joined text.
+
+    For each leaf block, in order: the ``text``/``title``/``note`` attributes
+    that are strings, then the ``header`` cells, then every ``rows`` cell (a row
+    that is not a list/tuple is stringified as a whole).
+    """
+    pieces = []
+    for block in _flatten(chapter.blocks):
+        values = (getattr(block, a, None) for a in ("text", "title", "note"))
+        pieces.extend(v for v in values if isinstance(v, str))
+        header = getattr(block, "header", None)
+        if isinstance(header, list):
+            pieces.extend(map(str, header))
+        rows = getattr(block, "rows", None)
+        for row in rows if isinstance(rows, list) else ():
+            cells = row if isinstance(row, (list, tuple)) else [row]
+            pieces.extend(map(str, cells))
+    return "\n".join(pieces)
+
+
+def _render_both(chapter: Chapter, tag: str):
+    """Render the chapter to PDF and PPTX; return (pdf_text, n_slides)."""
+    # The scratch directory is deleted on exit; both outputs are read before that.
+    with tempfile.TemporaryDirectory(prefix=f"relaciones_{tag}_") as tmp:
+        pdf_path = os.path.join(tmp, "out.pdf")
+        pptx_path = os.path.join(tmp, "out.pptx")
+        meta = {"title": f"EDA — {tag}"}
+        render_automatic_eda_pdf([chapter], pdf_path, meta)
+        render_automatic_eda_pptx([chapter], pptx_path, meta)
+        assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
+        assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0
+        text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages)
+        return text, len(Presentation(pptx_path).slides)
+
+
+# --------------------------------------------------------------------------- #
+# Fixtures.
+# --------------------------------------------------------------------------- #
+def _titanic_profile() -> dict:
+    """A single-table profile: a PK candidate + a column that looks like a FK."""
+    return {
+        "table": "titanic",
+        "source": "/data/titanic.csv",
+        "n_rows": 891,
+        "n_cols": 4,
+        "key_candidates": ["PassengerId"],
+        "columns": [
+            {"name": "PassengerId", "inferred_type": "numeric",
+             "physical_type": "BIGINT", "distinct_count": 891,
+             "unique_pct": 1.0, "flags": ["possible_id"]},
+            {"name": "ticket_id", "inferred_type": "numeric",
+             "physical_type": "BIGINT", "distinct_count": 681,
+             "unique_pct": 0.76, "flags": []},
+            {"name": "fare", "inferred_type": "numeric",
+             "physical_type": "DOUBLE", "distinct_count": 248,
+             "unique_pct": 0.28, "flags": []},
+            {"name": "sex", "inferred_type": "categorical",
+             "physical_type": "VARCHAR", "distinct_count": 2,
+             "unique_pct": 0.002, "flags": []},
+        ],
+    }
+
+
+def _make_relational_db(path: str) -> None:
+    """Create a small DuckDB with customers(id) <- orders(customer_id), real FK."""
+    # The context manager closes the connection even if a statement fails.
+    with duckdb.connect(path) as con:
+        con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)")
+        con.execute(
+            "CREATE TABLE orders(id INTEGER PRIMARY KEY, "
+            "customer_id INTEGER REFERENCES customers(id), amount DOUBLE)")
+        con.execute("INSERT INTO customers VALUES "
+                    "(1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e')")
+        con.execute("INSERT INTO orders VALUES "
+                    "(1,1,10.0),(2,1,20.0),(3,2,30.0),(4,3,40.0),"
+                    "(5,3,50.0),(6,4,60.0),(7,5,70.0),(8,2,80.0)")
+
+
+def _orders_profile() -> dict:
+    """A profile for the `orders` table of the relational DB."""
+    return {
+        "table": "orders",
+        "source": "orders",
+        "n_rows": 8,
+        "n_cols": 3,
+        "key_candidates": ["id"],
+        "columns": [
+            {"name": "id", "inferred_type": "numeric", "physical_type": "INTEGER",
+             "distinct_count": 8, "unique_pct": 1.0, "flags": ["possible_id"]},
+            {"name": "customer_id", "inferred_type": "numeric",
+             "physical_type": "INTEGER", "distinct_count": 5, "unique_pct": 0.625,
+             "flags": []},
+            {"name": "amount", "inferred_type": "numeric", "physical_type": "DOUBLE",
+             "distinct_count": 8, "unique_pct": 1.0, "flags": []},
+        ],
+    }
+
+
+# --------------------------------------------------------------------------- #
+# Golden 1 — intra-table.
+# --------------------------------------------------------------------------- #
+def test_golden_intra_table_pk_and_fk_heuristic():
+    """Single table: PK candidate shown; FK heuristic shown (if fn available);
+    renders to PDF + PPTX with nothing cut."""
+    prof = _titanic_profile()
+    glossary = GlossaryCollector()
+    # No db_path: only the profile-derived layers apply (no declared, no inter).
+    chapter = build_relaciones(prof, {"glossary": glossary})
+
+    assert isinstance(chapter, Chapter)
+    assert chapter.id == "relaciones"
+    text = _text_of(chapter)
+
+    # PK candidate is always present (comes from the profile).
+    assert "Candidatos a clave primaria" in text
+    assert "PassengerId" in text
+
+    # Glossary terms got registered.
+    for key in ("pk", "fk", "cardinalidad"):
+        assert glossary.has(key)
+
+    # FK heuristic layer: present iff the delegated function is importable.
+    if suggest_intratable_fk_candidates is not None:
+        assert "Posibles claves foráneas" in text
+        assert "ticket_id" in text
+        # NOTE(review): checks the title only; fare/PassengerId exclusion is unasserted.
+        assert "Posibles FK por nombre" in text
+
+    pdf_text, n_slides = _render_both(chapter, "intra")
+    assert "PassengerId" in pdf_text
+    assert n_slides >= 1
+
+
+# --------------------------------------------------------------------------- #
+# Golden 2 — inter-table (real DuckDB).
+# --------------------------------------------------------------------------- #
+def test_golden_inter_table_containment_and_join_graph():
+    """Two related tables: declared FK (if fn available) + containment FK
+    candidate + Mermaid join graph."""
+    tmp = tempfile.mkdtemp(prefix="relaciones_db_")
+    db_path = os.path.join(tmp, "shop.duckdb")
+    _make_relational_db(db_path)
+
+    prof = _orders_profile()
+    glossary = GlossaryCollector()
+    chapter = build_relaciones(
+        prof, {"db_path": db_path, "table": "orders", "glossary": glossary})
+
+    assert isinstance(chapter, Chapter)
+    text = _text_of(chapter)
+
+    # Inter-table containment FK candidate: customer_id -> customers.id. This path
+    # uses infer_fk_containment_duckdb + build_join_graph, both already in the
+    # registry, so it must be present.
+    assert "Claves foráneas candidatas (inter-tabla)" in text
+    assert "orders.customer_id" in text
+    assert "customers.id" in text
+    # Join graph with a pasteable Mermaid diagram.
+    assert "Grafo de relaciones" in text
+    assert "mermaid" in text
+    assert "graph LR" in text
+    assert "containment" in text.lower()
+
+    # Declared-keys layer: present iff the delegated function is importable.
+    if detect_declared_keys_duckdb is not None:
+        assert "Claves declaradas en el esquema" in text
+        assert "Claves foráneas declaradas" in text
+
+    pdf_text, n_slides = _render_both(chapter, "inter")
+    assert "customer_id" in pdf_text
+    assert n_slides >= 1
+
+
+# --------------------------------------------------------------------------- #
+# Edges.
+# --------------------------------------------------------------------------- #
+def test_none_when_no_relations():
+    """No key candidates, no FK-looking columns, no db source -> None."""
+    prof = {
+        "table": "flat", "n_rows": 100, "n_cols": 2, "key_candidates": [],
+        "columns": [
+            {"name": "value", "inferred_type": "numeric", "physical_type": "DOUBLE",
+             "distinct_count": 50, "unique_pct": 0.5, "flags": []},
+            {"name": "label", "inferred_type": "categorical",
+             "physical_type": "VARCHAR", "distinct_count": 3, "unique_pct": 0.03,
+             "flags": []},
+        ],
+    }
+    assert build_relaciones(prof, {}) is None
+
+
+def test_empty_and_none_profile_do_not_raise():
+    """Degenerate profile/ctx inputs yield None instead of raising."""
+    degenerate = [(None, None), ({}, {}), ({}, {"glossary": GlossaryCollector()})]
+    for prof, ctx in degenerate:
+        assert build_relaciones(prof, ctx) is None
+
+
+def test_pk_candidate_only_builds_chapter():
+    """A profile with only a key candidate (no FK anything, no db) still builds:
+    the relations chapter applies because there is a PK candidate to report."""
+    prof = {
+        "table": "t", "n_rows": 10, "n_cols": 1, "key_candidates": ["row_id"],
+        "columns": [
+            {"name": "row_id", "inferred_type": "numeric", "physical_type": "BIGINT",
+             "distinct_count": 10, "unique_pct": 1.0, "flags": ["possible_id"]},
+        ],
+    }
+    chapter = build_relaciones(prof, {})
+    assert isinstance(chapter, Chapter)
+    assert "Candidatos a clave primaria" in _text_of(chapter)
@@ -33,6 +33,7 @@ CHAPTER_ORDER = [
    "cat_distr",     # categorical distributions
    "calidad",       # data quality
    "correlacion",   # correlations / associations
+    "relaciones",    # key relations: declared/candidate PK + FK (inter/intra-table)
    "modelos",       # cheap models (PCA/KMeans/outliers)
    "timeseries",    # time-series analysis
    "geospatial",    # geospatial
@@ -0,0 +1,253 @@
+"""Tests for the Markdown completeness appendix (report 2053).
+
+The AutomaticEDA Markdown is the output meant to be *pasted into an LLM*, so it
+must carry EVERYTHING the engine computed — even the numbers the human-facing
+chapters (shared with the PDF/PPTX) drop for readability. ``render_md`` appends a
+full-data appendix built from ``meta['profile']`` that closes the six losses the
+evaluation found:
+
+1. the complete association matrix (every pair, incl. correlation_ratio /
+   cramers_v) — not just the top extremes;
+2. every numeric statistic for every numeric column (skew/kurtosis/percentiles);
+3. the concrete recommended re-expression;
+4. KMeans ``scores_by_k``;
+5. the normality test statistics;
+6. correct headers for bar/scree figure tables (not ``Desde/Hasta/Frecuencia``).
+
+Self-contained: a synthetic profile, no DuckDB, no heavy renderer.
+"""
+
+import os
+import sys
+
+import pytest  # noqa: F401
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))  # python/functions
+if _FUNCTIONS not in sys.path:
+    sys.path.insert(0, _FUNCTIONS)
+
+from datascience.automatic_eda import model  # noqa: E402
+from datascience.automatic_eda.render_md_impl import (  # noqa: E402
+    _bars_table,
+    _is_histogram_caption,
+    _profile_appendix,
+    render_md,
+)
+
+
+# --------------------------------------------------------------------------- #
+# Synthetic profile fixtures.
+# --------------------------------------------------------------------------- #
+def _numeric(skew, kurtosis):
+    """Full numeric stat block; only ``skew`` and ``kurtosis`` vary per column."""
+    return dict(
+        count=100, min=0.0, max=10.0, mean=5.0, median=5.0,
+        mode=4.0, std=2.0, variance=4.0, cv=0.4,
+        p1=0.1, p5=0.5, p25=2.5, p50=5.0, p75=7.5,
+        p95=9.5, p99=9.9, iqr=5.0, skew=skew, kurtosis=kurtosis,
+        n_outliers=1, distribution_type="normal",
+    )
+
+
+def _profile():
+    """A small but structurally faithful TableProfile (3 numeric, 2 categorical)."""
+    pairs = [
+        {"a": "A", "b": "B", "a_type": "numeric", "b_type": "numeric",
+         "method": "pearson/spearman", "value": 0.8,
+         "p_value": 1e-9, "p_value_adjusted": 2e-9, "significant": True},
+        {"a": "A", "b": "C", "a_type": "numeric", "b_type": "numeric",
+         "method": "pearson/spearman", "value": -0.3,
+         "p_value": 0.01, "p_value_adjusted": 0.02, "significant": True},
+        {"a": "A", "b": "Cat1", "a_type": "numeric", "b_type": "categorical",
+         "method": "correlation_ratio", "value": 0.45,
+         "p_value": 0.001, "p_value_adjusted": 0.002, "significant": True},
+        # The single cat-cat pair the human chapter never shows.
+        {"a": "Cat1", "b": "Cat2", "a_type": "categorical",
+         "b_type": "categorical", "method": "cramers_v", "value": 0.11,
+         "p_value": 0.04, "p_value_adjusted": 0.05, "significant": False},
+    ]
+    return {
+        "correlations": {
+            "pairs": pairs,
+            "multiple_testing": {"method": "bh", "n_tests": 4, "n_rejected": 3},
+        },
+        "columns": [
+            {"name": "A", "count": 100, "numeric": _numeric(0.0, -1.2),
+             "reexpression": {"recommended": "none", "ladder_power": 1.0,
+                              "reason": "symmetric", "alternatives": []}},
+            {"name": "B", "count": 100, "numeric": _numeric(4.77, 33.1),
+             "reexpression": {"recommended": "log1p", "ladder_power": 0.0,
+                              "reason": "skew 4.77 with zeros",
+                              "alternatives": [{"transform": "yeo-johnson"},
+                                               {"transform": "sqrt"}]}},
+            {"name": "C", "count": 100, "numeric": _numeric(-0.6, 0.2)},
+            {"name": "Cat1", "categorical": {"top": [], "mode": "x"}},
+            {"name": "Cat2", "categorical": {"top": [], "mode": "y"}},
+        ],
+        "models": {
+            "kmeans": {
+                "best_k": 3,
+                "scores_by_k": [
+                    {"k": 2, "silhouette": 0.46, "inertia": 900.0},
+                    {"k": 3, "silhouette": 0.50, "inertia": 550.0},
+                    {"k": 4, "silhouette": 0.38, "inertia": 430.0},
+                ],
+                "cluster_sizes": [40, 35, 25],
+            },
+            "normality": {
+                "A": {"n": 100,
+                      "jarque_bera": {"stat": 18.7, "p": 8e-5, "normal": False},
+                      "dagostino": {"stat": 18.1, "p": 1e-4, "normal": False},
+                      "shapiro": {"stat": 0.98, "p": 7e-8, "normal": False},
+                      "is_normal": False},
+                "C": {"n": 100,
+                      "jarque_bera": {"stat": 2.1, "p": 0.35, "normal": True},
+                      "dagostino": {"stat": 1.9, "p": 0.38, "normal": True},
+                      "shapiro": {"stat": 0.99, "p": 0.12, "normal": True},
+                      "is_normal": True},
+            },
+        },
+    }
+
+
+def _dummy_chapters():
+    """A minimal one-chapter document so render_md does not early-return empty."""
+    return model.as_chapters([
+        {"id": "intro", "title": "Intro",
+         "blocks": [{"kind": "markdown", "text": "cuerpo del informe"}]},
+    ])
+
+
+def _render(tmp_path, profile):
+    out = os.path.join(str(tmp_path), "out.md")
+    res = render_md(_dummy_chapters(), out, {"title": "EDA — t", "profile": profile})
+    assert res["path"] == out
+    return open(out, encoding="utf-8").read()
+
+
+def _table_rows(md, section_title):
+    """Count data rows of the first Markdown table under ``section_title``."""
+    seg = md.partition(section_title)[2]  # "" (hence 0 rows) if title is absent
+    rows, in_table, seen_sep = 0, False, False
+    for ln in seg.splitlines():
+        if not ln.startswith("|"):
+            if in_table:
+                break  # any line past the table ends it; never count a later one
+            continue
+        in_table = True
+        marks = set(ln.replace("|", "").replace(" ", ""))
+        if not seen_sep and "-" in marks and marks <= {"-", ":"}:
+            seen_sep = True  # header/body separator (alignment colons allowed)
+        elif seen_sep:
+            rows += 1
+    return rows
+
+
+# --------------------------------------------------------------------------- #
+# Golden: every datum the profile holds reaches the .md.
+# --------------------------------------------------------------------------- #
+def test_appendix_lists_all_correlation_pairs(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "## Apéndice — Datos completos del perfil" in md
+    # All 4 pairs (the real titanic profile has 28; here 4 synthetic).
+    assert _table_rows(md, "### Matriz de asociación") == 4
+    # The cat-cat Cramér's V pair the human chapter drops is present.
+    assert "Cat1 ↔ Cat2" in md
+    assert "cramers_v" in md
+    assert "correlation_ratio" in md
+
+
+def test_appendix_has_skew_kurtosis_for_every_numeric(tmp_path):
+    md = _render(tmp_path, _profile())
+    seg = md.split("### Estadísticos numéricos completos", 1)[1].split("###", 1)[0]
+    lines = [l for l in seg.splitlines() if l.startswith("|")]
+    header = [h.strip() for h in lines[0].strip("|").split("|")]
+    assert "skew" in header and "kurtosis" in header
+    ski, kui = header.index("skew"), header.index("kurtosis")
+    data = lines[2:]  # skip header + separator
+    assert len(data) == 3  # exactly the 3 numeric columns
+    for row in data:
+        cells = [c.strip() for c in row.strip("|").split("|")]
+        assert cells[ski] != "", f"missing skew in {cells[0]}"
+        assert cells[kui] != "", f"missing kurtosis in {cells[0]}"
+
+
+def test_appendix_has_extended_percentiles(tmp_path):
+    md = _render(tmp_path, _profile())
+    seg = md.split("### Estadísticos numéricos completos", 1)[1]
+    header = [h.strip() for h in seg.splitlines()[2].strip("|").split("|")]
+    for p in ("p1", "p5", "p25", "p75", "p95", "p99"):
+        assert p in header, f"percentile {p} missing from describe header"
+
+
+def test_appendix_names_concrete_reexpression(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "### Re-expresión recomendada" in md
+    assert "log1p" in md  # the concrete transform, not just "consider re-expressing"
+    assert "yeo-johnson" in md  # alternatives listed too
+
+
+def test_appendix_has_kmeans_scores_by_k(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "scores_by_k" in md
+    assert _table_rows(md, "#### KMeans — selección de k") == 3  # k=2,3,4
+
+
+def test_appendix_has_normality_statistics(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "JB stat" in md  # the statistic, not only the p-value
+    assert "Shapiro stat" in md
+    assert _table_rows(md, "#### Tests de normalidad") == 2  # cols A and C
+
+
+# --------------------------------------------------------------------------- #
+# Edge: a profile missing models / correlations degrades, never raises.
+# --------------------------------------------------------------------------- #
+def test_lite_profile_without_models(tmp_path):
+    prof = _profile()
+    prof.pop("models")  # lite: no KMeans/normality
+    md = _render(tmp_path, prof)
+    assert "scores_by_k" not in md  # section skipped
+    assert "Matriz de asociación" in md  # correlations still dumped
+    assert "## Apéndice" in md
+
+
+def test_profile_without_correlations(tmp_path):
+    prof = _profile()
+    prof.pop("correlations")
+    md = _render(tmp_path, prof)  # must not raise
+    assert "Matriz de asociación" not in md
+    assert "Estadísticos numéricos completos" in md  # numeric section still there
+
+
+def test_no_profile_means_no_appendix(tmp_path):
+    out = os.path.join(str(tmp_path), "noprof.md")
+    res = render_md(_dummy_chapters(), out, {"title": "x"})
+    assert res["path"] == out
+    assert "## Apéndice" not in open(out, encoding="utf-8").read()
+
+
+def test_appendix_helper_is_defensive():
+    """A missing, empty or column-less profile renders as an empty appendix."""
+    for empty in (None, {}, {"columns": []}):
+        assert _profile_appendix(empty) == ""
+
+
+# --------------------------------------------------------------------------- #
+# Loss #6: bar/scree figure tables get a non-misleading header.
+# --------------------------------------------------------------------------- #
+def test_histogram_caption_detection():
+    assert _is_histogram_caption("Histograma de Age")
+    assert _is_histogram_caption("Distribución de Fare")
+    assert not _is_histogram_caption("Media de Survived por Sex")
+    assert not _is_histogram_caption("Varianza explicada (scree PCA)")
+
+
+def test_bars_table_custom_header():
+    bars = [(0.0, 1.0, 5.0), (1.0, 2.0, 3.0)]
+    hist = _bars_table(bars)  # default histogram header
+    assert "| Desde | Hasta | Frecuencia |" in hist
+    bar = _bars_table(bars, ("Inicio", "Fin", "Valor"))
+    assert "| Inicio | Fin | Valor |" in bar
+    assert "Frecuencia" not in bar
@@ -139,10 +139,17 @@ class Group:
    it starts on a fresh page and flows (honest degradation, never cut). Use it to
    bind ``Heading`` + ``Markdown`` + ``Figure`` of one idea together (see the
    DISTR NUM / AGREGACION chapters).
+
+    When ``page_break_before`` is True the renderer additionally forces the group
+    to *start* on a fresh page/slide (unless the current one is already empty), so
+    a chapter can give each unit its own page — e.g. one categorical column per
+    page (see CAT DISTR). It is purely additive: the default False keeps the plain
+    keep-together behaviour for every existing chapter.
    """

    blocks: list = field(default_factory=list)
    title: Optional[str] = None
+    page_break_before: bool = False
    kind: str = field(default="group", init=False)


@@ -228,7 +235,9 @@ def as_block(obj: Any):
                return Note(text=_safe_str(obj.get("text")))
            if cls is Group:
                return Group(blocks=as_blocks(obj.get("blocks")),
-                             title=obj.get("title"))
+                             title=obj.get("title"),
+                             page_break_before=bool(
+                                 obj.get("page_break_before", False)))
            if cls is GlossaryEntry:
                return GlossaryEntry(key=_safe_str(obj.get("key")),
                                     label=_safe_str(obj.get("label")),
@@ -0,0 +1,748 @@
+"""AutomaticEDA Markdown serializer — one self-contained file to paste to an LLM.
+
+Same document model as the PDF/PPTX renderers (an ordered list of
+:class:`Chapter`, each a list of format-independent blocks) but emitted as plain
+**Markdown** instead of a binary. The goal is different from the other two
+renderers: a Markdown EDA is meant to be *pasted into an LLM*, so it prioritises
+TEXT and DATA over visuals. Tables become Markdown tables (every row dumped, no
+pagination — nothing is cut because there are no pages); a ``Figure`` becomes its
+caption plus, when possible, the underlying bar/histogram data as a Markdown
+table, capped at ``_MAX_BAR_ROWS`` rows (an LLM cannot see the image); glossary
+term markers are stripped while ``**bold**`` is kept (it is valid Markdown).
+
+dict-no-throw (the ``eda`` group style): :func:`render_md` never raises. On a
+fatal error it returns ``{path: None, ...}`` with a ``note`` explaining why; a
+malformed block degrades to a readable note rather than crashing the document.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+
+from . import model
+
+# Glossary span markers (kept text, dropped markers). We intentionally do NOT use
+# ``text_layout.strip_inline_md`` for Markdown blocks because that also removes
+# ``**bold**`` — valid Markdown we want to preserve when pasting to an LLM.
+# The pattern matches only the opening ``[[term:<key>]]`` marker; the closing
+# ``[[/term]]`` is a fixed string removed with ``str.replace`` in ``_clean_terms``.
+_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")
+# Cap on the rows ``_bars_table`` lists for one figure; any remainder is
+# summarised in a trailing "… (N filas más)" line instead of being printed.
+_MAX_BAR_ROWS = 100
+
+
+# --------------------------------------------------------------------------- #
+# Small helpers.
+# --------------------------------------------------------------------------- #
+def _clean_terms(s) -> str:
+    """Strip glossary markup from ``s`` while leaving the wrapped text intact.
+
+    Both the opening ``[[term:<key>]]`` marker and the closing ``[[/term]]``
+    marker are removed; everything else (``**bold**`` included) is untouched.
+    """
+    without_open = _TERM_OPEN_RE.sub("", model._safe_str(s))
+    return without_open.replace("[[/term]]", "")
+
+
+def _cell(v) -> str:
+    """Turn a value into text that is safe inside one Markdown table cell.
+
+    Pipes are escaped (``|`` -> ``\\|``) so they cannot split the column, and
+    every line break (``\\r\\n``, ``\\r`` or ``\\n``) becomes ``<br>`` so a
+    multi-line value stays in a single cell.
+    """
+    escaped = model._safe_str(v).replace("|", "\\|")
+    # ``\r\n`` goes first so a Windows line ending yields one ``<br>``, not two.
+    for eol in ("\r\n", "\r", "\n"):
+        escaped = escaped.replace(eol, "<br>")
+    return escaped
+
+
+def _slug(text: str) -> str:
+    """Heading anchor in the GitHub manner, with hyphen runs collapsed.
+
+    The text is stripped and lower-cased; alphanumeric characters are kept,
+    spaces and hyphens become ``-`` and every other symbol is dropped. Runs of
+    hyphens are then squeezed to one and leading/trailing hyphens removed.
+    """
+    lowered = model._safe_str(text).strip().lower()
+    kept = "".join(
+        ch if ch.isalnum() else "-"
+        for ch in lowered
+        if ch.isalnum() or ch in " -")
+    return re.sub(r"-{2,}", "-", kept).strip("-")
+
+
+def _fmt_num(v) -> str:
+    """Compact number for the data tables (ints as ints, else 4 sig figs).
+
+    Values ``float()`` cannot convert fall back to ``model._safe_str(v)``. NaN is
+    rendered as ``"NaN"`` and infinities as ``"inf"`` / ``"-inf"``.
+    """
+    try:
+        f = float(v)
+    except Exception:  # noqa: BLE001
+        return model._safe_str(v)
+    if f != f:  # NaN
+        return "NaN"
+    # The magnitude test comes first so ±inf never reaches ``int(f)``, which
+    # raises OverflowError for infinities (and would escape this function).
+    if abs(f) < 1e15 and f == int(f):
+        return str(int(f))
+    return f"{f:.4g}"
+
+
+def _fmt_int(v) -> str:
+    """Render ``v`` through ``int()``; fall back to ``model._safe_str`` on failure."""
+    try:
+        rendered = str(int(v))
+    except Exception:  # noqa: BLE001
+        rendered = model._safe_str(v)
+    return rendered
+
+
+def _now_iso() -> str:
+    """Current UTC time formatted as ``YYYY-MM-DD HH:MM:SS UTC``."""
+    import datetime as _dt
+    stamp = _dt.datetime.now(_dt.timezone.utc)
+    return stamp.strftime("%Y-%m-%d %H:%M:%S UTC")
+
+
+# --------------------------------------------------------------------------- #
+# Document header (title + metadata blockquote + numbered index).
+# --------------------------------------------------------------------------- #
+def _meta_block(meta: dict) -> list:
+    """Assemble the header blockquote lines, leaving out anything absent.
+
+    Dataset / source / storage are read from ``meta['ctx']`` first and from the
+    top level of ``meta`` otherwise. The dimensions line needs both row and
+    column counts. The generation stamp defaults to the current UTC time and the
+    engine line is always present.
+    """
+    raw_ctx = meta.get("ctx")
+    ctx = raw_ctx if isinstance(raw_ctx, dict) else {}
+    out: list = []
+
+    def emit(label, value) -> None:
+        # Skip absent values and ones that stringify to nothing / "none".
+        text = "" if value is None else model._safe_str(value).strip()
+        if not text or text.lower() == "none":
+            return
+        out.append(f"**{label}:** {text}")
+
+    for label, key in (("Dataset", "dataset_name"),
+                       ("Fuente", "source_origin"),
+                       ("Almacenamiento", "storage")):
+        emit(label, ctx.get(key) or meta.get(key))
+
+    rows = ctx.get("n_rows", meta.get("n_rows"))
+    cols = ctx.get("n_cols", meta.get("n_cols"))
+    if not (rows is None or cols is None):
+        out.append(
+            f"**Dimensiones:** {_fmt_int(rows)} filas × {_fmt_int(cols)} columnas")
+    emit("Generado", meta.get("generated_at") or _now_iso())
+    out.append(f"**Motor:** {model.ENGINE_NAME} v{model.ENGINE_VERSION}")
+    return out
+
+
+# --------------------------------------------------------------------------- #
+# Per-block serializers. Each returns a Markdown string (no surrounding blanks;
+# the caller separates blocks with a blank line).
+# --------------------------------------------------------------------------- #
+def _md_heading(block) -> str:
+    """Serialize a Heading block as an ATX Markdown heading.
+
+    Block level 1 maps to ``###`` because ``#`` and ``##`` are reserved for the
+    document title and the chapter headings. The depth is clamped to 3–6 hashes,
+    so a level below 1 can no longer produce a reserved depth or no hashes at all.
+    """
+    level = int(getattr(block, "level", 1) or 1)
+    hashes = "#" * max(3, min(level + 2, 6))
+    text = _clean_terms(getattr(block, "text", "")).strip()
+    return f"{hashes} {text}"
+
+
+def _md_markdown(block) -> str:
+    """Serialize a Markdown block: text kept verbatim minus glossary markers.
+
+    ``**bold**`` survives; trailing newlines are trimmed.
+    """
+    raw = getattr(block, "text", "")
+    cleaned = _clean_terms(raw)
+    return cleaned.rstrip("\n")
+
+
+def _md_kv_table(block) -> str:
+    """Serialize a KVTable as a two-column ``Campo`` / ``Valor`` Markdown table.
+
+    An optional bold title line precedes the table. A row that cannot be indexed
+    as ``row[0], row[1]`` is emitted whole in the first column with an empty value.
+    """
+    out: list = []
+    heading = getattr(block, "title", None)
+    if heading:
+        out += [f"**{_clean_terms(heading).strip()}**", ""]
+    out += ["| Campo | Valor |", "| --- | --- |"]
+    for entry in (getattr(block, "rows", []) or []):
+        try:
+            key, val = entry[0], entry[1]
+        except Exception:  # noqa: BLE001
+            key, val = entry, ""
+        out.append(f"| {_cell(key)} | {_cell(val)} |")
+    return "\n".join(out)
+
+
+def _md_data_table(block) -> str:
+    """Serialize a DataTable as a Markdown table, listing every row.
+
+    Layout: optional bold title, header row, separator, one line per data row,
+    optional italic note. With no header, ``col1..colN`` is synthesised from the
+    widest row. Rows are fitted to the header width: short rows are padded with
+    empty cells and cells beyond the header width are not emitted.
+    """
+    def md_row(cells) -> str:
+        return "| " + " | ".join(cells) + " |"
+
+    out: list = []
+    heading = getattr(block, "title", None)
+    if heading:
+        out.extend([f"**{_clean_terms(heading).strip()}**", ""])
+    header = list(getattr(block, "header", []) or [])
+    rows = list(getattr(block, "rows", []) or [])
+    if not header:
+        widest = max((len(r) for r in rows), default=1)
+        header = [f"col{n}" for n in range(1, widest + 1)]
+    width = len(header)
+    out.append(md_row([_cell(h) for h in header]))
+    out.append(md_row(["---"] * width))
+    for r in rows:  # no pagination: each row gets its own line.
+        out.append(md_row(
+            [_cell(r[i]) if i < len(r) else "" for i in range(width)]))
+    footnote = getattr(block, "note", None)
+    if footnote:
+        out.extend(["", f"*{_clean_terms(footnote).strip()}*"])
+    return "\n".join(out)
+
+
+def _bars_table(bars: list, header: tuple = ("Desde", "Hasta", "Frecuencia")) -> str:
+    """Lay out extracted bar/histogram triples as a three-column Markdown table.
+
+    ``header`` names the columns. The default suits histograms, whose bars are
+    bins with a frequency; callers pass something like ``(Inicio, Fin, Valor)``
+    for bar/scree charts so the table does not claim "Frecuencia" for values that
+    are not counts (report 2053, loss #6). At most ``_MAX_BAR_ROWS`` rows are
+    listed; the rest are summarised in a trailing italic line.
+    """
+    first, second, third = header
+    visible = bars[:_MAX_BAR_ROWS]
+    rows = [f"| {first} | {second} | {third} |", "| --- | --- | --- |"]
+    rows.extend(
+        f"| {_fmt_num(lo)} | {_fmt_num(hi)} | {_fmt_num(height)} |"
+        for lo, hi, height in visible)
+    table = "\n".join(rows)
+    hidden = len(bars) - len(visible)
+    if hidden <= 0:
+        return table
+    return table + f"\n\n*… ({hidden} filas más)*"
+
+
+def _is_histogram_caption(caption: str) -> bool:
+    """Tell whether a figure caption reads like a histogram's.
+
+    The check is a case-insensitive keyword match on "histograma" or
+    "distribución" (with or without the accent). Only histograms have bars that
+    are real ``[Desde, Hasta)`` bins with a frequency; bar charts and the PCA
+    scree plot carry per-group / per-component values and must not get the
+    ``Desde/Hasta/Frecuencia`` header.
+    """
+    lowered = (caption or "").lower()
+    return any(word in lowered
+               for word in ("histograma", "distribución", "distribucion"))
+
+
+def _extract_bars(fig) -> list:
+    """Gather ``(x_from, x_to, height)`` for the bar-like patches of a figure.
+
+    For every axes, patches exposing ``get_width`` / ``get_height`` / ``get_x``
+    with positive width and height are collected. When an axes has three or more
+    of them, any rectangle wider than twice that axes' median width is discarded:
+    reference shapes such as the ``±1σ`` band drawn by ``axvspan`` or a lone
+    boxplot box are far wider than a bin and would otherwise appear as fake
+    bars, while near-uniform bins sit at the median and stay. Never raises —
+    returns ``[]`` on any problem.
+    """
+    def rect_of(patch):
+        # Triple for a positive-area rectangle-like patch, else None.
+        try:
+            width = patch.get_width()
+            height = patch.get_height()
+            left = patch.get_x()
+        except Exception:  # noqa: BLE001 — not a Rectangle-like patch.
+            return None
+        if width and width > 0 and height and height > 0:
+            return (left, left + width, height)
+        return None
+
+    collected: list = []
+    try:
+        for ax in fig.get_axes():
+            candidates = map(rect_of, list(getattr(ax, "patches", []) or []))
+            rects = [r for r in candidates if r is not None]
+            if len(rects) >= 3:
+                spans = sorted(hi - lo for lo, hi, _h in rects)
+                median_w = spans[len(spans) // 2]
+                if median_w > 0:
+                    limit = 2.0 * median_w
+                    rects = [r for r in rects if (r[1] - r[0]) <= limit]
+            collected.extend(rects)
+    except Exception:  # noqa: BLE001
+        return []
+    return collected
+
+
+def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
+    """Serialize a Figure prioritising TEXT + DATA (an LLM cannot see the image).
+
+    Emits the caption line, then — if ``_extract_bars`` finds bars in the
+    matplotlib figure — a Markdown table of their values, headed
+    ``Desde/Hasta/Frecuencia`` for histogram captions and ``Inicio/Fin/Valor``
+    otherwise. When ``meta['embed_figures']`` is truthy a PNG is also exported
+    beside the .md and linked; it is off by default so the Markdown stays
+    self-contained.
+
+    Args:
+        block: the Figure block; ``fig`` is used when set, otherwise ``make()``
+            is called to build the figure.
+        meta: render options (only ``embed_figures`` is read here).
+        out_path: path of the .md, used to place and name the exported PNG.
+        counter: one-element list holding the document-wide figure counter.
+
+    Returns:
+        The parts joined by blank lines. Any exception is swallowed, so the
+        output then contains only what had been appended before the failure
+        (at least the caption line). The figure is closed in all cases.
+    """
+    caption = model._safe_str(getattr(block, "caption", "")).strip()
+    parts = [f"*Figura: {caption}*" if caption else "*Figura*"]
+    fig = None
+    try:
+        import matplotlib
+        matplotlib.use("Agg")  # defensive: headless rasterization backend.
+        fig = getattr(block, "fig", None)
+        make = getattr(block, "make", None)
+        # Build lazily only when no ready-made figure was attached.
+        if fig is None and callable(make):
+            fig = make()
+        if fig is not None:
+            bars = _extract_bars(fig)
+            if bars:
+                # A histogram's bars are genuine numeric bins (Desde/Hasta/
+                # Frecuencia). Bar charts and the PCA scree plot are not bins —
+                # give them a header that does not lie about "Frecuencia".
+                header = (("Desde", "Hasta", "Frecuencia")
+                          if _is_histogram_caption(caption)
+                          else ("Inicio", "Fin", "Valor"))
+                parts.append(_bars_table(bars, header))
+            if meta.get("embed_figures"):
+                png = _embed_png(fig, out_path, counter)
+                # ``_embed_png`` returns "" on failure: no link in that case.
+                if png:
+                    parts.append(f"![{caption}]({png})")
+    except Exception:  # noqa: BLE001 — a bad figure degrades to just its caption.
+        pass
+    finally:
+        # Always release the figure, whether it was supplied or built here.
+        if fig is not None:
+            try:
+                import matplotlib.pyplot as plt
+                plt.close(fig)
+            except Exception:  # noqa: BLE001
+                pass
+    return "\n\n".join(parts)
+
+
+def _embed_png(fig, out_path: str, counter: list) -> str:
+    """Save ``fig`` as ``<basename>_figN.png`` next to the .md and return that name.
+
+    ``counter[0]`` is incremented first and supplies ``N``. Returns ``""`` if
+    anything fails (the counter keeps its incremented value).
+    """
+    try:
+        counter[0] += 1
+        stem = os.path.splitext(os.path.basename(out_path))[0] or "figura"
+        filename = f"{stem}_fig{counter[0]}.png"
+        target_dir = os.path.dirname(os.path.abspath(out_path))
+        fig.savefig(os.path.join(target_dir, filename),
+                    format="png", dpi=120, bbox_inches="tight")
+    except Exception:  # noqa: BLE001
+        return ""
+    return filename
+
+
+def _md_image(block) -> str:
+    """Serialize an Image block as a Markdown image link.
+
+    The caption doubles as the alt text and, when non-empty, is repeated as an
+    italic line below the link.
+    """
+    src = model._safe_str(getattr(block, "path", ""))
+    label = model._safe_str(getattr(block, "caption", "")).strip()
+    pieces = [f"![{label}]({src})"]
+    if label:
+        pieces.append(f"*{label}*")
+    return "\n\n".join(pieces)
+
+
+def _md_caption(block) -> str:
+    """Serialize a Caption block as one italic line (glossary markers removed)."""
+    body = _clean_terms(getattr(block, "text", "")).strip()
+    return "*" + body + "*"
+
+
+def _md_note(block) -> str:
+    """Serialize a Note block as a Markdown blockquote.
+
+    Each line of the cleaned, stripped text gets a ``> `` prefix; a
+    whitespace-only line becomes a bare ``>`` so the quote stays contiguous.
+    """
+    quoted = []
+    for line in _clean_terms(getattr(block, "text", "")).strip().split("\n"):
+        quoted.append(">" if not line.strip() else f"> {line}")
+    return "\n".join(quoted)
+
+
+def _md_group(block, meta: dict, out_path: str, counter: list) -> str:
+    """Serialize a Group: optional ``###`` title, then its child blocks in order.
+
+    Children are joined by blank lines. A child whose serializer raises, or
+    which renders to an empty string, is left out.
+    """
+    heading = getattr(block, "title", None)
+    pieces: list = [f"### {_clean_terms(heading).strip()}"] if heading else []
+    for child in (getattr(block, "blocks", []) or []):
+        try:
+            rendered = _serialize_block(child, meta, out_path, counter)
+        except Exception:  # noqa: BLE001
+            continue
+        if rendered:
+            pieces.append(rendered)
+    return "\n\n".join(pieces)
+
+
+def _md_glossary_entry(block) -> str:
+    """Serialize a GlossaryEntry as a ``###`` heading plus its definition.
+
+    The heading uses the entry's label, or its key when the label is blank; the
+    definition paragraph is added only when non-empty.
+    """
+    label = model._safe_str(getattr(block, "label", "")).strip()
+    if not label:
+        label = model._safe_str(getattr(block, "key", "")).strip()
+    definition = _clean_terms(getattr(block, "definition", "")).strip()
+    heading = f"### {label}"
+    return f"{heading}\n\n{definition}" if definition else heading
+
+
+def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
+    """Route one block to the serializer matching its ``kind``.
+
+    A block of unrecognised kind is stringified and emitted as a note, so
+    unexpected content stays readable instead of failing.
+    """
+    kind = getattr(block, "kind", "")
+    # Figures and groups need the rendering context; every other serializer
+    # takes the block alone.
+    if kind == "figure":
+        return _md_figure(block, meta, out_path, counter)
+    if kind == "group":
+        return _md_group(block, meta, out_path, counter)
+    block_only = (
+        ("heading", _md_heading),
+        ("markdown", _md_markdown),
+        ("kv_table", _md_kv_table),
+        ("data_table", _md_data_table),
+        ("image", _md_image),
+        ("caption", _md_caption),
+        ("note", _md_note),
+        ("glossary_entry", _md_glossary_entry),
+    )
+    for name, serializer in block_only:
+        if kind == name:
+            return serializer(block)
+    # Unknown content -> readable note (mirrors the model's defensive coercion).
+    return _md_note(model.Note(text=model._safe_str(block)))
+
+
+# --------------------------------------------------------------------------- #
+# Profile appendix — the data the human-facing chapters drop.
+#
+# The chapter document (shared with the PDF/PPTX renderers) is designed for human
+# reading and intentionally omits raw numbers: the correlation matrix shows only
+# the top extremes, the numeric blocks skip skew/kurtosis/extended percentiles,
+# the model chapter does not list ``scores_by_k`` or the normality test
+# statistics. But the Markdown is meant to be *pasted into an LLM*, so it should
+# carry EVERYTHING the engine computed. This appendix serializes the full
+# ``profile`` (passed via ``meta['profile']``) as Markdown tables, additively:
+# the PDF/PPTX are untouched, the .md simply has more than they do. Each section
+# is emitted only when its source data is present, so a ``lite`` profile (no
+# models) or a profile without correlations degrades cleanly instead of raising.
+# See report 2053 for the six losses this closes.
+# --------------------------------------------------------------------------- #
+def _pair_types(a_type, b_type) -> str:
+    """Compact ``num↔cat``-style label for the variable types of a pair.
+
+    A type starting with "num" / "cat" (case-insensitive) is abbreviated to that
+    prefix; any other type is shown lower-cased, and an empty one as ``?``.
+    """
+    def abbreviate(raw):
+        lowered = model._safe_str(raw).lower()
+        for prefix in ("num", "cat"):
+            if lowered.startswith(prefix):
+                return prefix
+        return lowered or "?"
+    return "↔".join((abbreviate(a_type), abbreviate(b_type)))
+
+
+def _app_correlations(corr: dict) -> str:
+    """Loss #1 — every association pair (not just the top extremes).
+
+    Dumps all dict-shaped entries of ``correlations['pairs']`` as a table (pair ·
+    types · method · value · p · p-FDR · significant), ordered by |value| desc so
+    the strongest associations lead while nothing is cut. Pairs whose value is
+    missing, non-numeric or NaN sort as 0. Includes the ``correlation_ratio``
+    (num↔cat) and ``cramers_v`` (cat↔cat) pairs the human chapter never shows.
+
+    Returns ``""`` when there is no renderable pair. A closing italic line gives
+    the pair count and, when both are available, the number of rejected tests out
+    of the total after the multiple-testing correction.
+    """
+    # Only dict-shaped pairs can be rendered; skipping the rest keeps one
+    # malformed entry from discarding the whole table.
+    pairs = [p for p in (corr.get("pairs", []) or []) if isinstance(p, dict)]
+    if not pairs:
+        return ""
+
+    def keyfn(p):
+        try:
+            magnitude = abs(float(p.get("value")))
+        except Exception:  # noqa: BLE001
+            return 0.0
+        # NaN never compares, so a NaN key would leave the order undefined.
+        return -magnitude if magnitude == magnitude else 0.0
+
+    pairs_sorted = sorted(pairs, key=keyfn)
+    lines = ["### Matriz de asociación — todos los pares",
+             "",
+             ("| Par | Tipos | Método | Valor | p-value | p-ajustado (FDR) "
+              "| ¿Sig? |"),
+             "| --- | --- | --- | --- | --- | --- | --- |"]
+    for p in pairs_sorted:
+        par = f"{_cell(p.get('a'))} ↔ {_cell(p.get('b'))}"
+        types = _pair_types(p.get("a_type"), p.get("b_type"))
+        method = _cell(p.get("method"))
+        val = _fmt_num(p.get("value"))
+        pv = _fmt_num(p.get("p_value")) if p.get("p_value") is not None else ""
+        padj = (_fmt_num(p.get("p_value_adjusted"))
+                if p.get("p_value_adjusted") is not None else "")
+        sig = "sí" if p.get("significant") else "no"
+        lines.append(
+            f"| {par} | {types} | {method} | {val} | {pv} | {padj} | {sig} |")
+    mt = corr.get("multiple_testing")
+    if not isinstance(mt, dict):
+        mt = {}  # absent or oddly shaped: fall back to the top-level n_tests.
+    n_tests = mt.get("n_tests", corr.get("n_tests"))
+    n_rej = mt.get("n_rejected")
+    note_bits = [f"{len(pairs)} pares en total"]
+    if n_tests is not None and n_rej is not None:
+        note_bits.append(
+            f"{n_rej} de {n_tests} significativos tras corrección "
+            f"{model._safe_str(mt.get('method', 'FDR')).upper()}")
+    lines.append("")
+    lines.append(f"*{'; '.join(note_bits)}.*")
+    return "\n".join(lines)
+
+
+# Numeric statistics, in serialization order: (profile key, column header).
+# ``_app_numeric_describe`` reads each key from a column's ``numeric`` dict and
+# emits one table column per entry, after a leading "Columna" column.
+# ``distribution_type`` is the only entry rendered as text instead of a number.
+_NUM_STATS = [
+    ("count", "n"), ("mean", "mean"), ("median", "median"), ("mode", "mode"),
+    ("std", "std"), ("variance", "variance"), ("cv", "cv"),
+    ("skew", "skew"), ("kurtosis", "kurtosis"),
+    ("min", "min"), ("p1", "p1"), ("p5", "p5"), ("p25", "p25"), ("p50", "p50"),
+    ("p75", "p75"), ("p95", "p95"), ("p99", "p99"), ("iqr", "iqr"),
+    ("max", "max"), ("n_outliers", "outliers"),
+    ("distribution_type", "distribución"),
+]
+
+
+def _app_numeric_describe(columns: list) -> str:
+    """Loss #2 — every numeric statistic for every numeric column.
+
+    One row per column that carries a non-empty ``numeric`` dict, with the full
+    describe in ``_NUM_STATS`` order: mean/median/mode/std/variance/cv, skew &
+    kurtosis (for ALL columns, not only the skewed ones), p1/p5/p25/p50/p75/p95/
+    p99, iqr, min/max, outliers and distribution_type. ``count`` falls back to
+    the column-level ``count`` when the numeric dict has no such key. A missing
+    statistic renders as an empty cell.
+
+    Returns ``""`` when no column qualifies.
+    """
+    rows = []
+    for info in (columns or []):
+        num = info.get("numeric") if isinstance(info, dict) else None
+        # A non-dict ``numeric`` cannot be read: skip the column rather than
+        # raise and lose the table.
+        if not isinstance(num, dict) or not num:
+            continue
+        cells = [_cell(info.get("name"))]
+        for key, _hdr in _NUM_STATS:
+            if key == "count":
+                v = num.get("count", info.get("count"))
+            else:
+                v = num.get(key)
+            if v is None:
+                cells.append("")
+            elif key == "distribution_type":
+                cells.append(_cell(v))
+            else:
+                # ``_fmt_num`` passes non-numeric values through as text, so
+                # escape the result to keep a stray pipe/newline in its cell.
+                cells.append(_cell(_fmt_num(v)))
+        rows.append(cells)
+    if not rows:
+        return ""
+    header = ["Columna"] + [hdr for _k, hdr in _NUM_STATS]
+    lines = ["### Estadísticos numéricos completos (describe)",
+             "",
+             "| " + " | ".join(header) + " |",
+             "| " + " | ".join(["---"] * len(header)) + " |"]
+    for cells in rows:
+        lines.append("| " + " | ".join(cells) + " |")
+    return "\n".join(lines)
+
+
+def _app_reexpression(columns: list) -> str:
+    """Loss #3 — the specific re-expression recommended for each column.
+
+    Lists, per column with a non-empty ``reexpression['recommended']``, the named
+    transform together with its ladder power, the reason given and the
+    alternative transforms (``—`` when there are none). Returns ``""`` when no
+    column has a recommendation.
+    """
+    table_rows = []
+    for info in (columns or []):
+        if not isinstance(info, dict):
+            continue
+        rx = info.get("reexpression")
+        if not (rx and isinstance(rx, dict)):
+            continue
+        recommended = model._safe_str(rx.get("recommended")).strip()
+        if not recommended:
+            continue
+        named = [model._safe_str(alt.get("transform"))
+                 for alt in (rx.get("alternatives") or [])
+                 if isinstance(alt, dict) and alt.get("transform")]
+        power = rx.get("ladder_power")
+        table_rows.append("| " + " | ".join([
+            _cell(info.get("name")),
+            _cell(recommended),
+            "" if power is None else _fmt_num(power),
+            _cell(rx.get("reason")),
+            _cell(", ".join(named) or "—"),
+        ]) + " |")
+    if not table_rows:
+        return ""
+    head = ["### Re-expresión recomendada (escalera de Tukey)",
+            "",
+            "| Columna | Recomendada | Potencia | Razón | Alternativas |",
+            "| --- | --- | --- | --- | --- |"]
+    return "\n".join(head + table_rows)
+
+
+def _app_kmeans_scores(kmeans: dict) -> str:
+    """Loss #4 — KMeans silhouette + inertia per k (justifies the chosen k).
+
+    One row per dict entry of ``kmeans['scores_by_k']``; the row whose ``k``
+    equals ``kmeans['best_k']`` is ticked. Returns ``""`` when there is no usable
+    entry, so a header-only table is never emitted.
+    """
+    # Filter up front: if every entry is malformed the section is omitted, the
+    # same way ``_app_normality`` omits a table without rows.
+    scores = [s for s in (kmeans.get("scores_by_k", []) or [])
+              if isinstance(s, dict)]
+    if not scores:
+        return ""
+    best_k = kmeans.get("best_k")
+    lines = ["#### KMeans — selección de k (`scores_by_k`)",
+             "",
+             "| k | Silhouette | Inercia | Elegido |",
+             "| --- | --- | --- | --- |"]
+    for s in scores:
+        k = s.get("k")
+        chosen = "✓" if best_k is not None and k == best_k else ""
+        lines.append(
+            f"| {_fmt_num(k)} | {_fmt_num(s.get('silhouette'))} "
+            f"| {_fmt_num(s.get('inertia'))} | {chosen} |")
+    return "\n".join(lines)
+
+
+def _app_normality(normality: dict) -> str:
+    """Loss #5 — the statistic of every normality test beside its p-value.
+
+    One row per column whose result is a dict: sample size, then ``stat`` and
+    ``p`` for Jarque-Bera, D'Agostino and Shapiro, then the ``is_normal``
+    verdict. Returns ``""`` for a missing/empty mapping or when no row qualifies.
+    """
+    if not (isinstance(normality, dict) and normality):
+        return ""
+    body = []
+    for col, res in normality.items():
+        if not isinstance(res, dict):
+            continue
+        n = res.get("n")
+        cells = [_cell(col), "" if n is None else _fmt_num(n)]
+        for test_key in ("jarque_bera", "dagostino", "shapiro"):
+            outcome = res.get(test_key) or {}
+            cells.append(_fmt_num(outcome.get("stat")))
+            cells.append(_fmt_num(outcome.get("p")))
+        cells.append("sí" if res.get("is_normal") else "no")
+        body.append("| " + " | ".join(cells) + " |")
+    if not body:
+        return ""
+    head = ["#### Tests de normalidad (estadístico + p-value)",
+            "",
+            ("| Columna | n | JB stat | JB p | D'Agostino stat | D'Agostino p "
+             "| Shapiro stat | Shapiro p | ¿Normal? |"),
+            "| --- | --- | --- | --- | --- | --- | --- | --- | --- |"]
+    return "\n".join(head + body)
+
+
+def _profile_appendix(profile: dict) -> str:
+    """Build the full-data appendix from a TableProfile dict (additive).
+
+    Returns a Markdown ``## Apéndice`` section with one sub-table per loss the
+    human chapters drop, or ``""`` when the profile is not a dict or carries none
+    of them. Never raises: every section builder runs in isolation, so a
+    missing or oddly-shaped section is skipped without affecting the others.
+    """
+    if not isinstance(profile, dict):
+        return ""
+
+    def attempt(builder, data) -> str:
+        # Run one section builder on its own: a failure yields "" and so cannot
+        # take a sibling section down with it.
+        try:
+            return builder(data) or ""
+        except Exception:  # noqa: BLE001
+            return ""
+
+    sections: list = []
+    corr = profile.get("correlations") or {}
+    if isinstance(corr, dict):
+        sections.append(attempt(_app_correlations, corr))
+
+    columns = profile.get("columns") or []
+    sections.append(attempt(_app_numeric_describe, columns))
+    sections.append(attempt(_app_reexpression, columns))
+
+    models = profile.get("models") or {}
+    if isinstance(models, dict):
+        model_segs = [seg for seg in (
+            attempt(_app_kmeans_scores, models.get("kmeans") or {}),
+            attempt(_app_normality, models.get("normality") or {}),
+        ) if seg]
+        if model_segs:
+            sections.append(
+                "### Modelos — detalle\n\n" + "\n\n".join(model_segs))
+
+    sections = [seg for seg in sections if seg]
+    if not sections:
+        return ""
+    intro = ("Volcado completo de los datos que el motor computó y que los "
+             "capítulos (pensados para lectura humana / PDF) resumen. "
+             "Pensado para que un LLM reconstruya el análisis entero.")
+    return ("## Apéndice — Datos completos del perfil\n\n"
+            f"*{intro}*\n\n" + "\n\n".join(sections))
+
+
+# --------------------------------------------------------------------------- #
+# Entry point.
+# --------------------------------------------------------------------------- #
+def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
+    """Serialize a list of Chapters into a single self-contained Markdown file.
+
+    The output leads with ``# <title>``, a metadata blockquote and a numbered
+    ``## Índice`` linking each chapter, then one ``## N. <title>`` section per
+    chapter with its blocks. Tables become Markdown tables (every row dumped),
+    figures become caption + underlying data table, glossary markers are stripped
+    while ``**bold**`` is kept. When ``meta['profile']`` is supplied, a closing
+    "Apéndice — Datos completos del perfil" section is appended after the
+    chapters. Designed to be pasted into an LLM.
+
+    Args:
+        chapters: a list of ``Chapter`` (dataclasses or dicts); normalized
+            defensively with ``model.as_chapters``. When the normalized list is
+            empty a minimal placeholder document is written and the profile
+            appendix is not emitted.
+        out_path: filesystem path for the ``.md`` (parent dirs are created).
+        meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
+            ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``;
+            the same keys are also read from the top level of ``meta`` as a
+            fallback), ``generated_at``, ``embed_figures`` (export PNGs beside
+            the .md, default False) and ``profile`` (the profile dict serialized
+            by ``_profile_appendix``).
+
+    Returns:
+        dict (never raises): ``{path: str|None, n_chars: int,
+        chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is
+        None and ``note`` explains why. On success ``note`` holds the character
+        count plus one entry per degraded block or omitted appendix.
+    """
+    meta = meta or {}
+    chapters = model.as_chapters(chapters)
+    title = model._safe_str(meta.get("title")) or model.ENGINE_NAME
+
+    # Edge: nothing to render -> a minimal but valid Markdown document.
+    if not chapters:
+        content = (f"# {title}\n\n"
+                   "*(documento vacío — sin capítulos aplicables)*\n")
+        return _write(out_path, content, [], "documento vacío")
+
+    counter = [0]  # document-wide figure counter for unique PNG names.
+    notes: list = []  # human-readable degradation messages for the result note.
+    segments: list = [f"# {title}"]  # top-level pieces, joined by blank lines.
+
+    meta_lines = _meta_block(meta)
+    if meta_lines:
+        segments.append("\n".join(f"> {ln}" for ln in meta_lines))
+
+    # Numbered index. The anchor matches the chapter heading emitted below
+    # (``## N. <title>``) in GitHub slug style.
+    chap_heads = []
+    idx_lines = ["## Índice"]
+    for i, ch in enumerate(chapters, 1):
+        head_text = f"{i}. {model._safe_str(ch.title)}"
+        anchor = _slug(head_text)
+        chap_heads.append((head_text, anchor))
+        idx_lines.append(f"{i}. [{model._safe_str(ch.title)}](#{anchor})")
+    segments.append("\n".join(idx_lines))
+
+    chapters_meta = []
+    for i, ch in enumerate(chapters, 1):
+        segments.append("---")  # horizontal rule between chapters.
+        head_text, _anchor = chap_heads[i - 1]
+        segments.append(f"## {head_text}")
+
+        blocks = list(ch.blocks or [])
+        # Omit a leading level-1 Heading that just repeats the chapter title.
+        if blocks:
+            b0 = blocks[0]
+            if (getattr(b0, "kind", "") == "heading"
+                    and int(getattr(b0, "level", 1) or 1) == 1
+                    and _clean_terms(getattr(b0, "text", "")).strip()
+                    == model._safe_str(ch.title).strip()):
+                blocks = blocks[1:]
+
+        for block in blocks:
+            try:
+                seg = _serialize_block(block, meta, out_path, counter)
+            except Exception as e:  # noqa: BLE001
+                # Degrade the failing block to a note and record why.
+                seg = _md_note(model.Note(text=model._safe_str(block)))
+                notes.append(
+                    f"bloque '{getattr(block, 'kind', '?')}' del capítulo "
+                    f"'{ch.id}' degradado: {e}")
+            if seg:
+                segments.append(seg)
+        chapters_meta.append({"id": ch.id, "version": ch.version})
+
+    # Full-data appendix: dump everything the profile holds that the human
+    # chapters drop (additive — the .md ends up with more than the PDF/PPTX).
+    # Emitted only when a profile is supplied via meta['profile']; never fatal.
+    try:
+        appendix = _profile_appendix(meta.get("profile"))
+    except Exception as e:  # noqa: BLE001
+        appendix = ""
+        notes.append(f"apéndice de perfil omitido: {e}")
+    if appendix:
+        segments.append("---")
+        segments.append(appendix)
+
+    content = "\n\n".join(segments) + "\n"
+    note = f"{len(content)} caracteres"
+    if notes:
+        note += " · " + "; ".join(notes)
+    return _write(out_path, content, chapters_meta, note)
+
+
+def _write(out_path: str, content: str, chapters_meta: list, note: str) -> dict:
+    """Persist the Markdown as UTF-8, creating parent directories first.
+
+    Never raises: on success the result dict carries ``out_path``, the character
+    count, ``chapters_meta`` and ``note``; on any failure ``path`` is None, the
+    count is 0, ``chapters`` is empty and ``note`` reports the error.
+    """
+    try:
+        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
+        with open(out_path, "w", encoding="utf-8") as handle:
+            handle.write(content)
+    except Exception as exc:  # noqa: BLE001 — never raise from the writer.
+        failure = f"no se pudo escribir el Markdown: {exc}"
+        return dict(path=None, n_chars=0, chapters=[], note=failure)
+    return dict(path=out_path, n_chars=len(content),
+                chapters=chapters_meta, note=note)
@@ -675,6 +675,61 @@ def _measure_figure_like(block) -> float:
    return target_h + 0.04 + cap_h + _GAP


+def _measure_kv_table(block) -> float:
+    """Faithful height of a KVTable — matches ``_place_kv_table``.
+
+    Counts the optional title heading and, per row, the wrapped VALUE column
+    (the label column never wraps in the placer). The previous estimate assumed
+    one line per row and ignored the title, so a column's keep-together Group
+    under-budgeted the figure and the chart spilled to the next page. Keep this in
+    sync with ``_place_kv_table``."""
+    h = 0.0
+    title = getattr(block, "title", None)
+    if title:
+        h += _measure_heading_text(title, 2)
+    rows = getattr(block, "rows", []) or []
+    key_w = 1.9
+    val_chars = tl.chars_per_line(_USABLE_W - key_w - 0.1, _FS_BODY)
+    lh = tl.line_height_in(_FS_BODY)
+    for row in rows:
+        try:
+            value = row[1]
+        except Exception:  # noqa: BLE001
+            value = ""
+        v_lines = tl.wrap(model._safe_str(value), val_chars)
+        h += lh * len(v_lines) + _ROW_VPAD
+    return h + _GAP
+
+
+def _measure_data_table(block) -> float:
+    """Faithful height of a DataTable — matches ``_place_data_table``.
+
+    Counts the optional title heading, the wrapped header row, every wrapped data
+    row (per-column wrap via the same ``_col_widths``/``_wrap_row`` the placer
+    uses) and the optional note. Keep this in sync with ``_place_data_table``."""
+    h = 0.0
+    title = getattr(block, "title", None)
+    if title:
+        h += _measure_heading_text(title, 2)
+    header = list(getattr(block, "header", []) or [])
+    rows = list(getattr(block, "rows", []) or [])
+    fs = _FS_CELL
+    widths = _col_widths(header, rows, fs)
+    lh = tl.line_height_in(fs)
+    if header:
+        header_lines = _wrap_row(header, widths, fs)
+        h += lh * max((len(c) for c in header_lines), default=1) + _ROW_VPAD * 2
+    for r in rows:
+        cells_lines = _wrap_row(r, widths, fs)
+        h += lh * max((len(c) for c in cells_lines), default=1) + _ROW_VPAD * 2
+    note = getattr(block, "note", None)
+    if note:
+        nlines = tl.wrap(model._safe_str(note),
+                         tl.chars_per_line(_USABLE_W, _FS_NOTE))
+        h += tl.line_height_in(_FS_NOTE) * len(nlines)
+    return h + _GAP
+
+
 def _measure_block(st: _PdfState, block) -> float:
    kind = getattr(block, "kind", "")
    try:
@@ -690,13 +745,9 @@ def _measure_block(st: _PdfState, block) -> float:
                            tl.chars_per_line(_USABLE_W, _FS_NOTE))
            return tl.line_height_in(_FS_NOTE) * len(lines) + _GAP
        if kind == "kv_table":
-            rows = getattr(block, "rows", []) or []
-            return (tl.line_height_in(_FS_BODY) + _ROW_VPAD) * (len(rows) + 1) \
-                + _GAP
+            return _measure_kv_table(block)
        if kind == "data_table":
-            rows = getattr(block, "rows", []) or []
-            return (tl.line_height_in(_FS_CELL) + _ROW_VPAD * 2) \
-                * (len(rows) + 1) + _GAP
+            return _measure_data_table(block)
        if kind == "group":
            return sum(_measure_block(st, b)
                       for b in (getattr(block, "blocks", []) or []))
@@ -735,6 +786,10 @@ def _place_group(st: _PdfState, block) -> None:
    blocks = getattr(block, "blocks", []) or []
    if not blocks:
        return
+    # Opt-in page break: start this group on a fresh page unless the current one
+    # is still empty (so a chapter can give each unit its own page).
+    if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
+        _new_page(st)
    avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
    _shrink_group_figures(st, blocks, avail_full)
    total = sum(_measure_block(st, b) for b in blocks)
@@ -625,6 +625,55 @@ def _measure_figure_like(block) -> float:
    return target_h + 0.05 + cap_h + _GAP


+def _measure_kv_table(block) -> float:
+    """Faithful KVTable height — matches ``_place_kv_table`` (rendered as a
+    Campo/Valor data table with wrapped cells). The previous estimate assumed one
+    line per row and ignored the title, so a keep-together Group under-budgeted
+    the figure and the chart spilled to the next slide. Keep in sync."""
+    h = 0.0
+    title = getattr(block, "title", None)
+    if title:
+        h += _measure_heading_text(title, 2)
+    rows = getattr(block, "rows", []) or []
+    data_rows = []
+    for row in rows:
+        try:
+            label, value = row[0], row[1]
+        except Exception:  # noqa: BLE001
+            label, value = str(row), ""
+        data_rows.append([model._safe_str(label), model._safe_str(value)])
+    header = ["Campo", "Valor"]
+    widths = _col_widths(header, data_rows)
+    fs = _FS_CELL
+    h += _row_height_in(header, widths, fs)
+    for r in data_rows:
+        h += _row_height_in(r, widths, fs)
+    return h + _GAP
+
+
+def _measure_data_table(block) -> float:
+    """Faithful DataTable height — matches ``_place_data_table`` (title heading +
+    wrapped header + every wrapped row + optional note). Keep in sync."""
+    h = 0.0
+    title = getattr(block, "title", None)
+    if title:
+        h += _measure_heading_text(title, 2)
+    header = list(getattr(block, "header", []) or [])
+    rows = list(getattr(block, "rows", []) or [])
+    fs = _FS_CELL
+    widths = _col_widths(header, rows)
+    if header:
+        h += _row_height_in(header, widths, fs)
+    for r in rows:
+        h += _row_height_in(r, widths, fs)
+    note = getattr(block, "note", None)
+    if note:
+        nlines = tl.wrap(model._safe_str(note),
+                         tl.chars_per_line(_USABLE_W, _FS_NOTE))
+        h += tl.line_height_in(_FS_NOTE) * len(nlines) + 0.05
+    return h + _GAP
+
+
 def _measure_block(st: _PptxState, block) -> float:
    kind = getattr(block, "kind", "")
    try:
@@ -639,9 +688,10 @@ def _measure_block(st: _PptxState, block) -> float:
            lines = tl.wrap(getattr(block, "text", ""),
                            tl.chars_per_line(_USABLE_W, _FS_NOTE))
            return tl.line_height_in(_FS_NOTE) * len(lines) + 0.05 + _GAP
-        if kind in ("kv_table", "data_table"):
-            rows = getattr(block, "rows", []) or []
-            return (tl.line_height_in(_FS_CELL) + 0.10) * (len(rows) + 1) + _GAP
+        if kind == "kv_table":
+            return _measure_kv_table(block)
+        if kind == "data_table":
+            return _measure_data_table(block)
        if kind == "group":
            return sum(_measure_block(st, b)
                       for b in (getattr(block, "blocks", []) or []))
@@ -664,10 +714,14 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
                   if getattr(b, "kind", "") not in ("figure", "image"))
    fig_overhead = tl.line_height_in(_FS_NOTE) + 0.05 + 0.05 + _GAP
    budget = avail_full - nonfig_h - 0.10 * len(fig_blocks)
-    if budget <= 1.0:
+    # Low thresholds: a 16:9 slide is short, so a content-heavy column (cardinality
+    # table + top-k + chart) only fits if the chart is allowed to shrink small.
+    # Prefer a small-but-present chart on the SAME slide over splitting the column
+    # across slides (matches the PDF renderer's keep-together philosophy).
+    if budget <= 0.6:
        return  # not enough room to keep together; let it flow (degrade).
    per = budget / len(fig_blocks) - fig_overhead
-    if per <= 0.8:
+    if per <= 0.35:
        return
    for fb in fig_blocks:
        cur = getattr(fb, "height_in", None)
@@ -675,12 +729,90 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
                        if isinstance(cur, (int, float)) and cur > 0 else per)


+# Minimum height (inches) reserved for a figure inside a keep-together group on
+# the short 16:9 slide. When a high-cardinality column's table(s) would otherwise
+# leave no room, the data table is trimmed (with an honest note) so the chart
+# stays on the SAME slide next to its table instead of spilling to the next one.
+_GROUP_MIN_FIG_H = 1.3
+
+
+def _trim_data_table_to_budget(block, budget: float):
+    """Return a copy of a DataTable whose rows fit within ``budget`` inches.
+
+    Keeps the title, header, as many leading rows as fit (at least one) and an
+    honest note reporting how many of the original rows are shown. NEVER mutates
+    the original block — the same Chapter blocks are rendered by the PDF renderer,
+    which keeps the full table (an A5 page fits it)."""
+    header = list(getattr(block, "header", []) or [])
+    rows = list(getattr(block, "rows", []) or [])
+    title = getattr(block, "title", None)
+    fs = _FS_CELL
+    widths = _col_widths(header, rows)
+    fixed = 0.0
+    if title:
+        fixed += _measure_heading_text(title, 2)
+    if header:
+        fixed += _row_height_in(header, widths, fs)
+    note_h = tl.line_height_in(_FS_NOTE) + 0.05
+    avail_rows = budget - fixed - note_h - _GAP
+    kept = []
+    used = 0.0
+    for r in rows:
+        rh = _row_height_in(r, widths, fs)
+        if used + rh > avail_rows and kept:
+            break
+        kept.append(r)
+        used += rh
+    if len(kept) >= len(rows):
+        return block  # already fits; keep the original (with its own note).
+    note = (f"top {len(kept)} de {len(rows)} categorías mostradas "
+            "(recortado para caber en el slide; el PDF muestra más)")
+    return model.DataTable(header=header, rows=kept, title=title, note=note)
+
+
+def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list:
+    """Return a slide-fitting copy of a keep-together group's blocks.
+
+    On the short 16:9 slide a high-cardinality column's top-k table plus its
+    chart can overflow. Reserve ``_GROUP_MIN_FIG_H`` for the (later shrunk) figure
+    and trim the data table(s) to what is left, so every column keeps its chart
+    next to its table on ONE slide. No-op when the group has no figure+table pair
+    (e.g. id-like columns already drop the top-k upstream, or it already fits)."""
+    has_fig = any(getattr(b, "kind", "") in ("figure", "image") for b in blocks)
+    tbls = [b for b in blocks if getattr(b, "kind", "") == "data_table"]
+    if not (has_fig and tbls):
+        return blocks
+    fixed_h = sum(_measure_block(st, b) for b in blocks
+                  if getattr(b, "kind", "") not in ("figure", "image",
+                                                    "data_table"))
+    tables_h = sum(_measure_block(st, b) for b in tbls)
+    budget_tables = avail_full - fixed_h - _GROUP_MIN_FIG_H
+    if tables_h <= budget_tables:
+        return blocks  # already fits next to a min-height figure; leave intact.
+    out = []
+    for b in blocks:
+        if getattr(b, "kind", "") != "data_table":
+            out.append(b)
+            continue
+        trimmed = _trim_data_table_to_budget(b, max(budget_tables, 0.8))
+        out.append(trimmed)
+        budget_tables -= _measure_data_table(trimmed)
+    return out
+
+
 def _place_group(st: _PptxState, block) -> None:
    """Render a keep-together Group: move it whole to the next slide if needed."""
    blocks = getattr(block, "blocks", []) or []
    if not blocks:
        return
+    # Opt-in slide break: start this group on a fresh slide unless the current one
+    # is still empty (so a chapter can give each unit its own slide).
+    if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
+        _new_slide(st, cont=True)
    avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
+    # Trim oversized tables first (keeps the chart on the same slide), then shrink
+    # the figure to share the remaining room.
+    blocks = _fit_group_blocks(st, blocks, avail_full)
    _shrink_group_figures(st, blocks, avail_full)
    total = sum(_measure_block(st, b) for b in blocks)
    if total <= avail_full:
@@ -4,10 +4,10 @@ name: column_quality_score
 kind: function
 lang: py
 domain: datascience
-version: "1.0.0"
+version: "2.0.0"
 purity: pure
 signature: "def column_quality_score(col: dict) -> dict"
-description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda, con desglose completeness/validity/consistency y lista de issues legibles. Funcion pura, no muta el input."
+description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda. Combina completeness (0.6) y validity (0.4) con renormalizacion por aplicabilidad; los outliers, columnas constantes e ids NO bajan el score (van a observations). Devuelve desglose por dimension, issues (defectos) y observations (señales analiticas). Funcion pura, no muta el input."
 tags: [eda, data-quality, profiling, scoring, datascience]
 uses_functions: []
 uses_types: []
@@ -17,20 +17,26 @@ error_type: ""
 imports: []
 example: |
  from datascience import column_quality_score
-  col = {"name": "precio", "inferred_type": "float", "null_pct": 0.2,
-         "unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 0.08}}
+  col = {"name": "precio", "inferred_type": "numeric", "null_pct": 0.2,
+         "unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 8.0}}
  column_quality_score(col)
-  # {"score": 86.8, "completeness": 0.8, "validity": 0.92,
-  #  "consistency": 1.0, "issues": ["20% nulos", "8% outliers"]}
+  # {"score": 88.0, "completeness": 0.8, "validity": 1.0,
+  #  "applicable": ["completeness", "validity"], "issues": ["20% nulos"],
+  #  "observations": ["8% de valores atípicos (z-score>3): ..."]}
 tested: true
 tests:
  - "test_clean_column_high_score"
-  - "test_half_null_lowers_completeness_and_score"
-  - "test_constant_column_flags_issue"
+  - "test_weights_60_40_native_type"
+  - "test_outliers_do_not_penalize_score"
+  - "test_nulls_lower_score_more_than_outliers"
+  - "test_validity_from_parse_rate_lowers_score"
+  - "test_validity_from_match_rate"
+  - "test_free_text_renormalizes_to_completeness_only"
+  - "test_all_null_column_scores_zero"
+  - "test_constant_column_scores_full_and_is_observation"
+  - "test_high_cardinality_id_scores_full_and_is_observation"
+  - "test_mostly_null_no_double_counts_validity"
  - "test_empty_dict_does_not_crash"
-  - "test_outliers_penalize_validity"
-  - "test_mostly_null_flag_halves_validity"
-  - "test_high_cardinality_text_flagged_as_id"
  - "test_none_values_treated_defensively"
  - "test_does_not_mutate_input"
 test_file_path: "python/functions/datascience/column_quality_score_test.py"
@@ -38,16 +44,22 @@ file_path: "python/functions/datascience/column_quality_score.py"
 params:
  - name: col
    desc: >
-      ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb).
-      Se leen sus claves de forma defensiva con .get(...) y se toleran valores
-      None. Claves usadas: null_pct (0-1), inferred_type, semantic_type,
-      unique_pct (0-1), flags (list[str], reconoce "constant"/"mostly_null"),
-      numeric ({outlier_pct: 0-1, ...}|None) y match_rate (opcional, 0-1).
+      ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb /
+      profile_table). Se leen sus claves de forma defensiva con .get(...) y se
+      toleran valores None. Claves usadas: null_pct (0-1), n_rows, empty_count
+      (texto), inferred_type, semantic_type, validity_rate (0-1, lo expone
+      profile_table al promocionar texto a numero/fecha), match_rate (0-1),
+      unique_pct (0-1), flags (list[str], reconoce
+      "constant"/"possible_id"/"high_cardinality") y numeric ({outlier_pct: 0-100,
+      skew, ...}|None).
 output: >
-  dict con score (float 0-100, redondeado a 1 decimal), completeness (0-1),
-  validity (0-1), consistency (0-1) e issues (list[str] de descripciones
-  legibles de los problemas detectados). score = round(100 * (0.5*completeness
-  + 0.3*validity + 0.2*consistency), 1).
+  dict con score (float 0-100, 1 decimal), completeness (0-1), validity (0-1 o
+  None si no aplicable), dimensions ({completeness, validity}), applicable
+  (list[str] de dimensiones que entraron en el score), issues (list[str] SOLO de
+  defectos de calidad: nulos, vacios, valores no conformes) y observations
+  (list[str] de señales analiticas que NO bajan el score: outliers, columna
+  constante, posible id, asimetria). score = round(100 * (0.6*completeness +
+  0.4*validity) / pesos_aplicables, 1), renormalizado cuando validity no aplica.
 ---

 ## Ejemplo
@@ -59,51 +71,71 @@ from datascience import column_quality_score
 col = {
    "name": "precio",
    "physical_type": "DOUBLE",
-    "inferred_type": "float",
+    "inferred_type": "numeric",
    "semantic_type": "",
-    "count": 800,
    "n_rows": 1000,
    "null_count": 200,
    "null_pct": 0.20,
    "distinct_count": 400,
    "unique_pct": 0.40,
    "flags": [],
-    "numeric": {"outlier_pct": 0.08},
+    "numeric": {"outlier_pct": 8.0, "skew": 0.3},
    "categorical": None,
    "datetime": None,
 }

 column_quality_score(col)
 # {
-#   "score": 86.8,
-#   "completeness": 0.8,    # 1 - 0.20
-#   "validity": 0.92,       # 1 - min(0.08, 0.3)
-#   "consistency": 1.0,
-#   "issues": ["20% nulos", "8% outliers"],
+#   "score": 88.0,            # 100 * (0.6*0.8 + 0.4*1.0)
+#   "completeness": 0.8,      # 1 - 0.20
+#   "validity": 1.0,          # numerica nativa: el tipo es conforme
+#   "dimensions": {"completeness": 0.8, "validity": 1.0},
+#   "applicable": ["completeness", "validity"],
+#   "issues": ["20% nulos"],                       # SOLO defectos de calidad
+#   "observations": ["8% de valores atípicos (z-score>3): ..."],  # NO bajan score
 # }
 ```

 ## Cuando usarla

 Cuando hayas perfilado una tabla con el grupo `eda` (p.ej.
-`summarize_table_duckdb`) y necesites un numero 0-100 por columna para
-ordenar/priorizar limpieza de datos, pintar semaforos de calidad en un
-dashboard, o decidir que columnas descartar antes de modelar. Es la capa de
-scoring sobre el ColumnProfile crudo: lee el perfil, no toca los datos.
+`summarize_table_duckdb` / `profile_table`) y necesites un numero 0-100 por
+columna para ordenar/priorizar limpieza de datos, pintar semaforos de calidad,
+o decidir que columnas descartar antes de modelar. Separa los **defectos de
+calidad reales** (`issues`: nulos, vacios, valores que no parsean a su tipo) de
+las **observaciones analiticas** (`observations`: outliers, columnas constantes,
+ids), que se reportan pero no penalizan. Es la capa de scoring sobre el
+ColumnProfile crudo: lee el perfil, no toca los datos.

-## Notas
+## Gotchas

-Funcion pura, sin I/O ni dependencias externas, no muta `col`. Lee todas las
-claves con `.get(...)` y tolera que vengan en `None` (un ColumnProfile recien
-salido de `summarize_table_duckdb` trae muchas claves a `None`), por lo que
-nunca falla por claves ausentes — un `{}` produce un resultado bien definido.
+Funcion pura, sin I/O, no muta `col`. Aun asi conviene saber:

-Pesos del score: completeness 0.5, validity 0.3, consistency 0.2.
+- **Los outliers NO bajan el score.** Un valor extremo puede ser real y correcto
+  (un cliente que compra mucho); detectar atipicos es analisis de la
+  distribucion, no un juicio de correccion. Salen en `observations`, no en
+  `issues`. Mismo trato para columnas constantes e identificadores de alta
+  cardinalidad: son observaciones, no defectos.
+- **`validity` puede ser `None`** (no aplicable): texto sin `validity_rate` ni el
+  par `semantic_type` + `match_rate`, o columna 100% nula. En ese caso el score se
+  renormaliza a solo `completeness` (no se premia ni castiga algo no medible).
+- **`outlier_pct` se interpreta en escala 0-100** (la que emite
+  `describe_numeric`, z-score>3). Pasar una fraccion 0-1 produce un texto de
+  observacion con el % equivocado, pero NUNCA afecta al score.
+- **`validity_rate` lo puebla `profile_table`** al promocionar una columna de
+  texto a numero/fecha (fraccion que parsea). Si no esta presente y el tipo es
+  nativo numerico/fecha/bool, `validity = 1.0`.
+- Sin doble conteo: la falta de datos cuenta solo en `completeness` (el antiguo
+  castigo de `mostly_null` sobre `validity` se elimino).

- **completeness** = `1 - null_pct` (None -> 0 nulls -> 1.0).
- **validity**: parte de 1.0 y penaliza `min(outlier_pct, 0.3)` en columnas
-  numericas, `0.5 * (1 - match_rate)` si hay `semantic_type` declarado con
-  `match_rate` bajo disponible, y multiplica por 0.5 si el flag `mostly_null`
-  esta presente.
- **consistency**: 1.0 salvo flag `constant` (-> 0.3, columna poco informativa)
-  o texto con `unique_pct > 0.9` (-> 0.6, posible id de alta cardinalidad).
+## Capability growth log
+
+- v2.0.0 (2026-06-30) — nueva formula de calidad (report 2046): pesos 60/40
+  (completeness/validity) con renormalizacion por aplicabilidad; se elimina la
+  dimension `consistency`-como-informatividad y el doble castigo de
+  `mostly_null`; los outliers/constantes/ids salen del score a `observations`;
+  validity mide conformidad real (parse rate / match rate / tipo nativo). Salida
+  ampliada con `dimensions`, `applicable` y `observations`.
+- v1.0.0 — version inicial: pesos 50/30/20 (completeness/validity/consistency),
+  los outliers penalizaban validity (con bug de escala) y consistency penalizaba
+  informatividad.
@@ -1,34 +1,78 @@
 """Score de calidad de datos (0-100) para un ColumnProfile del grupo eda.

 Funcion pura: dado el perfil de una columna producido por el grupo de
-capacidad `eda` (p.ej. summarize_table_duckdb), calcula un score agregado
-de calidad junto a su desglose en completeness / validity / consistency y
-una lista de issues legibles. No realiza I/O ni muta el input.
+capacidad `eda` (p.ej. summarize_table_duckdb / profile_table), calcula un
+score agregado de calidad junto a su desglose por dimension y dos listas
+legibles separadas: `issues` (defectos de calidad reales que SI bajan el
+score) y `observations` (señales analiticas que NO bajan el score). No
+realiza I/O ni muta el input.
+
+Modelo (DAMA-DMBOK / ISO 8000), ver report 2046:
+
+- Solo entran en el score las dimensiones medibles automaticamente desde el
+  perfil, sin fuente externa de verdad: completeness y validity por columna.
+- Renormalizacion por aplicabilidad: si una dimension no es medible en la
+  columna (texto libre sin semantica -> validity no aplica; columna 100% nula
+  -> validity no medible), se excluye y los pesos se renormalizan sobre las
+  aplicables. Una columna ni se premia ni se castiga por algo no medible.
+- Sin doble conteo: la falta de datos cuenta solo en completeness (se elimino
+  el antiguo castigo extra de `mostly_null` sobre validity).
+- Los OUTLIERS NO bajan la calidad. Un valor extremo puede ser real y
+  correcto; detectar atipicos es analisis de la distribucion, no un juicio de
+  correccion. Outliers, columnas constantes e identificadores de alta
+  cardinalidad pasan a `observations`, nunca a `issues`.
 """


+# Pesos base de las dimensiones de columna (se renormalizan por aplicabilidad).
+_W_COMPLETENESS = 0.6
+_W_VALIDITY = 0.4
+
+# Tipos inferidos cuyo almacen garantiza la conformidad de tipo (validity=1.0)
+# cuando NO vienen de una promocion de texto (en cuyo caso manda validity_rate).
+_NATIVE_TYPED = ("numeric", "integer", "float", "datetime", "date", "boolean", "bool")
+
+
 def column_quality_score(col: dict) -> dict:
    """Calcula un score de calidad de datos 0-100 para un ColumnProfile.

-    El score pondera tres dimensiones:
-      - completeness (0.5): proporcion de valores no nulos.
-      - validity     (0.3): ausencia de outliers / heuristicas de validez.
-      - consistency  (0.2): la columna aporta informacion (no constante, no ruido).
+    El score combina solo dimensiones de calidad medibles desde el perfil, con
+    renormalizacion por aplicabilidad:
+
+      - completeness (peso base 0.6, siempre aplica): proporcion de valores
+        presentes = 1 - null_pct. En texto, las celdas vacias (`empty_count`)
+        tambien cuentan como faltantes.
+      - validity (peso base 0.4, cuando hay un criterio de validacion real):
+        fraccion de valores no nulos conformes a su tipo/semantica. Tipo nativo
+        numerico/fecha/bool = 1.0; texto promovido a numero/fecha = parse rate
+        (`validity_rate`); texto con `semantic_type` regexable = `match_rate`;
+        texto libre o columna 100% nula = NO aplicable (renormaliza a solo
+        completeness).
+
+    Los outliers, columnas constantes, identificadores y asimetria fuerte NO
+    bajan el score: se devuelven en `observations`.

    Args:
        col: ColumnProfile dict del grupo eda. Se leen las claves de forma
            defensiva con .get(...) y se tolera que muchas vengan en None.
-            Claves relevantes: null_pct, inferred_type, semantic_type,
-            unique_pct, flags (list[str]), numeric ({outlier_pct, ...}|None),
-            match_rate (opcional).
+            Claves relevantes: null_pct (0-1), n_rows, empty_count,
+            inferred_type, semantic_type, validity_rate (0-1, lo expone
+            profile_table al promocionar texto a numero/fecha), match_rate
+            (0-1), unique_pct (0-1), flags (list[str], reconoce
+            "constant"/"possible_id"/"high_cardinality"), numeric
+            ({outlier_pct: 0-100, skew, ...}|None).

    Returns:
        dict con:
-          score        (float, 0-100, redondeado a 1 decimal),
-          completeness (float, 0-1),
-          validity     (float, 0-1),
-          consistency  (float, 0-1),
-          issues       (list[str]) descripciones legibles de los problemas.
+          score        (float 0-100, redondeado a 1 decimal),
+          completeness (float 0-1),
+          validity     (float 0-1 | None si no aplicable),
+          dimensions   ({completeness, validity}),
+          applicable   (list[str] de dimensiones que entraron en el score),
+          issues       (list[str]) SOLO defectos de calidad (nulos, vacios,
+                       valores no conformes a su tipo/semantica),
+          observations (list[str]) señales analiticas que NO bajan el score
+                       (outliers, columna constante, posible id, asimetria).
    """
    if not isinstance(col, dict):
        col = {}
@@ -39,103 +83,153 @@ def column_quality_score(col: dict) -> dict:
    flags = set(flags)

    issues: list[str] = []
+    observations: list[str] = []
+
+    inferred_type = col.get("inferred_type") or ""
+    semantic_type = col.get("semantic_type") or ""

    # --- completeness -------------------------------------------------
-    null_pct = col.get("null_pct")
-    if null_pct is None:
-        null_pct = 0.0
-    try:
-        null_pct = float(null_pct)
-    except (TypeError, ValueError):
-        null_pct = 0.0
-    null_pct = _clamp(null_pct, 0.0, 1.0)
+    # Falta de datos = nulos + (en texto) celdas vacias. Es el unico sitio
+    # donde la falta de datos cuenta: nunca se duplica en validity.
+    null_pct = _clamp(_num(col.get("null_pct"), 0.0), 0.0, 1.0)
    completeness = 1.0 - null_pct
    if null_pct > 0:
-        issues.append(f"{round(null_pct * 100)}% nulos")
+        issues.append(f"{_pct(null_pct)} nulos")

-    # --- validity -----------------------------------------------------
-    validity = 1.0
-    inferred_type = col.get("inferred_type") or ""
+    empty_frac = 0.0
+    n_rows = col.get("n_rows")
+    empty_count = col.get("empty_count")
+    if (
+        isinstance(n_rows, (int, float)) and not isinstance(n_rows, bool) and n_rows > 0
+        and isinstance(empty_count, (int, float)) and not isinstance(empty_count, bool)
+        and empty_count > 0
+    ):
+        empty_frac = _clamp(float(empty_count) / float(n_rows), 0.0, 1.0)
+        completeness = _clamp(completeness - empty_frac, 0.0, 1.0)
+        issues.append(f"{_pct(empty_frac)} vacíos")

-    numeric = col.get("numeric")
-    is_numeric = inferred_type in ("integer", "float", "numeric") or isinstance(numeric, dict)
-    if isinstance(numeric, dict):
-        outlier_pct = numeric.get("outlier_pct")
-        if outlier_pct is not None:
-            try:
-                outlier_pct = float(outlier_pct)
-            except (TypeError, ValueError):
-                outlier_pct = 0.0
-            outlier_pct = _clamp(outlier_pct, 0.0, 1.0)
-            if outlier_pct > 0:
-                penalty = min(outlier_pct, 0.3)
-                validity -= penalty
-                issues.append(f"{round(outlier_pct * 100)}% outliers")
-
-    # semantic_type declarado pero con baja tasa de match (si la conocemos).
-    semantic_type = col.get("semantic_type") or ""
-    match_rate = col.get("match_rate")
-    if semantic_type and match_rate is not None:
-        try:
-            match_rate = float(match_rate)
-        except (TypeError, ValueError):
-            match_rate = None
-        if match_rate is not None:
-            match_rate = _clamp(match_rate, 0.0, 1.0)
-            if match_rate < 1.0:
-                shortfall = 1.0 - match_rate
-                validity -= 0.5 * shortfall
-                issues.append(
-                    f"semantic_type '{semantic_type}' con baja coincidencia "
-                    f"({round(match_rate * 100)}%)"
-                )
-
-    if "mostly_null" in flags:
-        validity *= 0.5
-        issues.append("mayoritariamente nula")
-
-    validity = _clamp(validity, 0.0, 1.0)
-
-    # --- consistency --------------------------------------------------
-    consistency = 1.0
-    if "constant" in flags:
-        consistency = 0.3
-        issues.append("columna constante")
+    # --- validity (con renormalizacion por aplicabilidad) -------------
+    # None = no medible -> se excluye del score (no penaliza ni premia).
+    validity = None
+    if completeness <= 0.0:
+        # Columna 100% faltante: no hay valores no nulos sobre los que medir
+        # conformidad. validity no aplica -> el score sale solo de completeness
+        # (= 0). Es el peor defecto de calidad posible.
+        validity = None
    else:
-        unique_pct = col.get("unique_pct")
-        if unique_pct is not None:
-            try:
-                unique_pct = float(unique_pct)
-            except (TypeError, ValueError):
-                unique_pct = None
-        if (
-            inferred_type == "text"
+        validity_rate = col.get("validity_rate")
+        match_rate = col.get("match_rate")
+        if validity_rate is not None:
+            # Texto promovido a numero/fecha: parse rate real de la muestra.
+            v = _num(validity_rate, None)
+            if v is not None:
+                validity = _clamp(v, 0.0, 1.0)
+                if validity < 1.0:
+                    kind = (
+                        "número" if inferred_type == "numeric"
+                        else "fecha" if inferred_type == "datetime"
+                        else inferred_type or "su tipo"
+                    )
+                    issues.append(
+                        f"{_pct(1.0 - validity)} no parsea al tipo {kind}"
+                    )
+        elif inferred_type in _NATIVE_TYPED:
+            # Tipo nativo garantizado por el almacen: no hay valores que no
+            # parseen. validity = 1.0 (no se confunde con tener outliers).
+            validity = 1.0
+        elif semantic_type and match_rate is not None:
+            v = _num(match_rate, None)
+            if v is not None:
+                validity = _clamp(v, 0.0, 1.0)
+                if validity < 1.0:
+                    issues.append(
+                        f"{_pct(1.0 - validity)} no casa con el "
+                        f"formato «{semantic_type}»"
+                    )
+        else:
+            # Texto libre / categorica sin semantica: no hay criterio honesto
+            # de validez. No aplica.
+            validity = None
+
+    # --- observations (NO bajan el score) -----------------------------
+    numeric = col.get("numeric")
+    if isinstance(numeric, dict):
+        # outlier_pct viene en escala 0-100 desde describe_numeric (z-score>3).
+        outlier_pct = _num(numeric.get("outlier_pct"), None)
+        if outlier_pct is not None and outlier_pct >= 0.05:
+            observations.append(
+                f"{_pct(outlier_pct / 100.0)} de valores atípicos (z-score>3): "
+                "revisar si son errores u observaciones legítimas"
+            )
+        skew = _num(numeric.get("skew"), None)
+        if skew is not None and abs(skew) >= 1.0:
+            observations.append(
+                f"asimetría fuerte (skew={round(skew, 2)}): considerar "
+                "re-expresión antes de modelar"
+            )
+
+    if "constant" in flags:
+        observations.append(
+            "columna constante: aporta poca información para el análisis"
+        )
+
+    unique_pct = _num(col.get("unique_pct"), None)
+    is_id = (
+        "possible_id" in flags
+        or "high_cardinality" in flags
+        or (
+            inferred_type in ("text", "categorical")
            and unique_pct is not None
            and _clamp(unique_pct, 0.0, 1.0) > 0.9
-        ):
-            consistency = 0.6
-            issues.append("posible id de alta cardinalidad")
-
-    consistency = _clamp(consistency, 0.0, 1.0)
-
-    # --- score agregado ----------------------------------------------
-    score = round(
-        100.0 * (0.5 * completeness + 0.3 * validity + 0.2 * consistency),
-        1,
+        )
    )
+    if is_id:
+        observations.append(
+            "valores casi únicos: posible identificador (no es un defecto de calidad)"
+        )

-    # Silencia warnings sobre la variable de tipo no usada.
-    _ = is_numeric
+    # --- score agregado con renormalizacion ---------------------------
+    applicable = ["completeness"]
+    num = _W_COMPLETENESS * completeness
+    den = _W_COMPLETENESS
+    if validity is not None:
+        applicable.append("validity")
+        num += _W_VALIDITY * validity
+        den += _W_VALIDITY
+    score = round(100.0 * num / den, 1) if den > 0 else 0.0

    return {
        "score": score,
        "completeness": completeness,
        "validity": validity,
-        "consistency": consistency,
+        "dimensions": {"completeness": completeness, "validity": validity},
+        "applicable": applicable,
        "issues": issues,
+        "observations": observations,
    }


+def _pct(frac: float) -> str:
+    """Formatea una fraccion 0-1 como porcentaje honesto: «N%» si >=1%, «0.N%»
+    por debajo (para no mostrar «0%» cuando hay un defecto real pequeño)."""
+    p = frac * 100.0
+    if p >= 1.0:
+        return f"{round(p)}%"
+    return f"{p:.1f}%"
+
+
+def _num(x, default):
+    """Convierte x a float; devuelve `default` si es None o no parseable."""
+    if x is None:
+        return default
+    if isinstance(x, bool):
+        return default
+    try:
+        return float(x)
+    except (TypeError, ValueError):
+        return default
+
+
 def _clamp(x: float, lo: float, hi: float) -> float:
    """Recorta x al rango [lo, hi]."""
    if x < lo:
@@ -1,4 +1,12 @@
-"""Tests para column_quality_score."""
+"""Tests para column_quality_score (nueva fórmula, report 2046).
+
+Verifica las invariantes de la fórmula de calidad:
+  - completeness (0.6) + validity (0.4) con renormalización por aplicabilidad.
+  - Los OUTLIERS no bajan el score (van a observations, no a issues).
+  - Columnas constantes e ids no bajan el score (observations).
+  - Sin doble conteo de la falta de datos.
+  - all-null -> score 0; función pura (no muta el input).
+"""

 import os
 import sys
@@ -9,11 +17,11 @@ from column_quality_score import column_quality_score


 def _clean_numeric_col() -> dict:
-    """ColumnProfile de una columna numerica sana, sin problemas."""
+    """ColumnProfile de una columna numérica nativa sana, sin problemas."""
    return {
        "name": "edad",
        "physical_type": "INTEGER",
-        "inferred_type": "integer",
+        "inferred_type": "numeric",
        "semantic_type": "",
        "count": 1000,
        "n_rows": 1000,
@@ -28,85 +36,163 @@ def _clean_numeric_col() -> dict:
    }


+# --------------------------------------------------------------------------- #
+# Golden
+# --------------------------------------------------------------------------- #
 def test_clean_column_high_score():
    out = column_quality_score(_clean_numeric_col())
-    assert out["score"] > 90
+    assert out["score"] == 100.0
    assert out["completeness"] == 1.0
    assert out["validity"] == 1.0
-    assert out["consistency"] == 1.0
+    assert out["applicable"] == ["completeness", "validity"]
    assert out["issues"] == []
+    assert out["observations"] == []


-def test_half_null_lowers_completeness_and_score():
+def test_weights_60_40_native_type():
+    """30% nulos en numérica nativa: score = 100*(0.6*0.7 + 0.4*1.0) = 82."""
    col = _clean_numeric_col()
-    col["null_count"] = 500
-    col["null_pct"] = 0.5
-    clean_score = column_quality_score(_clean_numeric_col())["score"]
+    col["null_pct"] = 0.30
+    col["null_count"] = 300
    out = column_quality_score(col)
-    assert out["completeness"] == 0.5
-    assert out["score"] < clean_score
-    assert any("nulos" in issue for issue in out["issues"])
+    assert out["completeness"] == 0.7
+    assert out["validity"] == 1.0
+    assert out["score"] == 82.0
+    assert any("nulos" in i for i in out["issues"])


-def test_constant_column_flags_issue():
+# --------------------------------------------------------------------------- #
+# Outliers FUERA del score
+# --------------------------------------------------------------------------- #
+def test_outliers_do_not_penalize_score():
+    """Column with outliers but no nulls -> top score; outliers land in observations."""
+    col = _clean_numeric_col()
+    col["numeric"] = {"outlier_pct": 18.0, "skew": 0.2}  # 18% outliers (0-100 scale)
+    out = column_quality_score(col)
+    assert out["score"] == 100.0  # outliers do NOT lower quality
+    assert out["validity"] == 1.0
+    # They are not reported as a quality issue...
+    assert not any("atípic" in i or "outlier" in i for i in out["issues"])
+    # ...but as an analytical observation.
+    assert any("atípic" in o for o in out["observations"])
+
+
+def test_nulls_lower_score_more_than_outliers():
+    """Nulls do penalize the score; outliers do not: compare the two columns."""
+    con_nulos = _clean_numeric_col()
+    con_nulos["null_pct"] = 0.30
+    con_outliers = _clean_numeric_col()
+    con_outliers["numeric"] = {"outlier_pct": 30.0}
+    assert column_quality_score(con_nulos)["score"] < \
+        column_quality_score(con_outliers)["score"]
+
+
+# --------------------------------------------------------------------------- #
+# Validity: aplicabilidad y renormalización
+# --------------------------------------------------------------------------- #
+def test_validity_from_parse_rate_lowers_score():
+    """Numeric stored as text with 20% garbage: validity=0.8 -> score=92."""
+    col = {
+        "name": "precio_txt", "inferred_type": "numeric", "semantic_type": "decimal",
+        "null_pct": 0.0, "validity_rate": 0.80, "flags": [], "numeric": None,
+    }
+    out = column_quality_score(col)
+    assert out["validity"] == 0.8
+    assert out["score"] == 92.0  # 100*(0.6 + 0.4*0.8)
+    assert any("no parsea" in i for i in out["issues"])
+
+
+def test_validity_from_match_rate():
+    """Text with a semantic_type and 5% non-conforming values: validity=0.95."""
+    col = {
+        "name": "email", "inferred_type": "text", "semantic_type": "email",
+        "null_pct": 0.0, "match_rate": 0.95, "unique_pct": 0.5, "flags": [],
+    }
+    out = column_quality_score(col)
+    assert out["validity"] == 0.95
+    assert out["score"] == 98.0  # 100*(0.6 + 0.4*0.95)
+    assert any("no casa" in i for i in out["issues"])
+
+
+def test_free_text_renormalizes_to_completeness_only():
+    """Free text without semantics: validity not applicable -> score = 100*completeness."""
+    col = {
+        "name": "comentario", "inferred_type": "text", "semantic_type": "",
+        "null_pct": 0.30, "unique_pct": 0.5, "flags": [], "numeric": None,
+    }
+    out = column_quality_score(col)
+    assert out["validity"] is None
+    assert out["applicable"] == ["completeness"]
+    assert out["completeness"] == 0.7
+    assert out["score"] == 70.0  # renormalized to completeness only
+
+
+# --------------------------------------------------------------------------- #
+# Casos límite (report §4.6)
+# --------------------------------------------------------------------------- #
+def test_all_null_column_scores_zero():
+    col = _clean_numeric_col()
+    col["null_pct"] = 1.0
+    col["null_count"] = 1000
+    out = column_quality_score(col)
+    assert out["completeness"] == 0.0
+    assert out["validity"] is None  # not measurable without non-null values
+    assert out["score"] == 0.0
+
+
+def test_constant_column_scores_full_and_is_observation():
+    """Columna constante: dato válido y completo -> score 100; baja info = observación."""
    col = _clean_numeric_col()
    col["flags"] = ["constant"]
    col["distinct_count"] = 1
    col["unique_pct"] = 0.001
    out = column_quality_score(col)
-    assert out["consistency"] == 0.3
-    assert any("constante" in issue for issue in out["issues"])
+    assert out["score"] == 100.0  # NO se castiga la baja informatividad
+    assert not any("constante" in i for i in out["issues"])
+    assert any("constante" in o for o in out["observations"])


+def test_high_cardinality_id_scores_full_and_is_observation():
+    """High-cardinality id: perfect uniqueness -> score 100; possible id = observation."""
+    col = {
+        "name": "uuid", "inferred_type": "text", "semantic_type": "",
+        "null_pct": 0.0, "unique_pct": 0.99, "flags": ["possible_id"],
+        "numeric": None,
+    }
+    out = column_quality_score(col)
+    assert out["score"] == 100.0
+    assert not any("identificador" in i for i in out["issues"])
+    assert any("identificador" in o for o in out["observations"])
+
+
+def test_mostly_null_no_double_counts_validity():
+    """85% nulls: only completeness penalizes; native validity stays 1.0 (no double hit)."""
+    col = _clean_numeric_col()
+    col["null_pct"] = 0.85
+    col["flags"] = ["mostly_null"]
+    out = column_quality_score(col)
+    assert out["validity"] == 1.0  # no longer multiplied by 0.5
+    # score = 100*(0.6*0.15 + 0.4*1.0) = 49
+    assert out["score"] == 49.0
+    assert not any("mayoritariamente" in o for o in out["observations"])
+
+
+# --------------------------------------------------------------------------- #
+# Robustez
+# --------------------------------------------------------------------------- #
 def test_empty_dict_does_not_crash():
    out = column_quality_score({})
    assert isinstance(out["score"], float)
    assert out["completeness"] == 1.0
    assert 0.0 <= out["score"] <= 100.0
    assert isinstance(out["issues"], list)
-
-
-def test_outliers_penalize_validity():
-    col = _clean_numeric_col()
-    col["numeric"] = {"outlier_pct": 0.2}
-    out = column_quality_score(col)
-    assert out["validity"] < 1.0
-    assert any("outliers" in issue for issue in out["issues"])
-
-
-def test_mostly_null_flag_halves_validity():
-    col = _clean_numeric_col()
-    col["null_pct"] = 0.85
-    col["flags"] = ["mostly_null"]
-    out = column_quality_score(col)
-    assert out["validity"] == 0.5
-    assert any("mayoritariamente nula" in issue for issue in out["issues"])
-
-
-def test_high_cardinality_text_flagged_as_id():
-    col = {
-        "name": "uuid",
-        "inferred_type": "text",
-        "semantic_type": "",
-        "null_pct": 0.0,
-        "unique_pct": 0.99,
-        "flags": [],
-        "numeric": None,
-    }
-    out = column_quality_score(col)
-    assert out["consistency"] < 1.0
-    assert any("alta cardinalidad" in issue for issue in out["issues"])
+    assert isinstance(out["observations"], list)


 def test_none_values_treated_defensively():
    col = {
-        "name": "x",
-        "inferred_type": None,
-        "semantic_type": None,
-        "null_pct": None,
-        "unique_pct": None,
-        "flags": None,
-        "numeric": None,
+        "name": "x", "inferred_type": None, "semantic_type": None,
+        "null_pct": None, "unique_pct": None, "flags": None, "numeric": None,
    }
    out = column_quality_score(col)
    assert out["completeness"] == 1.0
@@ -0,0 +1,107 @@
+---
+name: detect_declared_keys_duckdb
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def detect_declared_keys_duckdb(db_path: str, table: str = None) -> dict"
+description: "Detecta las claves DECLARADAS (constraints reales) de un schema DuckDB leyendo la table function duckdb_constraints(): extrae PRIMARY KEY, FOREIGN KEY y UNIQUE (ignora NOT NULL y CHECK) y las devuelve normalizadas con sus columnas, y para las FK con su tabla y columnas referenciadas. Con table=None procesa todas las tablas; con table='X' filtra a PK/UNIQUE de X y a FK cuyo origen es X (case-sensitive). A diferencia de infer_fk_containment_duckdb (que INFIERE FKs candidatas por containment de valores cuando el schema no las declara), esta funcion devuelve las relaciones de clave REALES del schema. Estilo dict-no-throw: nunca lanza. Parte del grupo eda (relaciones de clave)."
+tags: [eda, duckdb, datascience, relations, primary-key, foreign-key, schema, exploratory-data-analysis]
+params:
+  - name: db_path
+    desc: "Ruta al archivo DuckDB. Debe existir (lectura read-only via duckdb_query_readonly; no se crea). Un path inexistente devuelve {status:'error', ...}."
+  - name: table
+    desc: "Si se pasa, filtra los resultados a esa tabla: incluye PRIMARY KEY y UNIQUE cuya tabla sea `table`, y FOREIGN KEY cuya tabla ORIGEN sea `table` (no la referenciada). None (default) devuelve los constraints de todas las tablas. La comparacion es case-sensitive (nombres tal cual los devuelve DuckDB)."
+output: "dict dict-no-throw. En exito {status:'ok', primary_keys:[{table:str, columns:[str,...]}, ...], foreign_keys:[{table:str, columns:[str,...], referenced_table:str, referenced_columns:[str,...]}, ...], unique:[{table:str, columns:[str,...]}, ...], tables:[str,...]} donde tables es la lista ordenada de tablas (origen) que poseen al menos un constraint PK/FK/UNIQUE emitido. Solo se emiten constraints de clave: NOT NULL y CHECK se ignoran. En error {status:'error', error:str}."
+uses_functions: [duckdb_query_readonly_py_infra]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+tested: true
+tests: ["test_golden_detecta_pks_y_fk", "test_golden_ignora_not_null_y_check", "test_edge_filtra_por_tabla_orders", "test_edge_filtra_por_tabla_customers", "test_edge_unique_declarado", "test_edge_sin_constraints_listas_vacias", "test_error_db_inexistente_no_lanza", "test_shape_resultado"]
+test_file_path: "python/functions/datascience/detect_declared_keys_duckdb_test.py"
+file_path: "python/functions/datascience/detect_declared_keys_duckdb.py"
+---
+
+## Ejemplo
+
+```python
+import sys, os, duckdb
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience import detect_declared_keys_duckdb
+
+# Base de ejemplo en /tmp: orders.customer_id -> customers.id (FK declarada)
+path = "/tmp/declared_keys_demo.duckdb"
+if os.path.exists(path):
+    os.remove(path)
+con = duckdb.connect(path)
+con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)")
+con.execute(
+    "CREATE TABLE orders("
+    "  id INTEGER PRIMARY KEY,"
+    "  customer_id INTEGER REFERENCES customers(id),"
+    "  amt DOUBLE)"
+)
+con.close()
+
+res = detect_declared_keys_duckdb(path)
+if res["status"] == "ok":
+    for pk in res["primary_keys"]:
+        print(f"PK  {pk['table']}({', '.join(pk['columns'])})")
+    for fk in res["foreign_keys"]:
+        print(f"FK  {fk['table']}({', '.join(fk['columns'])}) -> "
+              f"{fk['referenced_table']}({', '.join(fk['referenced_columns'])})")
+    # PK  customers(id)
+    # PK  orders(id)
+    # FK  orders(customer_id) -> customers(id)
+else:
+    print("error:", res["error"])
+
+# Filtrar a una tabla concreta (PK/UNIQUE de orders + FK con origen orders):
+solo_orders = detect_declared_keys_duckdb(path, table="orders")
+print(solo_orders["tables"])  # ['orders']
+```
+
+## Cuando usarla
+
+- Cuando exploras un esquema DuckDB y quieres mostrar las relaciones de clave REALES (PK/FK/UNIQUE) que el schema ha declarado, sin inferir nada.
+- Como paso del capitulo RELACIONES del grupo `eda`: primero mira las claves declaradas con esta funcion; si el schema no declara FKs, complementa con `infer_fk_containment_duckdb` (inferencia por containment).
+- Antes de documentar o migrar un esquema, para listar el contrato de integridad referencial que el motor ya conoce.
+- Para validar que las constraints que esperas (esa FK que creaste con `REFERENCES`) realmente estan declaradas en la base materializada.
+
+## Gotchas
+
+- **Impura**: lee de disco via la primitiva read-only `duckdb_query_readonly` (no crea ni modifica la base). El `db_path` debe existir; un path inexistente devuelve `{status:'error'}` (read_only NO crea la base).
+- **Requiere `duckdb_constraints()`**: usa la table function `duckdb_constraints()`, disponible en DuckDB modernos (verificado en 1.5.2). En versiones antiguas sin esa funcion, la query falla y se devuelve `{status:'error'}`.
+- **Solo claves DECLARADAS**: devuelve lo que el schema declaro con `PRIMARY KEY` / `FOREIGN KEY (... REFERENCES ...)` / `UNIQUE`. Una tabla materializada con `CREATE TABLE AS SELECT` NO lleva constraints — para esos casos no habra claves que mostrar y hay que INFERIRLAS (`infer_fk_containment_duckdb`).
+- **NOT NULL y CHECK se ignoran**: `duckdb_constraints()` tambien emite filas `NOT NULL` (DuckDB genera una por cada columna PK) y `CHECK`; esta funcion las descarta y solo conserva PK/FK/UNIQUE.
+- **Nombres case-sensitive**: el filtro `table='Orders'` no casa con una tabla `orders`. Se comparan los nombres tal cual los devuelve DuckDB.
+- **FK atribuida al origen**: una FOREIGN KEY se atribuye a su tabla ORIGEN (el `table` de la entrada), no a la referenciada. El filtro `table='X'` trae las FK cuyo origen es X, no las que apuntan a X.
+- **`tables` = tablas dueñas de constraints emitidos**: la lista `tables` contiene solo las tablas que poseen al menos un PK/FK/UNIQUE en el resultado (su campo `table`), ordenadas. No incluye tablas referenciadas que no tengan constraint propio en la salida.
+- **Columnas como listas**: `constraint_column_names` y `referenced_column_names` son columnas LIST de DuckDB; en 1.5.2 llegan como listas Python. La funcion las normaliza a listas de strings con una red de seguridad por si llegaran como string.
+
+## Notas
+
+`duckdb_constraints()` devuelve una fila por constraint con los campos
+`table_name`, `constraint_type`, `constraint_column_names`, `referenced_table`,
+`referenced_column_names`. Mapeo a la salida:
+
+```text
+PRIMARY KEY -> primary_keys[]: {table, columns}
+UNIQUE      -> unique[]:       {table, columns}
+FOREIGN KEY -> foreign_keys[]: {table, columns, referenced_table, referenced_columns}
+NOT NULL    -> ignorado
+CHECK       -> ignorado
+```
+
+Para una FK, `referenced_table` y `referenced_column_names` vienen poblados; para
+PK/UNIQUE, `referenced_table` es NULL y `referenced_column_names` una lista vacia.
+
+Complementa a `infer_fk_containment_duckdb`: esta funcion devuelve las relaciones
+de clave REALES del schema (declaradas); la otra INFIERE FKs candidatas por
+containment de valores cuando el schema no las declaro. En el capitulo RELACIONES
+de AutomaticEDA se usan en orden: primero las declaradas, luego la inferencia como
+respaldo.
@@ -0,0 +1,127 @@
+"""detect_declared_keys_duckdb — lee las claves DECLARADAS de un schema DuckDB.
+
+Funcion impura: lee de disco a traves de la primitiva read-only del grupo
+`duckdb` (duckdb_query_readonly). Pertenece al grupo de capacidad `eda`
+(relaciones de clave): a diferencia de infer_fk_containment_duckdb, que INFIERE
+FOREIGN KEYs candidatas por containment de valores, esta funcion devuelve las
+constraints REALES que el schema ha declarado (PRIMARY KEY / FOREIGN KEY /
+UNIQUE) leyendo la table function `duckdb_constraints()`.
+
+Es la pieza del capitulo RELACIONES de AutomaticEDA que muestra las relaciones de
+clave reales cuando existen — frente a la inferencia, que se usa cuando el schema
+no las declaro.
+
+Estilo dict-no-throw del grupo duckdb: nunca lanza; captura cualquier error y
+devuelve {status:'error', error:str}.
+"""
+
+from infra import duckdb_query_readonly
+
+
+def _as_list(value) -> list:
+    """Normalize the value of a DuckDB LIST column to a list of strings.
+
+    Per the original author's note, in DuckDB 1.5.2 `constraint_column_names` and
+    `referenced_column_names` already arrive as Python lists through
+    duckdb_query_readonly. This helper is only a safety net: if the value arrives
+    as a string (e.g. the repr `[id, customer_id]`), it is parsed defensively.
+    """
+    if value is None:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(v) for v in value]
+    if isinstance(value, str):
+        s = value.strip()
+        if s.startswith("[") and s.endswith("]"):  # drop the surrounding brackets
+            s = s[1:-1]
+        if not s.strip():
+            return []
+        return [
+            part.strip().strip("'\"")
+            for part in s.split(",")
+            if part.strip().strip("'\"")
+        ]
+    return [str(value)]  # any other scalar becomes a one-element list
+
+
+def detect_declared_keys_duckdb(db_path: str, table: str = None) -> dict:
+    """Detect the PRIMARY KEY / FOREIGN KEY / UNIQUE keys declared in a DuckDB file.
+
+    Queries the `duckdb_constraints()` table function and keeps only the key
+    constraints (PRIMARY KEY, FOREIGN KEY, UNIQUE); other constraint types are skipped.
+
+    Args:
+        db_path: path to the DuckDB file, handed to duckdb_query_readonly. A
+            non-"ok" status from that call is returned as {status:'error', ...}.
+        table: when given, only rows whose `table_name` equals `table` are kept:
+            its PRIMARY KEY and UNIQUE constraints, and the FOREIGN KEYs whose
+            SOURCE table is `table`. None (default) keeps the rows of every table.
+            Names are compared with `!=`, i.e. case-sensitively, exactly as the
+            query returns them.
+
+    Returns:
+        A dict; any Exception is caught and reported instead of raised. On success:
+            {status:'ok',
+             primary_keys:[{table:str, columns:[str, ...]}, ...],
+             foreign_keys:[{table:str, columns:[str, ...],
+                            referenced_table:str,
+                            referenced_columns:[str, ...]}, ...],
+             unique:[{table:str, columns:[str, ...]}, ...],
+             tables:[str, ...]}   # (source) tables with some emitted PK/FK/UNIQUE
+        On error (nothing is raised): {status:'error', error:str}.
+    """
+    try:
+        sql = (
+            "SELECT table_name, constraint_type, constraint_column_names, "
+            "referenced_table, referenced_column_names FROM duckdb_constraints()"
+        )
+        res = duckdb_query_readonly(db_path, sql)
+        if res["status"] != "ok":
+            return {"status": "error", "error": res["error"]}
+
+        primary_keys = []
+        foreign_keys = []
+        unique = []
+        tables = set()
+
+        for row in res["rows"]:
+            ctype = row["constraint_type"]
+            tname = row["table_name"]
+
+            # Filter by source table: for PK/FK/UNIQUE the owner of the constraint
+            # is `table_name`. A FK is attributed to its source table (not to the
+            # referenced one), which is what the filter asks for.
+            if table is not None and tname != table:
+                continue
+
+            cols = _as_list(row["constraint_column_names"])
+
+            if ctype == "PRIMARY KEY":
+                primary_keys.append({"table": tname, "columns": cols})
+                tables.add(tname)
+            elif ctype == "UNIQUE":
+                unique.append({"table": tname, "columns": cols})
+                tables.add(tname)
+            elif ctype == "FOREIGN KEY":
+                foreign_keys.append(
+                    {
+                        "table": tname,
+                        "columns": cols,
+                        "referenced_table": row["referenced_table"],
+                        "referenced_columns": _as_list(
+                            row["referenced_column_names"]
+                        ),
+                    }
+                )
+                tables.add(tname)
+            # NOT NULL and CHECK are ignored: they are not key relationships.
+
+        return {
+            "status": "ok",
+            "primary_keys": primary_keys,
+            "foreign_keys": foreign_keys,
+            "unique": unique,
+            "tables": sorted(tables),
+        }
+    except Exception as e:  # noqa: BLE001
+        return {"status": "error", "error": str(e)}
@@ -0,0 +1,167 @@
+"""Tests para detect_declared_keys_duckdb."""
+
+import duckdb
+import pytest
+
+from .detect_declared_keys_duckdb import detect_declared_keys_duckdb
+
+
+@pytest.fixture
+def db(tmp_path):
+    """Temporary DuckDB file with declared keys.
+
+    - customers(id PRIMARY KEY, name)
+    - orders(id PRIMARY KEY, customer_id REFERENCES customers(id), amt)
+
+    This declares two PRIMARY KEYs (customers.id, orders.id) and one FOREIGN KEY
+    (orders.customer_id -> customers.id). Per the original note, DuckDB also emits
+    NOT NULL constraints for the PK columns, which the function must ignore.
+    """
+    path = str(tmp_path / "keys_test.duckdb")
+    con = duckdb.connect(path)
+    con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)")
+    con.execute(
+        "CREATE TABLE orders("
+        "  id INTEGER PRIMARY KEY,"
+        "  customer_id INTEGER REFERENCES customers(id),"
+        "  amt DOUBLE"
+        ")"
+    )
+    con.close()
+    return path
+
+
+def _pk_for(res, table):
+    """Return the primary_keys entry whose table is `table`, or None if absent."""
+    for pk in res["primary_keys"]:
+        if pk["table"] == table:
+            return pk
+    return None
+
+
+def test_golden_detecta_pks_y_fk(db):
+    """Golden: detects both declared PKs and the declared FK, with concrete values."""
+    res = detect_declared_keys_duckdb(db)
+    assert res["status"] == "ok"
+
+    # PRIMARY KEY of customers and of orders.
+    pk_customers = _pk_for(res, "customers")
+    pk_orders = _pk_for(res, "orders")
+    assert pk_customers is not None
+    assert pk_customers["columns"] == ["id"]
+    assert pk_orders is not None
+    assert pk_orders["columns"] == ["id"]
+
+    # FOREIGN KEY orders.customer_id -> customers.id.
+    assert len(res["foreign_keys"]) == 1
+    fk = res["foreign_keys"][0]
+    assert fk["table"] == "orders"
+    assert fk["columns"] == ["customer_id"]
+    assert fk["referenced_table"] == "customers"
+    assert fk["referenced_columns"] == ["id"]
+
+    # tables includes both (each is the source of some constraint).
+    assert res["tables"] == ["customers", "orders"]
+
+
+def test_golden_ignora_not_null_y_check(db):
+    """NOT NULL rows are not reported as keys (the `db` fixture declares no CHECK)."""
+    res = detect_declared_keys_duckdb(db)
+    assert res["status"] == "ok"
+    # Only the 2 real PKs (not the NOT NULL rows said to be generated per PK column).
+    assert len(res["primary_keys"]) == 2
+    # No UNIQUE is declared in this schema.
+    assert res["unique"] == []
+
+
+def test_edge_filtra_por_tabla_orders(db):
+    """Edge table='orders': the PK of orders + its FK; NOT the PK of customers."""
+    res = detect_declared_keys_duckdb(db, table="orders")
+    assert res["status"] == "ok"
+
+    # Only the PK of orders.
+    assert len(res["primary_keys"]) == 1
+    assert res["primary_keys"][0]["table"] == "orders"
+    assert res["primary_keys"][0]["columns"] == ["id"]
+    # The PK of customers is NOT present.
+    assert _pk_for(res, "customers") is None
+
+    # The FK of orders is present (source = orders).
+    assert len(res["foreign_keys"]) == 1
+    assert res["foreign_keys"][0]["table"] == "orders"
+    assert res["foreign_keys"][0]["referenced_table"] == "customers"
+
+    # tables only contains orders (the owner of the emitted constraints).
+    assert res["tables"] == ["orders"]
+
+
+def test_edge_filtra_por_tabla_customers(db):
+    """Edge table='customers': only its PK; no FK (orders is filtered out)."""
+    res = detect_declared_keys_duckdb(db, table="customers")
+    assert res["status"] == "ok"
+    assert len(res["primary_keys"]) == 1
+    assert res["primary_keys"][0]["table"] == "customers"
+    assert res["foreign_keys"] == []
+    assert res["tables"] == ["customers"]
+
+
+def test_edge_unique_declarado(tmp_path):
+    """Edge: a declared UNIQUE constraint shows up in `unique`."""
+    path = str(tmp_path / "unique_test.duckdb")
+    con = duckdb.connect(path)
+    con.execute("CREATE TABLE products(sku INTEGER UNIQUE, name TEXT)")
+    con.close()
+
+    res = detect_declared_keys_duckdb(path)
+    assert res["status"] == "ok"
+    assert len(res["unique"]) == 1
+    assert res["unique"][0]["table"] == "products"
+    assert res["unique"][0]["columns"] == ["sku"]
+    assert res["primary_keys"] == []
+    assert res["foreign_keys"] == []
+    assert res["tables"] == ["products"]
+
+
+def test_edge_sin_constraints_listas_vacias(tmp_path):
+    """Edge: table without PK/FK/UNIQUE -> every list is empty, status ok."""
+    path = str(tmp_path / "no_keys.duckdb")
+    con = duckdb.connect(path)
+    con.execute("CREATE TABLE log(a INTEGER, b INTEGER)")
+    con.close()
+
+    res = detect_declared_keys_duckdb(path)
+    assert res["status"] == "ok"
+    assert res["primary_keys"] == []
+    assert res["foreign_keys"] == []
+    assert res["unique"] == []
+    assert res["tables"] == []
+
+
+def test_error_db_inexistente_no_lanza(tmp_path):
+    """Error: nonexistent db_path -> status error, without raising an exception."""
+    path = str(tmp_path / "does_not_exist.duckdb")
+    res = detect_declared_keys_duckdb(path)
+    assert res["status"] == "error"
+    assert isinstance(res["error"], str)
+    assert res["error"] != ""
+
+
+def test_shape_resultado(db):
+    """The return value has exactly the expected keys (top level, PK and FK entries)."""
+    res = detect_declared_keys_duckdb(db)
+    assert set(res.keys()) == {
+        "status",
+        "primary_keys",
+        "foreign_keys",
+        "unique",
+        "tables",
+    }
+    for pk in res["primary_keys"]:
+        assert set(pk.keys()) == {"table", "columns"}
+    for fk in res["foreign_keys"]:
+        assert set(fk.keys()) == {
+            "table",
+            "columns",
+            "referenced_table",
+            "referenced_columns",
+        }
@@ -0,0 +1,89 @@
+---
+name: render_automatic_eda_markdown
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def render_automatic_eda_markdown(chapters_or_profile, out_path: str, meta: dict = None) -> dict"
+description: "Renderiza un documento AutomaticEDA por CAPÍTULOS (modelo de bloques independiente del formato) en un único MARKDOWN autocontenido pensado para PEGAR A UN LLM. Acepta una lista de capítulos del modelo o directamente un TableProfile del grupo eda (construye los capítulos canónicos con build_document). Prioriza TEXTO + DATOS sobre lo visual: las tablas se vuelcan como tablas markdown con TODAS las filas (sin paginar — no hay páginas que cortar), una figura matplotlib se reduce a su caption más la tabla de datos subyacente (Desde/Hasta/Frecuencia de las barras del histograma) porque un LLM no ve la imagen, y los marcadores de glosario se eliminan conservando el **negrita**. Lleva cabecera (# título), bloque de metadatos en blockquote e índice numerado con anclas GitHub. Espejo de render_automatic_eda_pdf/render_automatic_eda_pptx pero SIN manifest (KISS, el markdown es un único artefacto de texto). dict-no-throw: nunca lanza, devuelve {path, n_chars, chapters, note}; en error fatal path es None y note explica la causa. Flag opcional meta['embed_figures'] exporta PNGs junto al .md (off por defecto)."
+tags: [eda, markdown, render, report, llm, automatic-eda, chapters, versioned, no-cut, text, datascience, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [os, re, matplotlib, "datascience.automatic_eda"]
+params:
+  - name: chapters_or_profile
+    desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Bloques soportados: heading, markdown, kv_table, data_table, figure, image, caption, note, group, glossary_entry. Lectura defensiva: lo no reconocido se degrada a Note, nunca lanza."
+  - name: out_path
+    desc: "ruta del archivo .md de salida. Los directorios padre se crean si faltan. Directorio no escribible → {path:None, note:<causa>} sin lanzar."
+  - name: meta
+    desc: "dict opcional. Claves: title (título del documento), ctx (dict con dataset_name→Dataset, source_origin→Fuente, storage→Almacenamiento, n_rows/n_cols→Dimensiones; también lo consumen los builders de capítulo cuando se da un profile), generated_at (timestamp; si falta se genera ISO UTC), embed_figures (True para exportar PNGs <basename>_figN.png junto al .md; por defecto False y el markdown queda autocontenido)."
+output: "dict (nunca lanza): {path: str|None, n_chars: int, chapters: list[{id,version}], note: str}. En error fatal (p.ej. directorio no escribible) path es None y note explica la causa. Un documento sin capítulos aplicables produce un markdown mínimo válido con 'documento vacío' y chapters=[]."
+tested: true
+tests: ["test_golden_bloques_sinteticos_serializa_todo_a_markdown", "test_edge_documento_vacio_no_revienta", "test_profile_path_construye_capitulos_y_escribe"]
+test_file_path: "python/functions/datascience/render_automatic_eda_markdown_test.py"
+file_path: "python/functions/datascience/render_automatic_eda_markdown.py"
+---
+
+## Ejemplo
+
+```python
+from datascience import render_automatic_eda_markdown
+
+# Desde un TableProfile del grupo eda (mismo modelo que los renderers PDF/PPTX).
+profile = {
+    "table": "ventas", "source": "/data/ventas.csv",
+    "n_rows": 1000, "n_cols": 2, "quality_score": 92.5,
+    "columns": [
+        {"name": "precio", "inferred_type": "numeric", "null_pct": 0.01,
+         "numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0,
+                     "std": 12.3}},
+        {"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0,
+         "categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
+    ],
+}
+res = render_automatic_eda_markdown(
+    profile, "reports/ventas_aeda.md",
+    {"title": "EDA — ventas",
+     "ctx": {"dataset_name": "Ventas", "source_origin": "ERP export",
+             "n_rows": 1000, "n_cols": 2}})
+print(res["path"], res["n_chars"], res["chapters"])
+# -> reports/ventas_aeda.md 4123 [{'id':'portada','version':'1.0.0'}, ...]
+```
+
+## Cuando usarla
+
+Cuando quieras **pegar el EDA a un LLM** (ChatGPT, Claude, ...) o tenerlo en texto
+plano versionable: mismo documento por capítulos que el PDF/PPTX, pero serializado a
+Markdown sin binarios. Úsala como tercera salida junto a `render_automatic_eda_pdf`
+(móvil) y `render_automatic_eda_pptx` (compartir) desde el MISMO modelo de capítulos.
+A diferencia de esas dos, no hay páginas ni slides: todas las filas de cada tabla se
+vuelcan (nada se corta) y cada figura se reduce a su caption + la tabla de datos
+subyacente, que es lo que un LLM puede leer. Para añadir capítulos al documento, ver
+`docs/capabilities/automatic_eda.md`.
+
+## Gotchas
+
+- **Impura**: escribe el `.md` en `out_path` (crea los directorios padre). Con
+  `meta['embed_figures']=True` además exporta un PNG `<basename>_figN.png` por figura
+  junto al `.md`; por defecto NO exporta nada y el markdown queda autocontenido.
+- **Nunca lanza** (dict-no-throw): un bloque que falle se degrada a una nota y se anota
+  en `note`; el documento se escribe igual. Un profile/lista vacíos producen un markdown
+  mínimo válido con `*(documento vacío …)*` y `chapters=[]`.
+- **Figuras = datos, no imagen**: un bloque `figure` se serializa como `*Figura: caption*`
+  más, si la figura matplotlib trae barras (histograma / barras), una tabla
+  `| Desde | Hasta | Frecuencia |` extraída de los `Rectangle` patches (máx 100 filas;
+  el resto se trunca con `*… (N filas más)*`). Si no hay barras o algo falla, solo sale
+  el caption. La figura se cierra (`plt.close`) tras leerla.
+- **Glosario vs negrita**: se eliminan SOLO los marcadores de glosario
+  `[[term:key]]visible[[/term]]` (queda `visible`); el `**negrita**` markdown SE
+  CONSERVA (es válido). No se usa `strip_inline_md` aquí porque ese también quita el bold.
+- **Anclas del índice**: el `## Índice` enlaza cada capítulo con un ancla estilo GitHub
+  del encabezado `## N. Título` (minúsculas, espacios→`-`, sin signos). Si dos capítulos
+  comparten título exacto sus anclas colisionan (caso raro; los capítulos canónicos tienen
+  títulos únicos).
+- **Tablas**: las celdas escapan `|` (→ `\|`) y pliegan saltos de línea a `<br>` para no
+  romper la columna. No hay reparto por ancho — un LLM no lo necesita.
@@ -0,0 +1,55 @@
+"""render_automatic_eda_markdown — chapter-based EDA report as one Markdown file.
+
+Public ``eda``-group entry point that serializes an AutomaticEDA document (a list
+of chapters, or an ``eda`` TableProfile from which the canonical chapters are
+built) into a single self-contained Markdown file optimised to be **pasted into
+an LLM**: plain text, Markdown tables (every row dumped — there are no pages to
+cut), figures reduced to caption + underlying data, no binaries. It mirrors
+``render_automatic_eda_pdf`` / ``render_automatic_eda_pptx`` but for text output;
+unlike those it writes no manifest (KISS — Markdown is a single text artefact).
+
+dict-no-throw: never raises. Returns ``{path, n_chars, chapters, note}``; on a
+fatal error ``path`` is None and ``note`` explains why.
+"""
+
+from __future__ import annotations
+
+from datascience.automatic_eda import build_document, render_md
+from datascience.automatic_eda.model import as_chapter, as_chapters
+
+
+def _coerce_chapters(chapters_or_profile, meta: dict) -> list:
+    """Accept chapters OR an eda profile and return a list of Chapter."""
+    arg = chapters_or_profile
+    if isinstance(arg, (list, tuple)):
+        return as_chapters(list(arg))
+    if isinstance(arg, dict):
+        if "blocks" in arg and "columns" not in arg:
+            ch = as_chapter(arg)
+            return [ch] if ch is not None else []
+        return build_document(arg, (meta or {}).get("ctx"))
+    return []
+
+
+def render_automatic_eda_markdown(chapters_or_profile, out_path: str,
+                                  meta: dict = None) -> dict:
+    """Render an AutomaticEDA document into a single self-contained Markdown file.
+
+    Args:
+        chapters_or_profile: a list of chapters (``Chapter`` dataclasses or
+            dicts) or an ``eda`` TableProfile dict (chapters built via
+            ``build_document(profile, meta['ctx'])``).
+        out_path: filesystem path for the ``.md`` (parent dirs are created).
+        meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
+            ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
+            ``generated_at``, ``embed_figures`` (export PNGs beside the .md,
+            default False — off keeps the Markdown self-contained).
+
+    Returns:
+        dict (never raises): ``{path: str|None, n_chars: int,
+        chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is
+        None and ``note`` explains the cause.
+    """
+    meta = dict(meta or {})
+    chapters = _coerce_chapters(chapters_or_profile, meta)
+    return render_md(chapters, out_path, meta)
@@ -0,0 +1,168 @@
+"""Tests for render_automatic_eda_markdown — DoD: golden + edge + profile path.
+
+Self-contained synthetic blocks (no DuckDB). Verifies every block kind serializes
+to Markdown (heading, markdown with glossary+bold, kv/data tables, a figure whose
+histogram bars become a data table, caption, note, group, glossary entry), that a
+leading level-1 heading equal to the chapter title is omitted, that an empty
+document degrades to a valid minimal Markdown without raising, and that passing a
+minimal TableProfile builds chapters and writes the file.
+"""
+
+import os
+import tempfile
+
+from datascience.render_automatic_eda_markdown import render_automatic_eda_markdown
+from datascience.automatic_eda.model import (
+    Caption, Chapter, DataTable, Figure, GlossaryEntry, Group, Heading, KVTable,
+    Markdown, Note,
+)
+
+
+def _hist_fig():
+    import matplotlib
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    ax.hist([1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5], bins=5)
+    return fig
+
+
+def _chapters() -> list:
+    blocks = [
+        Heading("Demo", 1),                       # == chapter title -> omitted.
+        Heading("Seccion dos", 2),                # -> ####
+        Markdown("Texto con [[term:ent]]entropia[[/term]] y **bold** aqui."),
+        KVTable(rows=[("Filas", 1000), ("Columnas", 5)], title="Resumen"),
+        DataTable(header=["col", "valor"],
+                  rows=[["alpha", "111"], ["beta", "222"], ["gamma", "333"]],
+                  title="Datos", note="nota inferior"),
+        Figure(make=_hist_fig, caption="Histograma demo"),
+        Caption("pie de figura"),
+        Note("una nota aparte"),
+        Group(title="Grupo X", blocks=[Markdown("dentro del grupo")]),
+        GlossaryEntry(key="ent", label="Entropia",
+                      definition="Medida de incertidumbre."),
+    ]
+    return [Chapter(id="demo", title="Demo", version="1.0.0", blocks=blocks)]
+
+
+def _read(path: str) -> str:
+    with open(path, "r", encoding="utf-8") as fh:
+        return fh.read()
+
+
+def test_golden_bloques_sinteticos_serializa_todo_a_markdown():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "demo.md")
+        res = render_automatic_eda_markdown(
+            _chapters(), out,
+            {"title": "EDA Demo",
+             "ctx": {"dataset_name": "Demo", "n_rows": 12, "n_cols": 2}})
+        assert res["path"] == out
+        assert os.path.exists(out)
+        assert res["n_chars"] > 0
+        assert res["chapters"] == [{"id": "demo", "version": "1.0.0"}]
+
+        content = _read(out)
+        # Document structure.
+        assert content.startswith("# ")
+        assert "## Índice" in content
+        # A Markdown table is present (header + separator row).
+        assert "| " in content and "| --- " in content
+        # DataTable values are all dumped.
+        for v in ("alpha", "111", "beta", "222", "gamma", "333"):
+            assert v in content
+        # Glossary markers stripped, bold kept.
+        assert "[[term" not in content
+        assert "[[/term]]" not in content
+        assert "**bold**" in content
+        assert "entropia" in content  # visible glossary text preserved.
+        # Figure histogram bars became a data table.
+        assert "| Desde | Hasta | Frecuencia |" in content
+        # Glossary entry rendered as a level-3 heading.
+        assert "### Entropia" in content
+        # Level-2 heading -> ####.
+        assert "#### Seccion dos" in content
+        # Leading level-1 heading equal to the title was omitted.
+        assert "### Demo" not in content
+        # Group title rendered.
+        assert "### Grupo X" in content
+
+
+def _hist_fig_with_span():
+    """Histogram with a wide ``axvspan`` (±1σ band) over it.
+
+    Reproduces the num_distr figure shape: matplotlib keeps the span as a lone
+    Rectangle in ``ax.patches`` alongside the bin bars; it must NOT leak into the
+    extracted bins table as a fake bin (it is ~5x wider than a bin)."""
+    import matplotlib
+    matplotlib.use("Agg")
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    data = [1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5]
+    ax.hist(data, bins=5)
+    ax.axvspan(2.0, 4.0, alpha=0.2)   # mean±σ band — a wide stray rectangle.
+    return fig
+
+
+def test_figura_descarta_axvspan_de_la_tabla_de_bins():
+    """The ±1σ band rectangle must not appear as a row in the bins table."""
+    blocks = [Figure(make=_hist_fig_with_span, caption="Hist con banda")]
+    chapters = [Chapter(id="f", title="Fig", version="1.0.0", blocks=blocks)]
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "fig.md")
+        render_automatic_eda_markdown(chapters, out, {"title": "T"})
+        content = _read(out)
+        assert "| Desde | Hasta | Frecuencia |" in content
+        # Extract the rows of the bins table: lines between the header/separator
+        # and the next blank line.
+        lines = content.splitlines()
+        hi = next(i for i, ln in enumerate(lines)
+                  if ln.startswith("| Desde | Hasta | Frecuencia |"))
+        rows = []
+        for ln in lines[hi + 2:]:           # skip header + separator
+            if not ln.startswith("|"):
+                break
+            rows.append(ln)
+        # 5 histogram bins, no extra wide span row.
+        assert len(rows) == 5, rows
+        # No row spans a width of ~2.0 (the axvspan from x=2 to x=4).
+        for ln in rows:
+            cells = [c.strip() for c in ln.strip("|").split("|")]
+            lo, hi_v = float(cells[0]), float(cells[1])
+            assert (hi_v - lo) < 1.5, f"wide span leaked: {ln}"
+
+
+def test_edge_documento_vacio_no_revienta():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "empty.md")
+        res = render_automatic_eda_markdown([], out, {})
+        assert res["path"] == out
+        assert os.path.exists(out)
+        assert res["chapters"] == []
+        content = _read(out)
+        assert "documento vacío" in content
+        assert content.startswith("# ")
+
+
+def test_profile_path_construye_capitulos_y_escribe():
+    profile = {
+        "table": "mini",
+        "source": "/data/mini.csv",
+        "n_rows": 10,
+        "n_cols": 1,
+        "quality_score": 88.0,
+        "columns": [
+            {"name": "x", "inferred_type": "numeric", "null_pct": 0.0,
+             "null_count": 0,
+             "numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
+                         "std": 0.5}},
+        ],
+    }
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "mini.md")
+        res = render_automatic_eda_markdown(
+            profile, out, {"title": "Mini", "ctx": {"dataset_name": "Mini"}})
+        assert res["path"] == out  # not None — no exception, file written.
+        assert os.path.exists(out)
+        assert res["n_chars"] > 0
@@ -0,0 +1,91 @@
+---
+name: suggest_intratable_fk_candidates
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: pure
+signature: "def suggest_intratable_fk_candidates(profile: dict, max_candidates: int = 20) -> list"
+description: "Sobre el TableProfile de UNA tabla (el dict de profile_table), sugiere por heuristica de nombre + cardinalidad que columnas PARECEN una clave foranea hacia otra tabla, cuando no hay relaciones inter-tabla que medir (una sola tabla). Es una SUGERENCIA, no una afirmacion: el ref_table_guess es el stem del nombre (customer_id -> customer) y NO confirma containment. Pura: solo lee el dict, sin I/O; nunca lanza (devuelve [])."
+tags: [eda, datascience, relationships, foreign-key, fk, heuristic, schema, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+params:
+  - name: profile
+    desc: "TableProfile (dict que produce profile_table / summarize_table_*). Se leen de forma defensiva `columns` (lista de ColumnProfile con name/inferred_type/physical_type/distinct_count/unique_pct/flags), `n_rows` (int) y `key_candidates` (lista de nombres de columna ya candidatos a PK, que se excluyen). Si no es dict o no trae columns -> []."
+  - name: max_candidates
+    desc: "Tope de sugerencias devueltas (default 20). Las columnas candidatas se ordenan por distinct_count descendente (mas informativas primero) antes de cortar a este maximo."
+output: "list (posiblemente vacia) de dicts, uno por columna sugerida, con claves: `column` (nombre), `ref_table_guess` (tabla conjeturada por el stem del nombre, p.ej. customer_id -> 'customer'), `reason` (frase humana que deja claro que es heuristica sin confirmar containment), `distinct_count` (int|None), `unique_pct` (float|None, fraccion 0-1 tal como viene del profile), `inferred_type` (str), `physical_type` (str). Nunca lanza."
+tested: true
+tests: ["test_golden_customer_id_detectado_otras_no", "test_camelcase_albumid_detectado", "test_constante_status_id_no_aparece", "test_profile_vacio_y_none_devuelven_lista_vacia", "test_category_id_casi_unico_parece_pk_no_aparece", "test_ref_table_guess_multitoken_y_orden_por_distinct", "test_max_candidates_corta_la_lista", "test_id_generico_solo_nunca_es_fk"]
+test_file_path: "python/functions/datascience/suggest_intratable_fk_candidates_test.py"
+file_path: "python/functions/datascience/suggest_intratable_fk_candidates.py"
+---
+
+## Ejemplo
+
+```python
+from datascience import suggest_intratable_fk_candidates
+
+# TableProfile de UNA tabla (tipo titanic): customer_id es FK N:1; id es la PK;
+# amount es una medida float; name es categorica sin sufijo de id.
+profile = {
+    "n_rows": 891,
+    "key_candidates": ["id"],
+    "columns": [
+        {"name": "id", "inferred_type": "numeric", "physical_type": "BIGINT",
+         "distinct_count": 891, "unique_pct": 1.0, "flags": ["possible_id"]},
+        {"name": "customer_id", "inferred_type": "numeric", "physical_type": "BIGINT",
+         "distinct_count": 137, "unique_pct": 0.15, "flags": []},
+        {"name": "amount", "inferred_type": "numeric", "physical_type": "DOUBLE",
+         "distinct_count": 400, "unique_pct": 0.45, "flags": []},
+        {"name": "name", "inferred_type": "categorical", "physical_type": "VARCHAR",
+         "distinct_count": 700, "unique_pct": 0.78, "flags": []},
+    ],
+}
+
+out = suggest_intratable_fk_candidates(profile)
+[c["column"] for c in out]              # -> ["customer_id"]
+out[0]["ref_table_guess"]               # -> "customer"
+out[0]["reason"]
+# -> "el nombre termina en '_id' y es N:1 (137 valores distintos < 891 filas):
+#     parece (heuristica por nombre, sin confirmar containment) una referencia a
+#     una tabla «customer»"
+```
+
+## Cuando usarla
+
+Cuando el EDA tiene SOLO UNA tabla y, por tanto, no se puede inferir una FK
+inter-tabla por containment (no hay otra tabla cuyos valores contener). Es el plan B
+del capitulo RELACIONES de AutomaticEDA: en vez de medir solapamiento de valores
+entre tablas (lo correcto cuando hay varias, ver `infer_fk_containment_duckdb` /
+`build_join_graph`), conjetura por el NOMBRE de la columna (`<algo>_id`) y por su
+CARDINALIDAD N:1 que columnas parecen apuntar a una entidad externa. Usala para
+enriquecer el reporte con "estas columnas parecen referencias a otras tablas" sin
+prometer que esa tabla exista. NO la uses si tienes varias tablas: ahi mide
+containment de verdad.
+
+## Gotchas
+
+- Es **heuristica**, no una verdad: produce **falsos positivos** (una columna
+  `period_id` que en realidad es un codigo libre, no una FK) y **falsos negativos**
+  (una FK que no se llama `*_id`, p.ej. `parent`, `owner`, `sku`). No la trates como
+  una afirmacion de esquema.
+- `ref_table_guess` es una **conjetura por el nombre** (el stem sin el sufijo id):
+  `customer_id` -> `customer`, `AlbumId` -> `album`, `manager_staff_id` ->
+  `manager_staff`. Puede no coincidir con el nombre real de la tabla (plurales,
+  prefijos, alias). Es una pista, no un join garantizado.
+- **NO confirma containment**: no comprueba que los valores de la columna existan en
+  ninguna otra tabla (no puede — solo recibe el perfil de una tabla). Para confirmar
+  una FK real con varias tablas usa `infer_fk_containment_duckdb`.
+- Excluye deliberadamente: el `id`/`Id`/`ID` generico a secas (suele ser la PK propia, no
+  una referencia), las columnas ya listadas en `key_candidates`, las constantes, las que
+  parecen unicas (`unique_pct >= 0.99`, mas PK que FK) y los tipos no-clave (float/decimal
+  son medidas; date/time/timestamp y boolean no son claves). `paid`, `valid`, `grid` (con `id`
+  en minuscula y sin separador) NO se confunden con FK, pero `PAID`/`VALID` en mayusculas SI.
+- `unique_pct` se interpreta como **fraccion 0-1** (tal como la emite el profile), no
+  como porcentaje 0-100.
@@ -0,0 +1,202 @@
+"""suggest_intratable_fk_candidates — heuristica de FK intra-tabla del grupo `eda`.
+
+Sobre el TableProfile de UNA tabla (el dict que produce ``profile_table``), sugiere
+por heuristica de NOMBRE + CARDINALIDAD que columnas PARECEN una clave foranea hacia
+otra tabla, util cuando no hay relaciones inter-tabla disponibles (una sola tabla y,
+por tanto, sin containment cruzado que medir). Es una SUGERENCIA, no una afirmacion:
+no confirma que exista la tabla referida ni que los valores esten contenidos en ella.
+
+La consume el capitulo RELACIONES de AutomaticEDA cuando solo hay una tabla.
+
+Funcion PURA: solo lee el dict (lectura defensiva con ``.get``), no hace I/O y nunca
+lanza por inputs raros (devuelve ``[]``).
+"""
+
+# inferred_type que es compatible con una clave foranea (entero/categorico).
+_FK_INFERRED_OK = {"numeric", "categorical", "integer"}
+
+# Prefijos de physical_type que admiten ser clave foranea (enteros, texto, uuid).
+_FK_PHYSICAL_PREFIXES = (
+    "int", "bigint", "smallint", "tinyint", "hugeint", "uint",
+    "varchar", "text", "char", "bpchar", "string", "uuid",
+)
+
+# Prefijos de physical_type que EXCLUYEN ser clave foranea: medidas en coma flotante
+# (float/double/decimal/numeric/real), temporales (date/time/timestamp/interval) y
+# boolean. Se comprueban ANTES que las senales positivas (la exclusion gana: una
+# columna numeric con physical DOUBLE es una medida, no una FK).
+_FK_PHYSICAL_EXCLUDE = (
+    "float", "double", "decimal", "numeric", "real",
+    "date", "time", "timestamp", "interval",
+    "bool",
+)
+
+
+def _fk_name_signal(name):
+    """Detecta el sufijo de clave foranea en el nombre y devuelve ``(stem, sufijo)``.
+
+    Reconoce ``<algo>_id`` (snake), ``<Algo>Id`` y ``<algo>ID`` (camel). NO reconoce
+    el ``id``/``Id``/``ID`` generico a secas (suele ser la PK propia de la tabla, no
+    una referencia). En camelCase la ``I`` mayuscula marca el limite de palabra, asi
+    que ``paid``/``valid``/``grid`` (``id`` en minuscula y sin separador) NO matchean.
+
+    El ``stem`` se devuelve en minusculas y sirve de ``ref_table_guess`` (la tabla a
+    la que probablemente apunta): ``customer_id`` -> ``"customer"``, ``AlbumId`` ->
+    ``"album"``, ``manager_staff_id`` -> ``"manager_staff"``. Devuelve ``None`` si no
+    hay senal de nombre.
+    """
+    if not isinstance(name, str):
+        return None
+    raw = name.strip()
+    if not raw:
+        return None
+    # Snake: termina en "_id" (indiferente a mayusculas en la parte "id").
+    if raw.lower().endswith("_id"):
+        stem = raw[:-3].rstrip("_-. ")
+        if not stem:
+            return None
+        return (stem.lower(), "_id")
+    # Camel todo-mayuscula: "...ID" (p.ej. customerID).
+    if raw.endswith("ID"):
+        stem = raw[:-2].rstrip("_-. ")
+        if not stem:
+            return None
+        return (stem.lower(), "ID")
+    # Camel: "...Id" (p.ej. AlbumId).
+    if raw.endswith("Id"):
+        stem = raw[:-2].rstrip("_-. ")
+        if not stem:
+            return None
+        return (stem.lower(), "Id")
+    return None
+
+
+def _fk_type_compatible(col):
+    """True si el tipo de la columna admite ser clave foranea.
+
+    Compatible si el ``physical_type`` NO es una medida flotante, una temporal ni
+    boolean, Y ademas (``inferred_type`` en {numeric, categorical, integer} O el
+    ``physical_type`` empieza por entero/varchar/text/char/uuid). La comparacion es
+    indistinta a mayusculas/minusculas.
+    """
+    phys = (col.get("physical_type") or "").strip().lower()
+    inferred = (col.get("inferred_type") or "").strip().lower()
+    # Exclusion por tipo fisico (gana sobre cualquier senal positiva).
+    for bad in _FK_PHYSICAL_EXCLUDE:
+        if phys.startswith(bad):
+            return False
+    # Senal positiva por tipo inferido.
+    if inferred in _FK_INFERRED_OK:
+        return True
+    # Senal positiva por tipo fisico (entero/texto/uuid).
+    for good in _FK_PHYSICAL_PREFIXES:
+        if phys.startswith(good):
+            return True
+    return False
+
+
+def suggest_intratable_fk_candidates(profile: dict, max_candidates: int = 20) -> list:
+    """Sugiere columnas que parecen una FK intra-tabla por nombre + cardinalidad.
+
+    Heuristica (no afirma nada): una columna es candidata a clave foranea si su nombre
+    tiene sufijo de id con stem no vacio (``<algo>_id`` / ``<Algo>Id`` / ``<algo>ID``,
+    NUNCA el ``id`` generico), no es ya candidata a PK, no es constante, tiene
+    cardinalidad alta pero por debajo del numero de filas (N:1, no unica) y un tipo
+    compatible con clave (entero/categorico/texto/uuid; nunca float/fecha/boolean).
+
+    Args:
+        profile: TableProfile (dict de ``profile_table``). Se leen, de forma
+            defensiva, ``columns`` (lista de ColumnProfile), ``n_rows`` y
+            ``key_candidates`` (nombres de columna ya candidatos a PK).
+        max_candidates: tope de sugerencias devueltas (default 20). Las columnas se
+            ordenan por ``distinct_count`` descendente (mas informativas primero)
+            antes de cortar.
+
+    Returns:
+        list de dicts (posiblemente vacia), uno por columna sugerida, con claves:
+        ``column``, ``ref_table_guess`` (stem del nombre), ``reason`` (frase humana),
+        ``distinct_count``, ``unique_pct`` (fraccion 0-1 tal como viene del profile),
+        ``inferred_type``, ``physical_type``. Nunca lanza: si ``profile`` no es dict o
+        no hay columnas, devuelve ``[]``.
+    """
+    if not isinstance(profile, dict):
+        return []
+    columns = profile.get("columns")
+    if not isinstance(columns, list):
+        return []
+
+    n_rows = profile.get("n_rows")
+    has_n_rows = (
+        isinstance(n_rows, int) and not isinstance(n_rows, bool) and n_rows > 0
+    )
+
+    key_candidates = profile.get("key_candidates")
+    if not isinstance(key_candidates, (list, tuple, set)):
+        key_candidates = []
+    key_set = set(key_candidates)
+
+    out = []
+    for col in columns:
+        if not isinstance(col, dict):
+            continue
+        name = col.get("name")
+
+        # 1) Senal de nombre: sufijo de id con stem no vacio.
+        signal = _fk_name_signal(name)
+        if signal is None:
+            continue
+        ref_guess, suffix = signal
+
+        # 2) No es ya candidata a PK (clave primaria de la propia tabla).
+        if name in key_set:
+            continue
+
+        # 3) No constante y con >= 2 valores distintos.
+        flags = col.get("flags") or []
+        if "constant" in flags:
+            continue
+        dc = col.get("distinct_count")
+        if not (isinstance(dc, int) and not isinstance(dc, bool) and dc >= 2):
+            continue
+
+        # 4) Cardinalidad alta pero < n_rows (no es PK) y no parece unica.
+        if has_n_rows and dc >= n_rows:
+            continue
+        unique_pct = col.get("unique_pct")
+        has_unique = (
+            isinstance(unique_pct, (int, float)) and not isinstance(unique_pct, bool)
+        )
+        if has_unique and unique_pct >= 0.99:
+            continue
+
+        # 5) Tipo compatible con clave foranea (entero/categorico/texto; no medida).
+        if not _fk_type_compatible(col):
+            continue
+
+        out.append(
+            {
+                "column": name,
+                "ref_table_guess": ref_guess,
+                "reason": _build_reason(suffix, dc, n_rows if has_n_rows else None, ref_guess),
+                "distinct_count": dc,
+                "unique_pct": float(unique_pct) if has_unique else None,
+                "inferred_type": col.get("inferred_type") or "",
+                "physical_type": col.get("physical_type") or "",
+            }
+        )
+
+    # Mas informativas primero (mayor cardinalidad), luego corte.
+    out.sort(key=lambda d: d.get("distinct_count") or 0, reverse=True)
+    return out[: max(0, int(max_candidates))]
+
+
+def _build_reason(suffix, dc, n_rows, ref_guess):
+    """Frase humana que deja claro que la sugerencia es heuristica, no confirmada."""
+    if n_rows is not None:
+        card = f"es N:1 ({dc} valores distintos < {n_rows} filas)"
+    else:
+        card = f"tiene {dc} valores distintos que se repiten (cardinalidad N:1)"
+    return (
+        f"el nombre termina en '{suffix}' y {card}: parece (heuristica por nombre, "
+        f"sin confirmar containment) una referencia a una tabla «{ref_guess}»"
+    )
@@ -0,0 +1,157 @@
+"""Tests para suggest_intratable_fk_candidates (funcion pura, sin I/O)."""
+
+from suggest_intratable_fk_candidates import suggest_intratable_fk_candidates
+
+
+def _col(name, inferred_type="numeric", physical_type="BIGINT", distinct_count=10,
+         unique_pct=0.1, flags=None):
+    """Construye un ColumnProfile minimo a mano (el dict que emite profile_table)."""
+    return {
+        "name": name,
+        "inferred_type": inferred_type,
+        "physical_type": physical_type,
+        "semantic_type": "",
+        "distinct_count": distinct_count,
+        "unique_pct": unique_pct,
+        "null_count": 0,
+        "null_pct": 0.0,
+        "flags": list(flags) if flags else [],
+    }
+
+
+def test_golden_customer_id_detectado_otras_no():
+    # Tabla tipo titanic: customer_id es FK N:1; id es la PK; amount es medida;
+    # name es categorica sin sufijo de id. Solo customer_id debe aparecer.
+    profile = {
+        "n_rows": 891,
+        "key_candidates": ["id"],
+        "columns": [
+            _col("id", inferred_type="numeric", physical_type="BIGINT",
+                 distinct_count=891, unique_pct=1.0, flags=["possible_id"]),
+            _col("customer_id", inferred_type="numeric", physical_type="BIGINT",
+                 distinct_count=137, unique_pct=0.15, flags=[]),
+            _col("amount", inferred_type="numeric", physical_type="DOUBLE",
+                 distinct_count=400, unique_pct=0.45),
+            _col("name", inferred_type="categorical", physical_type="VARCHAR",
+                 distinct_count=700, unique_pct=0.78),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile)
+    assert isinstance(out, list)
+    assert [c["column"] for c in out] == ["customer_id"]
+    cand = out[0]
+    assert cand["ref_table_guess"] == "customer"
+    assert cand["distinct_count"] == 137
+    assert cand["unique_pct"] == 0.15
+    assert cand["inferred_type"] == "numeric"
+    assert cand["physical_type"] == "BIGINT"
+    # La razon deja claro que es heuristica + cita el sufijo y la tabla.
+    assert "customer" in cand["reason"]
+    assert "_id" in cand["reason"]
+
+
+def test_camelcase_albumid_detectado():
+    # AlbumId (camelCase, VARCHAR) -> detectada, ref_table_guess "album".
+    profile = {
+        "n_rows": 3503,
+        "key_candidates": ["TrackId"],
+        "columns": [
+            _col("AlbumId", inferred_type="categorical", physical_type="VARCHAR",
+                 distinct_count=347, unique_pct=0.10),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile)
+    # TrackId es PK candidata (en key_candidates), AlbumId no -> AlbumId aparece.
+    assert [c["column"] for c in out] == ["AlbumId"]
+    assert out[0]["ref_table_guess"] == "album"
+
+
+def test_constante_status_id_no_aparece():
+    # status_id constante (flag "constant", distinct_count 1) NO es FK util.
+    profile = {
+        "n_rows": 1000,
+        "key_candidates": [],
+        "columns": [
+            _col("status_id", inferred_type="numeric", physical_type="INTEGER",
+                 distinct_count=1, unique_pct=0.001, flags=["constant"]),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile)
+    assert out == []
+
+
+def test_profile_vacio_y_none_devuelven_lista_vacia():
+    # Lectura defensiva: ni {} ni None lanzan; devuelven [].
+    assert suggest_intratable_fk_candidates({}) == []
+    assert suggest_intratable_fk_candidates(None) == []
+    # profile sin columns o con columns no-lista tampoco lanza.
+    assert suggest_intratable_fk_candidates({"n_rows": 10}) == []
+    assert suggest_intratable_fk_candidates({"columns": "no-soy-lista"}) == []
+
+
+def test_category_id_casi_unico_parece_pk_no_aparece():
+    # unique_pct 0.999 -> parece PK (no N:1) -> NO se sugiere como FK.
+    profile = {
+        "n_rows": 891,
+        "key_candidates": [],
+        "columns": [
+            _col("category_id", inferred_type="numeric", physical_type="BIGINT",
+                 distinct_count=890, unique_pct=0.999),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile)
+    assert out == []
+
+
+def test_ref_table_guess_multitoken_y_orden_por_distinct():
+    # manager_staff_id conserva los underscores del stem -> "manager_staff".
+    # Ademas, con varias candidatas, se ordenan por distinct_count descendente.
+    profile = {
+        "n_rows": 10000,
+        "key_candidates": ["staff_id"],  # staff_id es PK aqui, no debe aparecer
+        "columns": [
+            _col("staff_id", inferred_type="numeric", physical_type="BIGINT",
+                 distinct_count=10000, unique_pct=1.0, flags=["possible_id"]),
+            _col("store_id", inferred_type="numeric", physical_type="INTEGER",
+                 distinct_count=2, unique_pct=0.0002),
+            _col("manager_staff_id", inferred_type="numeric", physical_type="INTEGER",
+                 distinct_count=40, unique_pct=0.004),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile)
+    cols = [c["column"] for c in out]
+    # staff_id excluida (PK); las otras dos ordenadas por distinct desc.
+    assert cols == ["manager_staff_id", "store_id"]
+    refs = {c["column"]: c["ref_table_guess"] for c in out}
+    assert refs["manager_staff_id"] == "manager_staff"
+    assert refs["store_id"] == "store"
+
+
+def test_max_candidates_corta_la_lista():
+    # max_candidates limita el numero de sugerencias devueltas.
+    profile = {
+        "n_rows": 10000,
+        "key_candidates": [],
+        "columns": [
+            _col("a_id", distinct_count=300, unique_pct=0.03),
+            _col("b_id", distinct_count=200, unique_pct=0.02),
+            _col("c_id", distinct_count=100, unique_pct=0.01),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile, max_candidates=2)
+    assert [c["column"] for c in out] == ["a_id", "b_id"]
+
+
+def test_id_generico_solo_nunca_es_fk():
+    # 'id'/'Id'/'ID' a secas (sin stem) jamas se sugieren como FK.
+    profile = {
+        "n_rows": 500,
+        "key_candidates": [],
+        "columns": [
+            _col("id", distinct_count=500, unique_pct=1.0),
+            _col("Id", distinct_count=120, unique_pct=0.24),
+            _col("ID", distinct_count=80, unique_pct=0.16),
+        ],
+    }
+    out = suggest_intratable_fk_candidates(profile)
+    assert out == []
@@ -3,7 +3,7 @@ name: summarize_table_duckdb
 kind: function
 lang: py
 domain: datascience
-version: "1.0.0"
+version: "1.1.0"
 purity: impure
 signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict"
 description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)."
@@ -64,6 +64,7 @@ else:
 - **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima).
 - **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75.
 - **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`).
+- **`duplicate_rows`/`duplicate_pct` se pueblan push-down** (desde v1.1.0) con `count(*)` sobre `SELECT DISTINCT *` (sin traer filas a RAM): `duplicate_rows = n_rows - filas_distintas`, `duplicate_pct` en fraccion 0-1. Habilitan la dimension de unicidad de registro del score de dataset (`profile_table` paso 6). Si la tabla tiene tipos no comparables con `DISTINCT` (BLOB/LIST/MAP) la query degrada y ambas vuelven a `None` (renormaliza el score a solo `cell_quality`).
 - **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica).
 - **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage).
 - **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base.
@@ -196,6 +196,21 @@ def summarize_table_duckdb(
            sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
        )

+        # Unicidad de registro: filas duplicadas via COUNT de filas distintas
+        # push-down (DISTINCT *), sin traer filas a RAM. Habilita la dimension
+        # de uniqueness del score de dataset (1 - duplicate_pct). Degrada a None
+        # si la tabla tiene tipos no comparables con DISTINCT (BLOB/LIST/MAP).
+        duplicate_rows = None
+        duplicate_pct = None
+        if n_rows > 0:
+            dup_res = duckdb_query_readonly(
+                db_path, f"SELECT count(*) AS c FROM (SELECT DISTINCT * FROM {quoted})"
+            )
+            if dup_res["status"] == "ok" and dup_res["rows"]:
+                distinct_rows = int(dup_res["rows"][0]["c"])
+                duplicate_rows = max(0, n_rows - distinct_rows)
+                duplicate_pct = duplicate_rows / n_rows  # fraccion 0-1
+
        profile = {
            "table": table,
            "source": "duckdb",
@@ -203,8 +218,8 @@ def summarize_table_duckdb(
            "n_rows": n_rows,
            "n_cols": len(columns),
            "size_bytes": None,
-            "duplicate_rows": None,
-            "duplicate_pct": None,
+            "duplicate_rows": duplicate_rows,
+            "duplicate_pct": duplicate_pct,
            "constant_cols": constant_cols,
            "all_null_cols": all_null_cols,
            "null_cell_pct": null_cell_pct,
@@ -54,6 +54,30 @@ def test_shape_y_metadatos_tabla(db):
    assert profile["correlations"] is None


+def test_duplicate_pct_sin_duplicados(db):
+    """Tabla con todas las filas distintas: duplicate_pct = 0, no None."""
+    profile = summarize_table_duckdb(db, "ventas")["profile"]
+    assert profile["duplicate_rows"] == 0
+    assert profile["duplicate_pct"] == 0.0
+
+
+def test_duplicate_pct_con_duplicados(tmp_path):
+    """Filas repetidas: duplicate_rows/duplicate_pct se pueblan push-down."""
+    path = str(tmp_path / "dups.duckdb")
+    con = duckdb.connect(path)
+    con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)")
+    # 5 filas, 2 de ellas idénticas a otras -> 2 duplicadas sobre 5 = 0.4.
+    con.execute(
+        "INSERT INTO t VALUES "
+        "(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')"
+    )
+    con.close()
+    profile = summarize_table_duckdb(path, "t")["profile"]
+    assert profile["n_rows"] == 5
+    assert profile["duplicate_rows"] == 2
+    assert profile["duplicate_pct"] == 0.4
+
+
 def test_column_profile_shape(db):
    profile = summarize_table_duckdb(db, "ventas")["profile"]
    by_name = {c["name"]: c for c in profile["columns"]}
@@ -4,7 +4,7 @@ kind: pipeline
 lang: py
 domain: pipelines
 purity: impure
-version: "1.0.0"
+version: "1.1.0"
 signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict"
 description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla."
 tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries]
@@ -114,3 +114,12 @@ para auditar la calidad de una tabla ya productiva. Reemplaza orquestar a mano
  Formatos exoticos pueden descartarse silenciosamente del calculo numerico.
 - `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el
  sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red).
+- **Score de calidad (report 2046, desde v1.1.0).** Paso 5: cada columna recibe
+  `quality_score` de `column_quality_score` con la formula 60/40
+  (completeness/validity); al promocionar texto a numero/fecha se expone
+  `col["validity_rate"]` (parse rate de la muestra) para alimentar la dimension
+  validity. Paso 6: el score de dataset NO es la media simple — es
+  `100 * (0.85*cell_quality + 0.15*row_uniqueness)`, donde
+  `cell_quality = media(score_col/100)` y `row_uniqueness = 1 - duplicate_pct`.
+  Si `duplicate_pct` es `None` (backend sin calcularlo) el score se renormaliza
+  a solo `cell_quality`. Los outliers NO bajan el score (van a `observations`).
@@ -477,9 +477,18 @@ def profile_table(
                    if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
                        col["inferred_type"] = "numeric"
                        inferred = "numeric"
+                        # Tasa de parseo real de la muestra: alimenta la
+                        # dimension validity de column_quality_score (fraccion
+                        # de valores conformes al tipo numerico promovido).
+                        col["validity_rate"] = len(ok) / len(vals)
                elif semantic in _DATETIME_SEMANTIC:
                    col["inferred_type"] = "datetime"
                    inferred = "datetime"
+                    # Tasa de parseo de la muestra a fecha (mismo papel que el
+                    # parse rate numerico) para la dimension validity.
+                    parsed_dt = [_to_ordinal_days(v) for v in vals]
+                    ok_dt = [d for d in parsed_dt if d is not None]
+                    col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None

            # 4) Enriquecer segun el inferred_type final.
            if inferred == "numeric":
@@ -506,11 +515,36 @@ def profile_table(
            # 5) Score de calidad por columna.
            col["quality_score"] = column_quality_score(col).get("score")

-        # 6) Score agregado de la tabla (media de columnas).
+        # 6) Score agregado de la tabla (report 2046): NO media simple.
+        #   cell_quality   = media de los scores de columna, en [0,1].
+        #   row_uniqueness = 1 - duplicate_pct (unicidad de registro).
+        #   score = 100 * (0.85*cell_quality + 0.15*row_uniqueness).
+        # Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular.
        scores = [
            c["quality_score"] for c in cols if c.get("quality_score") is not None
        ]
-        prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None
+        if scores:
+            cell_quality = (sum(scores) / len(scores)) / 100.0
+            dup_pct = prof.get("duplicate_pct")
+            if dup_pct is not None:
+                try:
+                    d = float(dup_pct)
+                except (TypeError, ValueError):
+                    d = None
+            else:
+                d = None
+            if d is not None:
+                # Tolerar escala 0-100 por si algun backend la entrega asi.
+                if d > 1.0:
+                    d = d / 100.0
+                row_uniqueness = max(0.0, min(1.0, 1.0 - d))
+                prof["quality_score"] = round(
+                    100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1
+                )
+            else:
+                prof["quality_score"] = round(100.0 * cell_quality, 1)
+        else:
+            prof["quality_score"] = None

        # 7) Candidatos a clave.
        key_candidates = []
@@ -1,9 +1,10 @@
-"""render_automatic_eda — EDA completo one-shot: perfil → ctx → PDF + PPTX.
+"""render_automatic_eda — EDA completo one-shot: perfil → ctx → PDF + PPTX + MD.

 Pipeline impuro del grupo de capacidad `eda`. Dada UNA tabla DuckDB (o
-PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus dos formatos a la
-vez (PDF móvil A5 + PPTX 16:9) con los 11 capítulos POBLADOS, en una sola
-llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry:
+PostgreSQL), produce el informe AutomaticEDA COMPLETO en sus tres formatos a la
+vez (PDF móvil A5 + PPTX 16:9 + Markdown autocontenido para pegar a un LLM) con
+los capítulos POBLADOS, en una sola llamada. Compone, sin reimplementar su
+lógica, varias funciones del registry:

  - profile_table          : perfila la tabla end-to-end (TableProfile agregado),
                             opcionalmente con modelos baratos y análisis de serie.
@@ -12,8 +13,11 @@ llamada. Compone, sin reimplementar su lógica, cuatro funciones del registry:
                             modelos/geo, timeseries_raw para series, geo_points
                             para el mapa, db_path/table para la agregación
                             push-down). Sin él, esos capítulos degradan.
-  - render_automatic_eda_pdf  : renderiza el documento por capítulos a PDF.
-  - render_automatic_eda_pptx : renderiza el mismo documento a PPTX.
+  - render_automatic_eda_pdf      : renderiza el documento por capítulos a PDF.
+  - render_automatic_eda_pptx     : renderiza el mismo documento a PPTX.
+  - render_automatic_eda_markdown : serializa el mismo documento a Markdown
+                                    autocontenido (texto + tablas markdown, sin
+                                    binarios) para incorporar a un LLM.

 El TableProfile agregado basta para portada/overview/distribuciones/calidad/
 correlación, pero los capítulos `modelos`, `timeseries`, `geospatial` y
@@ -32,6 +36,7 @@ from datetime import datetime, timezone

 from datascience import (
    build_eda_render_ctx,
+    render_automatic_eda_markdown,
    render_automatic_eda_pdf,
    render_automatic_eda_pptx,
    run_eda_models,
@@ -93,6 +98,7 @@ def render_automatic_eda(
    out_dir: str = "reports",
    basename: str = None,
    ctx_extra: dict = None,
+    emit_md: bool = True,
 ) -> dict:
    """Perfila una tabla y emite el informe AutomaticEDA completo (PDF + PPTX).

@@ -140,13 +146,19 @@ def render_automatic_eda(
        ctx_extra: dict opcional con claves de presentación/contexto extra que se
            mezclan en el ctx (p.ej. dataset_name, description, source_origin).
            No pisan las claves de datos calculadas por build_eda_render_ctx.
+        emit_md: además del PDF y el PPTX, emite un Markdown autocontenido del
+            MISMO documento por capítulos (texto plano + tablas markdown, sin
+            binarios), pensado para pegar a un LLM. Default True. La ruta sale en
+            la clave de retorno ``aeda_md_path``. No altera las demás salidas.

    Returns:
        dict (nunca lanza). En éxito::

            {"status": "ok", "pdf_path": str, "pptx_path": str,
-             "manifest_path": str|None, "n_pages": int, "n_slides": int,
-             "pdf_note": str, "pptx_note": str, "profile": <TableProfile>}
+             "aeda_md_path": str|None, "manifest_path": str|None,
+             "n_pages": int, "n_slides": int, "md_chars": int|None,
+             "pdf_note": str, "pptx_note": str, "md_note": str|None,
+             "profile": <TableProfile>}

        En error: {"status": "error", "error": str}.
    """
@@ -243,15 +255,34 @@ def render_automatic_eda(
        rpdf = render_automatic_eda_pdf(prof, pdf_path, meta) or {}
        rpptx = render_automatic_eda_pptx(prof, pptx_path, meta) or {}

+        # Salida Markdown autocontenida (mismo documento por capítulos) para
+        # pegar a un LLM. Aditiva: no afecta a PDF/PPTX/manifest. dict-no-throw.
+        rmd = {}
+        md_path = None
+        if emit_md:
+            md_path = os.path.join(out_dir, base + ".md")
+            # El Markdown es la salida MÁS completa: además del documento por
+            # capítulos (compartido con PDF/PPTX) vuelca un apéndice con TODOS los
+            # datos numéricos del perfil (matriz de asociación completa, describe
+            # con skew/kurtosis/percentiles, re-expresiones, scores_by_k de
+            # KMeans, estadísticos de normalidad). Se le pasa el `prof` vía
+            # meta['profile']; un meta propio evita alterar el de PDF/PPTX.
+            md_meta = dict(meta)
+            md_meta["profile"] = prof
+            rmd = render_automatic_eda_markdown(prof, md_path, md_meta) or {}
+
        return {
            "status": "ok",
            "pdf_path": rpdf.get("path"),
            "pptx_path": rpptx.get("path"),
+            "aeda_md_path": rmd.get("path"),
            "manifest_path": rpdf.get("manifest_path"),
            "n_pages": rpdf.get("n_pages"),
            "n_slides": rpptx.get("n_slides"),
+            "md_chars": rmd.get("n_chars"),
            "pdf_note": rpdf.get("note"),
            "pptx_note": rpptx.get("note"),
+            "md_note": rmd.get("note"),
            "profile": prof,
        }
    except Exception as e:  # noqa: BLE001 — dict-no-throw: degradar, nunca lanzar.