merge(eda): MD del AutomaticEDA vuelca TODOS los datos del profile (28 pares, skew/kurtosis/percentiles, scores_by_k)

2026-06-30 20:31:50 +02:00
parent a1e2e3567c 7ec2bb1b45
commit e815f5b3b9
3 changed files with 556 additions and 5 deletions
@@ -0,0 +1,253 @@
+"""Tests for the Markdown completeness appendix (report 2053).
+
+The AutomaticEDA Markdown is the output meant to be *pasted into an LLM*, so it
+must carry EVERYTHING the engine computed — even the numbers the human-facing
+chapters (shared with the PDF/PPTX) drop for readability. ``render_md`` appends a
+full-data appendix built from ``meta['profile']`` that closes the six losses the
+evaluation found:
+
+1. the complete association matrix (every pair, incl. correlation_ratio /
+   cramers_v) — not just the top extremes;
+2. every numeric statistic for every numeric column (skew/kurtosis/percentiles);
+3. the concrete recommended re-expression;
+4. KMeans ``scores_by_k``;
+5. the normality test statistics;
+6. correct headers for bar/scree figure tables (not ``Desde/Hasta/Frecuencia``).
+
+Self-contained: a synthetic profile, no DuckDB, no heavy renderer.
+"""
+
+import os
+import sys
+
+import pytest  # noqa: F401
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))  # python/functions
+if _FUNCTIONS not in sys.path:
+    sys.path.insert(0, _FUNCTIONS)
+
+from datascience.automatic_eda import model  # noqa: E402
+from datascience.automatic_eda.render_md_impl import (  # noqa: E402
+    _bars_table,
+    _is_histogram_caption,
+    _profile_appendix,
+    render_md,
+)
+
+
+# --------------------------------------------------------------------------- #
+# Synthetic profile fixtures.
+# --------------------------------------------------------------------------- #
+def _numeric(skew, kurtosis):
+    """A numeric stat block with every key the appendix serializes."""
+    return {
+        "count": 100, "min": 0.0, "max": 10.0, "mean": 5.0, "median": 5.0,
+        "mode": 4.0, "std": 2.0, "variance": 4.0, "cv": 0.4,
+        "p1": 0.1, "p5": 0.5, "p25": 2.5, "p50": 5.0, "p75": 7.5,
+        "p95": 9.5, "p99": 9.9, "iqr": 5.0, "skew": skew, "kurtosis": kurtosis,
+        "n_outliers": 1, "distribution_type": "normal",
+    }
+
+
+def _profile():
+    """A small but structurally faithful TableProfile (3 numeric, 2 categorical)."""
+    pairs = [
+        {"a": "A", "b": "B", "a_type": "numeric", "b_type": "numeric",
+         "method": "pearson/spearman", "value": 0.8,
+         "p_value": 1e-9, "p_value_adjusted": 2e-9, "significant": True},
+        {"a": "A", "b": "C", "a_type": "numeric", "b_type": "numeric",
+         "method": "pearson/spearman", "value": -0.3,
+         "p_value": 0.01, "p_value_adjusted": 0.02, "significant": True},
+        {"a": "A", "b": "Cat1", "a_type": "numeric", "b_type": "categorical",
+         "method": "correlation_ratio", "value": 0.45,
+         "p_value": 0.001, "p_value_adjusted": 0.002, "significant": True},
+        # The single cat-cat pair the human chapter never shows.
+        {"a": "Cat1", "b": "Cat2", "a_type": "categorical",
+         "b_type": "categorical", "method": "cramers_v", "value": 0.11,
+         "p_value": 0.04, "p_value_adjusted": 0.05, "significant": False},
+    ]
+    return {
+        "correlations": {
+            "pairs": pairs,
+            "multiple_testing": {"method": "bh", "n_tests": 4, "n_rejected": 3},
+        },
+        "columns": [
+            {"name": "A", "count": 100, "numeric": _numeric(0.0, -1.2),
+             "reexpression": {"recommended": "none", "ladder_power": 1.0,
+                              "reason": "symmetric", "alternatives": []}},
+            {"name": "B", "count": 100, "numeric": _numeric(4.77, 33.1),
+             "reexpression": {"recommended": "log1p", "ladder_power": 0.0,
+                              "reason": "skew 4.77 with zeros",
+                              "alternatives": [{"transform": "yeo-johnson"},
+                                               {"transform": "sqrt"}]}},
+            {"name": "C", "count": 100, "numeric": _numeric(-0.6, 0.2)},
+            {"name": "Cat1", "categorical": {"top": [], "mode": "x"}},
+            {"name": "Cat2", "categorical": {"top": [], "mode": "y"}},
+        ],
+        "models": {
+            "kmeans": {
+                "best_k": 3,
+                "scores_by_k": [
+                    {"k": 2, "silhouette": 0.46, "inertia": 900.0},
+                    {"k": 3, "silhouette": 0.50, "inertia": 550.0},
+                    {"k": 4, "silhouette": 0.38, "inertia": 430.0},
+                ],
+                "cluster_sizes": [40, 35, 25],
+            },
+            "normality": {
+                "A": {"n": 100,
+                      "jarque_bera": {"stat": 18.7, "p": 8e-5, "normal": False},
+                      "dagostino": {"stat": 18.1, "p": 1e-4, "normal": False},
+                      "shapiro": {"stat": 0.98, "p": 7e-8, "normal": False},
+                      "is_normal": False},
+                "C": {"n": 100,
+                      "jarque_bera": {"stat": 2.1, "p": 0.35, "normal": True},
+                      "dagostino": {"stat": 1.9, "p": 0.38, "normal": True},
+                      "shapiro": {"stat": 0.99, "p": 0.12, "normal": True},
+                      "is_normal": True},
+            },
+        },
+    }
+
+
+def _dummy_chapters():
+    """A minimal one-chapter document so render_md does not early-return empty."""
+    return model.as_chapters([
+        {"id": "intro", "title": "Intro",
+         "blocks": [{"kind": "markdown", "text": "cuerpo del informe"}]},
+    ])
+
+
+def _render(tmp_path, profile):
+    out = os.path.join(str(tmp_path), "out.md")
+    res = render_md(_dummy_chapters(), out, {"title": "EDA — t", "profile": profile})
+    assert res["path"] == out
+    return open(out, encoding="utf-8").read()
+
+
+def _table_rows(md, section_title):
+    """Count data rows of the first Markdown table under ``section_title``."""
+    seg = md.split(section_title, 1)[1]
+    rows, in_t, seen_sep = 0, False, False
+    for ln in seg.splitlines():
+        if ln.startswith("|"):
+            in_t = True
+            stripped = ln.replace("|", "").replace(" ", "")
+            if stripped and set(stripped) == {"-"}:
+                seen_sep = True
+                continue
+            if seen_sep:
+                rows += 1
+        elif in_t and not ln.strip():
+            break
+    return rows
+
+
+# --------------------------------------------------------------------------- #
+# Golden: every datum the profile holds reaches the .md.
+# --------------------------------------------------------------------------- #
+def test_appendix_lists_all_correlation_pairs(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "## Apéndice — Datos completos del perfil" in md
+    # All 4 pairs (the real titanic profile has 28; here 4 synthetic).
+    assert _table_rows(md, "### Matriz de asociación") == 4
+    # The cat-cat Cramér's V pair the human chapter drops is present.
+    assert "Cat1 ↔ Cat2" in md
+    assert "cramers_v" in md
+    assert "correlation_ratio" in md
+
+
+def test_appendix_has_skew_kurtosis_for_every_numeric(tmp_path):
+    md = _render(tmp_path, _profile())
+    seg = md.split("### Estadísticos numéricos completos", 1)[1].split("###", 1)[0]
+    lines = [l for l in seg.splitlines() if l.startswith("|")]
+    header = [h.strip() for h in lines[0].strip("|").split("|")]
+    assert "skew" in header and "kurtosis" in header
+    ski, kui = header.index("skew"), header.index("kurtosis")
+    data = lines[2:]  # skip header + separator
+    assert len(data) == 3  # exactly the 3 numeric columns
+    for row in data:
+        cells = [c.strip() for c in row.strip("|").split("|")]
+        assert cells[ski] != "", f"missing skew in {cells[0]}"
+        assert cells[kui] != "", f"missing kurtosis in {cells[0]}"
+
+
+def test_appendix_has_extended_percentiles(tmp_path):
+    md = _render(tmp_path, _profile())
+    seg = md.split("### Estadísticos numéricos completos", 1)[1]
+    header = [h.strip() for h in seg.splitlines()[2].strip("|").split("|")]
+    for p in ("p1", "p5", "p25", "p75", "p95", "p99"):
+        assert p in header, f"percentile {p} missing from describe header"
+
+
+def test_appendix_names_concrete_reexpression(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "### Re-expresión recomendada" in md
+    assert "log1p" in md  # the concrete transform, not just "consider re-expressing"
+    assert "yeo-johnson" in md  # alternatives listed too
+
+
+def test_appendix_has_kmeans_scores_by_k(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "scores_by_k" in md
+    assert _table_rows(md, "#### KMeans — selección de k") == 3  # k=2,3,4
+
+
+def test_appendix_has_normality_statistics(tmp_path):
+    md = _render(tmp_path, _profile())
+    assert "JB stat" in md  # the statistic, not only the p-value
+    assert "Shapiro stat" in md
+    assert _table_rows(md, "#### Tests de normalidad") == 2  # cols A and C
+
+
+# --------------------------------------------------------------------------- #
+# Edge: a profile missing models / correlations degrades, never raises.
+# --------------------------------------------------------------------------- #
+def test_lite_profile_without_models(tmp_path):
+    prof = _profile()
+    prof.pop("models")  # lite: no KMeans/normality
+    md = _render(tmp_path, prof)
+    assert "scores_by_k" not in md  # section skipped
+    assert "Matriz de asociación" in md  # correlations still dumped
+    assert "## Apéndice" in md
+
+
+def test_profile_without_correlations(tmp_path):
+    prof = _profile()
+    prof.pop("correlations")
+    md = _render(tmp_path, prof)  # must not raise
+    assert "Matriz de asociación" not in md
+    assert "Estadísticos numéricos completos" in md  # numeric section still there
+
+
+def test_no_profile_means_no_appendix(tmp_path):
+    out = os.path.join(str(tmp_path), "noprof.md")
+    res = render_md(_dummy_chapters(), out, {"title": "x"})
+    assert res["path"] == out
+    assert "## Apéndice" not in open(out, encoding="utf-8").read()
+
+
+def test_appendix_helper_is_defensive():
+    assert _profile_appendix(None) == ""
+    assert _profile_appendix({}) == ""
+    assert _profile_appendix({"columns": []}) == ""
+
+
+# --------------------------------------------------------------------------- #
+# Loss #6: bar/scree figure tables get a non-misleading header.
+# --------------------------------------------------------------------------- #
+def test_histogram_caption_detection():
+    assert _is_histogram_caption("Histograma de Age")
+    assert _is_histogram_caption("Distribución de Fare")
+    assert not _is_histogram_caption("Media de Survived por Sex")
+    assert not _is_histogram_caption("Varianza explicada (scree PCA)")
+
+
+def test_bars_table_custom_header():
+    bars = [(0.0, 1.0, 5.0), (1.0, 2.0, 3.0)]
+    hist = _bars_table(bars)  # default histogram header
+    assert "| Desde | Hasta | Frecuencia |" in hist
+    bar = _bars_table(bars, ("Inicio", "Fin", "Valor"))
+    assert "| Inicio | Fin | Valor |" in bar
+    assert "Frecuencia" not in bar
@@ -178,9 +178,17 @@ def _md_data_table(block) -> str:
    return "\n".join(lines)


-def _bars_table(bars: list) -> str:
-    """Render extracted bar/histogram data as a Markdown table (Desde/Hasta/Frec)."""
-    lines = ["| Desde | Hasta | Frecuencia |", "| --- | --- | --- |"]
+def _bars_table(bars: list, header: tuple = ("Desde", "Hasta", "Frecuencia")) -> str:
+    """Render extracted bar/histogram data as a Markdown table.
+
+    ``header`` is the 3-column header to use. Histogram bars are
+    ``(Desde, Hasta, Frecuencia)``; bar/scree charts (means by group, PCA
+    explained variance) are *not* bins, so the caller passes a semantically
+    correct header (e.g. ``(Inicio, Fin, Valor)``) to avoid the misleading
+    "Frecuencia" label — see report 2053, loss #6.
+    """
+    h0, h1, h2 = header
+    lines = [f"| {h0} | {h1} | {h2} |", "| --- | --- | --- |"]
    shown = bars[:_MAX_BAR_ROWS]
    for x0, x1, h in shown:
        lines.append(f"| {_fmt_num(x0)} | {_fmt_num(x1)} | {_fmt_num(h)} |")
@@ -191,6 +199,18 @@ def _bars_table(bars: list) -> str:
    return out


+def _is_histogram_caption(caption: str) -> bool:
+    """True when a figure caption describes a histogram (genuine numeric bins).
+
+    Histograms are the only figures whose bars are real ``[Desde, Hasta)`` bins
+    with a frequency count. Bar charts (means by group) and the PCA scree plot
+    carry per-category / per-component values, not bins — they must not inherit
+    the ``Desde/Hasta/Frecuencia`` header.
+    """
+    c = (caption or "").lower()
+    return "histograma" in c or "distribución" in c or "distribucion" in c
+
+
 def _extract_bars(fig) -> list:
    """Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig.

@@ -253,7 +273,13 @@ def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
        if fig is not None:
            bars = _extract_bars(fig)
            if bars:
-                parts.append(_bars_table(bars))
+                # A histogram's bars are genuine numeric bins (Desde/Hasta/
+                # Frecuencia). Bar charts and the PCA scree plot are not bins —
+                # give them a header that does not lie about "Frecuencia".
+                header = (("Desde", "Hasta", "Frecuencia")
+                          if _is_histogram_caption(caption)
+                          else ("Inicio", "Fin", "Valor"))
+                parts.append(_bars_table(bars, header))
            if meta.get("embed_figures"):
                png = _embed_png(fig, out_path, counter)
                if png:
@@ -354,6 +380,258 @@ def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
    return _md_note(model.Note(text=model._safe_str(block)))


+# --------------------------------------------------------------------------- #
+# Profile appendix — the data the human-facing chapters drop.
+#
+# The chapter document (shared with the PDF/PPTX renderers) is designed for human
+# reading and intentionally omits raw numbers: the correlation matrix shows only
+# the top extremes, the numeric blocks skip skew/kurtosis/extended percentiles,
+# the model chapter does not list ``scores_by_k`` or the normality test
+# statistics. But the Markdown is meant to be *pasted into an LLM*, so it should
+# carry EVERYTHING the engine computed. This appendix serializes the full
+# ``profile`` (passed via ``meta['profile']``) as Markdown tables, additively:
+# the PDF/PPTX are untouched, the .md simply has more than they do. Each section
+# is emitted only when its source data is present, so a ``lite`` profile (no
+# models) or a profile without correlations degrades cleanly instead of raising.
+# See report 2053 for the six losses this closes.
+# --------------------------------------------------------------------------- #
+def _pair_types(a_type, b_type) -> str:
+    """Short ``num↔cat`` label for an association pair's variable types."""
+    def short(t):
+        t = model._safe_str(t).lower()
+        if t.startswith("num"):
+            return "num"
+        if t.startswith("cat"):
+            return "cat"
+        return t or "?"
+    return f"{short(a_type)}↔{short(b_type)}"
+
+
+def _app_correlations(corr: dict) -> str:
+    """Loss #1 — every association pair (not just the top extremes).
+
+    Dumps all of ``correlations['pairs']`` as a table (pair · types · method ·
+    value · p · p-FDR · significant), ordered by |value| desc so the strongest
+    associations lead while nothing is cut; a missing, unparsable or NaN value
+    sorts at the end. Includes the ``correlation_ratio`` (num↔cat) and
+    ``cramers_v`` (cat↔cat) pairs the human chapter never shows.
+    """
+    pairs = list(corr.get("pairs", []) or [])
+    if not pairs:
+        return ""
+    def keyfn(p):
+        try:
+            key = -abs(float(p.get("value")))
+        except Exception:  # noqa: BLE001
+            return 0.0
+        return 0.0 if key != key else key  # NaN is unordered: rank it last
+    pairs_sorted = sorted(pairs, key=keyfn)
+    lines = ["### Matriz de asociación — todos los pares",
+             "",
+             ("| Par | Tipos | Método | Valor | p-value | p-ajustado (FDR) "
+              "| ¿Sig? |"),
+             "| --- | --- | --- | --- | --- | --- | --- |"]
+    for p in pairs_sorted:
+        par = f"{_cell(p.get('a'))} ↔ {_cell(p.get('b'))}"
+        types = _pair_types(p.get("a_type"), p.get("b_type"))
+        method = _cell(p.get("method"))
+        val = _fmt_num(p.get("value"))
+        pv = _fmt_num(p.get("p_value")) if p.get("p_value") is not None else ""
+        padj = (_fmt_num(p.get("p_value_adjusted"))
+                if p.get("p_value_adjusted") is not None else "")
+        sig = "sí" if p.get("significant") else "no"
+        lines.append(
+            f"| {par} | {types} | {method} | {val} | {pv} | {padj} | {sig} |")
+    mt = corr.get("multiple_testing") or {}
+    n_tests = mt.get("n_tests", corr.get("n_tests"))
+    n_rej = mt.get("n_rejected")
+    note_bits = [f"{len(pairs)} pares en total"]
+    if n_tests is not None and n_rej is not None:
+        note_bits.append(
+            f"{n_rej} de {n_tests} significativos tras corrección "
+            f"{model._safe_str(mt.get('method', 'FDR')).upper()}")
+    lines += ["", f"*{'; '.join(note_bits)}.*"]
+    return "\n".join(lines)
+
+
+# Numeric statistics, in serialization order: (profile key, column header).
+_NUM_STATS = [
+    ("count", "n"), ("mean", "mean"), ("median", "median"), ("mode", "mode"),
+    ("std", "std"), ("variance", "variance"), ("cv", "cv"),
+    ("skew", "skew"), ("kurtosis", "kurtosis"),
+    ("min", "min"), ("p1", "p1"), ("p5", "p5"), ("p25", "p25"), ("p50", "p50"),
+    ("p75", "p75"), ("p95", "p95"), ("p99", "p99"), ("iqr", "iqr"),
+    ("max", "max"), ("n_outliers", "outliers"),
+    ("distribution_type", "distribución"),
+]
+
+
+def _app_numeric_describe(columns: list) -> str:
+    """Loss #2 — every numeric statistic for every numeric column.
+
+    One row per numeric column with the full describe: mean/median/mode/std/
+    variance/cv, skew & kurtosis (for ALL columns, not only the skewed ones),
+    p1/p5/p25/p50/p75/p95/p99, iqr, min/max, outliers and distribution_type.
+    """
+    rows = []
+    for info in (columns or []):
+        num = info.get("numeric") if isinstance(info, dict) else None
+        if not num:
+            continue
+        name = _cell(info.get("name"))
+        cells = [name]
+        for key, _hdr in _NUM_STATS:
+            v = num.get(key)
+            if key == "count" and "count" not in num:
+                v = info.get("count")  # fall back to the column-level count
+            if key == "distribution_type":
+                cells.append(_cell(v))
+            else:
+                cells.append(_fmt_num(v) if v is not None else "")
+        rows.append(cells)
+    if not rows:
+        return ""
+    header = ["Columna"] + [hdr for _k, hdr in _NUM_STATS]
+    lines = ["### Estadísticos numéricos completos (describe)",
+             "",
+             "| " + " | ".join(header) + " |",
+             "| " + " | ".join(["---"] * len(header)) + " |"]
+    for cells in rows:
+        lines.append("| " + " | ".join(cells) + " |")
+    return "\n".join(lines)
+
+
+def _app_reexpression(columns: list) -> str:
+    """Loss #3 — the concrete recommended re-expression per column.
+
+    Names the transform (log1p/sqrt/yeo-johnson/none) instead of a vague
+    "consider re-expressing", with the ladder power, reason and alternatives.
+    """
+    rows = []
+    for info in (columns or []):
+        rx = info.get("reexpression") if isinstance(info, dict) else None
+        if not rx or not isinstance(rx, dict):
+            continue
+        rec = model._safe_str(rx.get("recommended")).strip()
+        if not rec:
+            continue
+        alts = rx.get("alternatives") or []
+        alt_txt = ", ".join(
+            model._safe_str(a.get("transform")) for a in alts
+            if isinstance(a, dict) and a.get("transform")) or "—"
+        rows.append([
+            _cell(info.get("name")), _cell(rec),
+            _fmt_num(rx.get("ladder_power")) if rx.get("ladder_power") is not None else "",
+            _cell(rx.get("reason")), _cell(alt_txt),
+        ])
+    if not rows:
+        return ""
+    lines = ["### Re-expresión recomendada (escalera de Tukey)",
+             "",
+             "| Columna | Recomendada | Potencia | Razón | Alternativas |",
+             "| --- | --- | --- | --- | --- |"]
+    for r in rows:
+        lines.append("| " + " | ".join(r) + " |")
+    return "\n".join(lines)
+
+
+def _app_kmeans_scores(kmeans: dict) -> str:
+    """Loss #4 — KMeans silhouette + inertia per k (justifies the chosen k)."""
+    # Filter first: a list with no dict entry must yield "", not a bare header.
+    scores = [s for s in (kmeans.get("scores_by_k") or [])
+              if isinstance(s, dict)]
+    if not scores:
+        return ""
+    best_k = kmeans.get("best_k")
+    lines = ["#### KMeans — selección de k (`scores_by_k`)",
+             "",
+             "| k | Silhouette | Inercia | Elegido |",
+             "| --- | --- | --- | --- |"]
+    for s in scores:
+        k = s.get("k")
+        chosen = "✓" if best_k is not None and k == best_k else ""
+        lines.append(
+            f"| {_fmt_num(k)} | {_fmt_num(s.get('silhouette'))} "
+            f"| {_fmt_num(s.get('inertia'))} | {chosen} |")
+    return "\n".join(lines)
+
+
+def _app_normality(normality: dict) -> str:
+    """Loss #5 — each normality test's statistic next to its p-value."""
+    if not isinstance(normality, dict) or not normality:
+        return ""
+    lines = ["#### Tests de normalidad (estadístico + p-value)",
+             "",
+             ("| Columna | n | JB stat | JB p | D'Agostino stat | D'Agostino p "
+              "| Shapiro stat | Shapiro p | ¿Normal? |"),
+             "| --- | --- | --- | --- | --- | --- | --- | --- | --- |"]
+    any_row = False
+    for col, res in normality.items():
+        if not isinstance(res, dict):
+            continue
+        jb = res.get("jarque_bera") or {}
+        da = res.get("dagostino") or {}
+        sh = res.get("shapiro") or {}
+        is_norm = "sí" if res.get("is_normal") else "no"
+        lines.append(
+            f"| {_cell(col)} | {_fmt_num(res.get('n')) if res.get('n') is not None else ''} "
+            f"| {_fmt_num(jb.get('stat'))} | {_fmt_num(jb.get('p'))} "
+            f"| {_fmt_num(da.get('stat'))} | {_fmt_num(da.get('p'))} "
+            f"| {_fmt_num(sh.get('stat'))} | {_fmt_num(sh.get('p'))} | {is_norm} |")
+        any_row = True
+    return "\n".join(lines) if any_row else ""
+
+
+def _profile_appendix(profile: dict) -> str:
+    """Build the full-data appendix from a TableProfile dict (additive).
+
+    Returns a Markdown ``## Apéndice`` section with one sub-table per loss the
+    human chapters drop, or ``""`` when the profile carries none of them. Never
+    raises: a missing/oddly-shaped section is skipped, not fatal.
+    """
+    if not isinstance(profile, dict):
+        return ""
+    sections: list = []
+    try:
+        corr = profile.get("correlations") or {}
+        seg = _app_correlations(corr) if isinstance(corr, dict) else ""
+        if seg:
+            sections.append(seg)
+    except Exception:  # noqa: BLE001
+        pass
+    try:
+        columns = profile.get("columns") or []
+        seg = _app_numeric_describe(columns)
+        if seg:
+            sections.append(seg)
+        seg = _app_reexpression(columns)
+        if seg:
+            sections.append(seg)
+    except Exception:  # noqa: BLE001
+        pass
+    try:
+        models = profile.get("models") or {}
+        if isinstance(models, dict):
+            model_segs = []
+            seg = _app_kmeans_scores(models.get("kmeans") or {})
+            if seg:
+                model_segs.append(seg)
+            seg = _app_normality(models.get("normality") or {})
+            if seg:
+                model_segs.append(seg)
+            if model_segs:
+                sections.append(
+                    "### Modelos — detalle\n\n" + "\n\n".join(model_segs))
+    except Exception:  # noqa: BLE001
+        pass
+    if not sections:
+        return ""
+    intro = ("Volcado completo de los datos que el motor computó y que los "
+             "capítulos (pensados para lectura humana / PDF) resumen. "
+             "Pensado para que un LLM reconstruya el análisis entero.")
+    return ("## Apéndice — Datos completos del perfil\n\n"
+            f"*{intro}*\n\n" + "\n\n".join(sections))
+
+
 # --------------------------------------------------------------------------- #
 # Entry point.
 # --------------------------------------------------------------------------- #
@@ -437,6 +715,18 @@ def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
                segments.append(seg)
        chapters_meta.append({"id": ch.id, "version": ch.version})

+    # Full-data appendix: dump everything the profile holds that the human
+    # chapters drop (additive — the .md ends up with more than the PDF/PPTX).
+    # Emitted only when a profile is supplied via meta['profile']; never fatal.
+    try:
+        appendix = _profile_appendix(meta.get("profile"))
+    except Exception as e:  # noqa: BLE001
+        appendix = ""
+        notes.append(f"apéndice de perfil omitido: {e}")
+    if appendix:
+        segments.append("---")
+        segments.append(appendix)
+
    content = "\n\n".join(segments) + "\n"
    note = f"{len(content)} caracteres"
    if notes:
@@ -261,7 +261,15 @@ def render_automatic_eda(
        md_path = None
        if emit_md:
            md_path = os.path.join(out_dir, base + ".md")
-            rmd = render_automatic_eda_markdown(prof, md_path, meta) or {}
+            # El Markdown es la salida MÁS completa: además del documento por
+            # capítulos (compartido con PDF/PPTX) vuelca un apéndice con TODOS los
+            # datos numéricos del perfil (matriz de asociación completa, describe
+            # con skew/kurtosis/percentiles, re-expresiones, scores_by_k de
+            # KMeans, estadísticos de normalidad). Se le pasa el `prof` vía
+            # meta['profile']; un meta propio evita alterar el de PDF/PPTX.
+            md_meta = dict(meta)
+            md_meta["profile"] = prof
+            rmd = render_automatic_eda_markdown(prof, md_path, md_meta) or {}

        return {
            "status": "ok",