feat(eda): el Markdown del AutomaticEDA vuelca TODOS los datos del profile
El .md del grupo `eda` es la salida pensada para pegar a un LLM, así que debe contener todo lo que el motor computó, aunque el PDF/PPTX (vista humana) resuman. La evaluación 2053 detectó 6 datos que el .md perdía respecto al profile. Se cierran de forma aditiva (el .md tiene MÁS que el PDF/PPTX, sin tocar esos renderers ni los capítulos). render_automatic_eda.py pasa el profile al serializador Markdown vía meta['profile'] (un meta propio del MD; el de PDF/PPTX queda intacto). render_md_impl.py añade un "Apéndice — Datos completos del perfil" al final del documento, emitido solo cuando hay profile y degradando limpio cuando falta una sección (lite sin modelos, profile sin correlaciones). El apéndice no se acopla a los ids de capítulo (que editan otros agentes en paralelo). Pérdidas cerradas: 1. Matriz de asociación COMPLETA: los N pares de correlations.pairs (no solo el top-17), incluidos correlation_ratio (num↔cat) y cramers_v (cat↔cat). 2. Numéricas: describe completo por columna — mean/median/mode/std/variance/cv, skew y kurtosis para TODAS (no solo las asimétricas), p1/p5/p25/p50/p75/p95/ p99, iqr, min/max, outliers, distribution_type. 3. Re-expresión: nombra la transformación concreta (log1p/sqrt/yeo-johnson) con potencia, razón y alternativas, no un vago "considerar re-expresión". 4. KMeans: tabla scores_by_k (silhouette + inercia por k) marcando el k elegido. 5. Normalidad: el estadístico (stat) de cada test junto al p-value. 6. Encabezados de figuras de barras/scree dejan de heredar "Desde/Hasta/Frecuencia" del histograma; usan "Inicio/Fin/Valor" cuando el caption no es un histograma. Test nuevo md_completeness_test.py: profile sintético, asserta los N pares de correlación, skew/kurtosis de cada numérica, percentiles extendidos, log1p, scores_by_k, stat de normalidad, headers de barras y los edges (sin modelos / sin correlaciones / sin profile, defensivo). Verificado con titanic (profile_level=full): 28 pares en la tabla (incl. 
Sex↔Embarked cramers_v), 7 numéricas con skew+kurtosis, p5/p95/p99, scores_by_k y JB/D'Agostino/Shapiro stat presentes. PDF/PPTX/manifest siguen saliendo. Suite automatic_eda + render_automatic_eda_test: 134 passed. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,253 @@
|
|||||||
|
"""Tests for the Markdown completeness appendix (report 2053).
|
||||||
|
|
||||||
|
The AutomaticEDA Markdown is the output meant to be *pasted into an LLM*, so it
|
||||||
|
must carry EVERYTHING the engine computed — even the numbers the human-facing
|
||||||
|
chapters (shared with the PDF/PPTX) drop for readability. ``render_md`` appends a
|
||||||
|
full-data appendix built from ``meta['profile']`` that closes the six losses the
|
||||||
|
evaluation found:
|
||||||
|
|
||||||
|
1. the complete association matrix (every pair, incl. correlation_ratio /
|
||||||
|
cramers_v) — not just the top extremes;
|
||||||
|
2. every numeric statistic for every numeric column (skew/kurtosis/percentiles);
|
||||||
|
3. the concrete recommended re-expression;
|
||||||
|
4. KMeans ``scores_by_k``;
|
||||||
|
5. the normality test statistics;
|
||||||
|
6. correct headers for bar/scree figure tables (not ``Desde/Hasta/Frecuencia``).
|
||||||
|
|
||||||
|
Self-contained: a synthetic profile, no DuckDB, no heavy renderer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pytest # noqa: F401
|
||||||
|
|
||||||
|
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) # python/functions
|
||||||
|
if _FUNCTIONS not in sys.path:
|
||||||
|
sys.path.insert(0, _FUNCTIONS)
|
||||||
|
|
||||||
|
from datascience.automatic_eda import model # noqa: E402
|
||||||
|
from datascience.automatic_eda.render_md_impl import ( # noqa: E402
|
||||||
|
_bars_table,
|
||||||
|
_is_histogram_caption,
|
||||||
|
_profile_appendix,
|
||||||
|
render_md,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Synthetic profile fixtures.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _numeric(skew, kurtosis):
|
||||||
|
"""A numeric stat block with every key the appendix serializes."""
|
||||||
|
return {
|
||||||
|
"count": 100, "min": 0.0, "max": 10.0, "mean": 5.0, "median": 5.0,
|
||||||
|
"mode": 4.0, "std": 2.0, "variance": 4.0, "cv": 0.4,
|
||||||
|
"p1": 0.1, "p5": 0.5, "p25": 2.5, "p50": 5.0, "p75": 7.5,
|
||||||
|
"p95": 9.5, "p99": 9.9, "iqr": 5.0, "skew": skew, "kurtosis": kurtosis,
|
||||||
|
"n_outliers": 1, "distribution_type": "normal",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _profile():
    """Return a small, structurally faithful TableProfile fixture.

    It holds three numeric columns (A, B, C), two categorical ones (Cat1,
    Cat2), four association pairs covering every method, and a ``models``
    section with KMeans scores plus normality tests for A and C.
    """
    def pair(a, b, a_type, b_type, method, value, p, p_adj, significant):
        return {
            "a": a, "b": b, "a_type": a_type, "b_type": b_type,
            "method": method, "value": value,
            "p_value": p, "p_value_adjusted": p_adj,
            "significant": significant,
        }

    def normality(jb, da, sh, is_normal):
        # In this fixture every individual test agrees with the overall verdict.
        def one(stat, p):
            return {"stat": stat, "p": p, "normal": is_normal}

        return {
            "n": 100,
            "jarque_bera": one(*jb),
            "dagostino": one(*da),
            "shapiro": one(*sh),
            "is_normal": is_normal,
        }

    correlations = {
        "pairs": [
            pair("A", "B", "numeric", "numeric",
                 "pearson/spearman", 0.8, 1e-9, 2e-9, True),
            pair("A", "C", "numeric", "numeric",
                 "pearson/spearman", -0.3, 0.01, 0.02, True),
            pair("A", "Cat1", "numeric", "categorical",
                 "correlation_ratio", 0.45, 0.001, 0.002, True),
            # The single cat-cat pair the human chapter never shows.
            pair("Cat1", "Cat2", "categorical", "categorical",
                 "cramers_v", 0.11, 0.04, 0.05, False),
        ],
        "multiple_testing": {"method": "bh", "n_tests": 4, "n_rejected": 3},
    }

    columns = [
        {
            "name": "A", "count": 100, "numeric": _numeric(0.0, -1.2),
            "reexpression": {
                "recommended": "none", "ladder_power": 1.0,
                "reason": "symmetric", "alternatives": [],
            },
        },
        {
            "name": "B", "count": 100, "numeric": _numeric(4.77, 33.1),
            "reexpression": {
                "recommended": "log1p", "ladder_power": 0.0,
                "reason": "skew 4.77 with zeros",
                "alternatives": [{"transform": "yeo-johnson"},
                                 {"transform": "sqrt"}],
            },
        },
        {"name": "C", "count": 100, "numeric": _numeric(-0.6, 0.2)},
        {"name": "Cat1", "categorical": {"top": [], "mode": "x"}},
        {"name": "Cat2", "categorical": {"top": [], "mode": "y"}},
    ]

    models = {
        "kmeans": {
            "best_k": 3,
            "scores_by_k": [
                {"k": 2, "silhouette": 0.46, "inertia": 900.0},
                {"k": 3, "silhouette": 0.50, "inertia": 550.0},
                {"k": 4, "silhouette": 0.38, "inertia": 430.0},
            ],
            "cluster_sizes": [40, 35, 25],
        },
        "normality": {
            "A": normality((18.7, 8e-5), (18.1, 1e-4), (0.98, 7e-8), False),
            "C": normality((2.1, 0.35), (1.9, 0.38), (0.99, 0.12), True),
        },
    }

    return {"correlations": correlations, "columns": columns, "models": models}
|
||||||
|
|
||||||
|
|
||||||
|
def _dummy_chapters():
    """Return a minimal one-chapter document so ``render_md`` has content to emit."""
    body = {"kind": "markdown", "text": "cuerpo del informe"}
    chapter = {"id": "intro", "title": "Intro", "blocks": [body]}
    return model.as_chapters([chapter])
|
||||||
|
|
||||||
|
|
||||||
|
def _render(tmp_path, profile):
    """Render the dummy document with ``profile`` and return the Markdown text.

    Writes ``out.md`` under ``tmp_path`` through ``render_md``, checks that the
    reported path is the requested one, and reads the file back as UTF-8.
    """
    out = os.path.join(str(tmp_path), "out.md")
    res = render_md(_dummy_chapters(), out, {"title": "EDA — t", "profile": profile})
    assert res["path"] == out
    # Use a context manager so the handle is closed deterministically instead
    # of being left to the garbage collector (ResourceWarning under pytest).
    with open(out, encoding="utf-8") as fh:
        return fh.read()
|
||||||
|
|
||||||
|
|
||||||
|
def _table_rows(md, section_title):
|
||||||
|
"""Count data rows of the first Markdown table under ``section_title``."""
|
||||||
|
seg = md.split(section_title, 1)[1]
|
||||||
|
rows, in_t, seen_sep = 0, False, False
|
||||||
|
for ln in seg.splitlines():
|
||||||
|
if ln.startswith("|"):
|
||||||
|
in_t = True
|
||||||
|
stripped = ln.replace("|", "").replace(" ", "")
|
||||||
|
if stripped and set(stripped) == {"-"}:
|
||||||
|
seen_sep = True
|
||||||
|
continue
|
||||||
|
if seen_sep:
|
||||||
|
rows += 1
|
||||||
|
elif in_t and not ln.strip():
|
||||||
|
break
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Golden: every datum the profile holds reaches the .md.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_appendix_lists_all_correlation_pairs(tmp_path):
    """The appendix lists every synthetic pair, including the cat-cat one."""
    rendered = _render(tmp_path, _profile())
    assert "## Apéndice — Datos completos del perfil" in rendered
    # The real titanic profile has 28 pairs; the synthetic one has 4.
    assert _table_rows(rendered, "### Matriz de asociación") == 4
    # The Cramér's V pair and the num-cat method the human chapter drops.
    for needle in ("Cat1 ↔ Cat2", "cramers_v", "correlation_ratio"):
        assert needle in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_appendix_has_skew_kurtosis_for_every_numeric(tmp_path):
    """Every numeric column row carries a non-empty skew and kurtosis cell."""
    rendered = _render(tmp_path, _profile())
    section = rendered.split(
        "### Estadísticos numéricos completos", 1)[1].split("###", 1)[0]
    table = [line for line in section.splitlines() if line.startswith("|")]

    def split_row(line):
        return [cell.strip() for cell in line.strip("|").split("|")]

    header = split_row(table[0])
    assert "skew" in header and "kurtosis" in header
    skew_at = header.index("skew")
    kurt_at = header.index("kurtosis")

    body = table[2:]  # skip header + separator
    assert len(body) == 3  # exactly the 3 numeric columns
    for line in body:
        cells = split_row(line)
        assert cells[skew_at] != "", f"missing skew in {cells[0]}"
        assert cells[kurt_at] != "", f"missing kurtosis in {cells[0]}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_appendix_has_extended_percentiles(tmp_path):
    """The describe table header exposes the full percentile ladder."""
    md = _render(tmp_path, _profile())
    seg = md.split("### Estadísticos numéricos completos", 1)[1]
    # Locate the header as the first table line instead of a fixed line index,
    # so the test does not depend on how many blank lines follow the heading.
    header_line = next(ln for ln in seg.splitlines() if ln.startswith("|"))
    header = [h.strip() for h in header_line.strip("|").split("|")]
    for p in ("p1", "p5", "p25", "p50", "p75", "p95", "p99"):
        assert p in header, f"percentile {p} missing from describe header"
|
||||||
|
|
||||||
|
|
||||||
|
def test_appendix_names_concrete_reexpression(tmp_path):
    """The appendix names the concrete transform and its alternatives."""
    rendered = _render(tmp_path, _profile())
    expected = (
        "### Re-expresión recomendada",
        "log1p",        # the concrete transform, not just "consider re-expressing"
        "yeo-johnson",  # alternatives are listed too
    )
    for needle in expected:
        assert needle in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_appendix_has_kmeans_scores_by_k(tmp_path):
    """The KMeans table lists every k and marks exactly one as chosen."""
    md = _render(tmp_path, _profile())
    assert "scores_by_k" in md
    assert _table_rows(md, "#### KMeans — selección de k") == 3  # k=2,3,4
    # The table must also flag the selected k (best_k == 3 in the fixture);
    # restrict the check to the KMeans sub-section.
    kmeans_seg = md.split("#### KMeans — selección de k", 1)[1].split("\n#### ", 1)[0]
    assert kmeans_seg.count("✓") == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_appendix_has_normality_statistics(tmp_path):
    """Normality rows expose the test statistic, not only the p-value."""
    rendered = _render(tmp_path, _profile())
    for column_header in ("JB stat", "Shapiro stat"):
        assert column_header in rendered
    # One row per column with normality results: A and C.
    assert _table_rows(rendered, "#### Tests de normalidad") == 2
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Edge: a profile missing models / correlations degrades, never raises.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_lite_profile_without_models(tmp_path):
    """A lite profile (no ``models``) still renders the other sections."""
    lite = _profile()
    del lite["models"]  # lite: no KMeans/normality
    rendered = _render(tmp_path, lite)
    assert "scores_by_k" not in rendered  # section skipped
    assert "Matriz de asociación" in rendered  # correlations still dumped
    assert "## Apéndice" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_profile_without_correlations(tmp_path):
    """Dropping ``correlations`` removes only the association matrix."""
    trimmed = _profile()
    del trimmed["correlations"]
    rendered = _render(tmp_path, trimmed)  # must not raise
    assert "Matriz de asociación" not in rendered
    # The numeric section is still there.
    assert "Estadísticos numéricos completos" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_profile_means_no_appendix(tmp_path):
    """Without ``meta['profile']`` the rendered Markdown has no appendix."""
    out = os.path.join(str(tmp_path), "noprof.md")
    res = render_md(_dummy_chapters(), out, {"title": "x"})
    assert res["path"] == out
    # Read through a context manager so the file handle is always closed.
    with open(out, encoding="utf-8") as fh:
        content = fh.read()
    assert "## Apéndice" not in content
|
||||||
|
|
||||||
|
|
||||||
|
def test_appendix_helper_is_defensive():
    """Missing or empty profiles yield an empty appendix instead of raising."""
    for empty_profile in (None, {}, {"columns": []}):
        assert _profile_appendix(empty_profile) == ""
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Loss #6: bar/scree figure tables get a non-misleading header.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_histogram_caption_detection():
    """Only histogram/distribution captions are classified as histograms."""
    histogram_captions = ("Histograma de Age", "Distribución de Fare")
    other_captions = ("Media de Survived por Sex",
                      "Varianza explicada (scree PCA)")
    for caption in histogram_captions:
        assert _is_histogram_caption(caption)
    for caption in other_captions:
        assert not _is_histogram_caption(caption)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bars_table_custom_header():
    """``_bars_table`` keeps the histogram header by default and honours a custom one."""
    sample = [(0.0, 1.0, 5.0), (1.0, 2.0, 3.0)]

    default_table = _bars_table(sample)  # default histogram header
    assert "| Desde | Hasta | Frecuencia |" in default_table

    custom_table = _bars_table(sample, ("Inicio", "Fin", "Valor"))
    assert "| Inicio | Fin | Valor |" in custom_table
    assert "Frecuencia" not in custom_table
|
||||||
@@ -178,9 +178,17 @@ def _md_data_table(block) -> str:
|
|||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def _bars_table(bars: list) -> str:
|
def _bars_table(bars: list, header: tuple = ("Desde", "Hasta", "Frecuencia")) -> str:
|
||||||
"""Render extracted bar/histogram data as a Markdown table (Desde/Hasta/Frec)."""
|
"""Render extracted bar/histogram data as a Markdown table.
|
||||||
lines = ["| Desde | Hasta | Frecuencia |", "| --- | --- | --- |"]
|
|
||||||
|
``header`` is the 3-column header to use. Histogram bars are
|
||||||
|
``(Desde, Hasta, Frecuencia)``; bar/scree charts (means by group, PCA
|
||||||
|
explained variance) are *not* bins, so the caller passes a semantically
|
||||||
|
correct header (e.g. ``(Inicio, Fin, Valor)``) to avoid the misleading
|
||||||
|
"Frecuencia" label — see report 2053, loss #6.
|
||||||
|
"""
|
||||||
|
h0, h1, h2 = header
|
||||||
|
lines = [f"| {h0} | {h1} | {h2} |", "| --- | --- | --- |"]
|
||||||
shown = bars[:_MAX_BAR_ROWS]
|
shown = bars[:_MAX_BAR_ROWS]
|
||||||
for x0, x1, h in shown:
|
for x0, x1, h in shown:
|
||||||
lines.append(f"| {_fmt_num(x0)} | {_fmt_num(x1)} | {_fmt_num(h)} |")
|
lines.append(f"| {_fmt_num(x0)} | {_fmt_num(x1)} | {_fmt_num(h)} |")
|
||||||
@@ -191,6 +199,18 @@ def _bars_table(bars: list) -> str:
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _is_histogram_caption(caption: str) -> bool:
|
||||||
|
"""True when a figure caption describes a histogram (genuine numeric bins).
|
||||||
|
|
||||||
|
Histograms are the only figures whose bars are real ``[Desde, Hasta)`` bins
|
||||||
|
with a frequency count. Bar charts (means by group) and the PCA scree plot
|
||||||
|
carry per-category / per-component values, not bins — they must not inherit
|
||||||
|
the ``Desde/Hasta/Frecuencia`` header.
|
||||||
|
"""
|
||||||
|
c = (caption or "").lower()
|
||||||
|
return "histograma" in c or "distribución" in c or "distribucion" in c
|
||||||
|
|
||||||
|
|
||||||
def _extract_bars(fig) -> list:
|
def _extract_bars(fig) -> list:
|
||||||
"""Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig.
|
"""Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig.
|
||||||
|
|
||||||
@@ -253,7 +273,13 @@ def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
|
|||||||
if fig is not None:
|
if fig is not None:
|
||||||
bars = _extract_bars(fig)
|
bars = _extract_bars(fig)
|
||||||
if bars:
|
if bars:
|
||||||
parts.append(_bars_table(bars))
|
# A histogram's bars are genuine numeric bins (Desde/Hasta/
|
||||||
|
# Frecuencia). Bar charts and the PCA scree plot are not bins —
|
||||||
|
# give them a header that does not lie about "Frecuencia".
|
||||||
|
header = (("Desde", "Hasta", "Frecuencia")
|
||||||
|
if _is_histogram_caption(caption)
|
||||||
|
else ("Inicio", "Fin", "Valor"))
|
||||||
|
parts.append(_bars_table(bars, header))
|
||||||
if meta.get("embed_figures"):
|
if meta.get("embed_figures"):
|
||||||
png = _embed_png(fig, out_path, counter)
|
png = _embed_png(fig, out_path, counter)
|
||||||
if png:
|
if png:
|
||||||
@@ -354,6 +380,258 @@ def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
|
|||||||
return _md_note(model.Note(text=model._safe_str(block)))
|
return _md_note(model.Note(text=model._safe_str(block)))
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Profile appendix — the data the human-facing chapters drop.
|
||||||
|
#
|
||||||
|
# The chapter document (shared with the PDF/PPTX renderers) is designed for human
|
||||||
|
# reading and intentionally omits raw numbers: the correlation matrix shows only
|
||||||
|
# the top extremes, the numeric blocks skip skew/kurtosis/extended percentiles,
|
||||||
|
# the model chapter does not list ``scores_by_k`` or the normality test
|
||||||
|
# statistics. But the Markdown is meant to be *pasted into an LLM*, so it should
|
||||||
|
# carry EVERYTHING the engine computed. This appendix serializes the full
|
||||||
|
# ``profile`` (passed via ``meta['profile']``) as Markdown tables, additively:
|
||||||
|
# the PDF/PPTX are untouched, the .md simply has more than they do. Each section
|
||||||
|
# is emitted only when its source data is present, so a ``lite`` profile (no
|
||||||
|
# models) or a profile without correlations degrades cleanly instead of raising.
|
||||||
|
# See report 2053 for the six losses this closes.
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def _pair_types(a_type, b_type) -> str:
    """Return a short ``num↔cat`` style label for a pair's variable types.

    Each type is lower-cased; values starting with ``num`` / ``cat`` collapse
    to those abbreviations, any other value is kept as is, and an empty value
    becomes ``?``.
    """
    labels = []
    for raw in (a_type, b_type):
        text = model._safe_str(raw).lower()
        if text.startswith("num"):
            labels.append("num")
        elif text.startswith("cat"):
            labels.append("cat")
        else:
            labels.append(text or "?")
    return "↔".join(labels)
|
||||||
|
|
||||||
|
|
||||||
|
def _app_correlations(corr: dict) -> str:
|
||||||
|
"""Loss #1 — every association pair (not just the top extremes).
|
||||||
|
|
||||||
|
Dumps all of ``correlations['pairs']`` as a table (pair · types · method ·
|
||||||
|
value · p · p-FDR · significant), ordered by |value| desc so the strongest
|
||||||
|
associations lead while nothing is cut. Includes the ``correlation_ratio``
|
||||||
|
(num↔cat) and ``cramers_v`` (cat↔cat) pairs the human chapter never shows.
|
||||||
|
"""
|
||||||
|
pairs = list(corr.get("pairs", []) or [])
|
||||||
|
if not pairs:
|
||||||
|
return ""
|
||||||
|
def keyfn(p):
|
||||||
|
try:
|
||||||
|
return -abs(float(p.get("value")))
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
return 0.0
|
||||||
|
pairs_sorted = sorted(pairs, key=keyfn)
|
||||||
|
lines = ["### Matriz de asociación — todos los pares",
|
||||||
|
"",
|
||||||
|
("| Par | Tipos | Método | Valor | p-value | p-ajustado (FDR) "
|
||||||
|
"| ¿Sig? |"),
|
||||||
|
"| --- | --- | --- | --- | --- | --- | --- |"]
|
||||||
|
for p in pairs_sorted:
|
||||||
|
par = f"{_cell(p.get('a'))} ↔ {_cell(p.get('b'))}"
|
||||||
|
types = _pair_types(p.get("a_type"), p.get("b_type"))
|
||||||
|
method = _cell(p.get("method"))
|
||||||
|
val = _fmt_num(p.get("value"))
|
||||||
|
pv = _fmt_num(p.get("p_value")) if p.get("p_value") is not None else ""
|
||||||
|
padj = (_fmt_num(p.get("p_value_adjusted"))
|
||||||
|
if p.get("p_value_adjusted") is not None else "")
|
||||||
|
sig = "sí" if p.get("significant") else "no"
|
||||||
|
lines.append(
|
||||||
|
f"| {par} | {types} | {method} | {val} | {pv} | {padj} | {sig} |")
|
||||||
|
mt = corr.get("multiple_testing") or {}
|
||||||
|
n_tests = mt.get("n_tests", corr.get("n_tests"))
|
||||||
|
n_rej = mt.get("n_rejected")
|
||||||
|
note_bits = [f"{len(pairs)} pares en total"]
|
||||||
|
if n_tests is not None and n_rej is not None:
|
||||||
|
note_bits.append(
|
||||||
|
f"{n_rej} de {n_tests} significativos tras corrección "
|
||||||
|
f"{model._safe_str(mt.get('method', 'FDR')).upper()}")
|
||||||
|
lines.append("")
|
||||||
|
lines.append(f"*{'; '.join(note_bits)}.*")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
# Numeric statistics, in serialization order: (profile key, column header).
|
||||||
|
_NUM_STATS = [
|
||||||
|
("count", "n"), ("mean", "mean"), ("median", "median"), ("mode", "mode"),
|
||||||
|
("std", "std"), ("variance", "variance"), ("cv", "cv"),
|
||||||
|
("skew", "skew"), ("kurtosis", "kurtosis"),
|
||||||
|
("min", "min"), ("p1", "p1"), ("p5", "p5"), ("p25", "p25"), ("p50", "p50"),
|
||||||
|
("p75", "p75"), ("p95", "p95"), ("p99", "p99"), ("iqr", "iqr"),
|
||||||
|
("max", "max"), ("n_outliers", "outliers"),
|
||||||
|
("distribution_type", "distribución"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _app_numeric_describe(columns: list) -> str:
|
||||||
|
"""Loss #2 — every numeric statistic for every numeric column.
|
||||||
|
|
||||||
|
One row per numeric column with the full describe: mean/median/mode/std/
|
||||||
|
variance/cv, skew & kurtosis (for ALL columns, not only the skewed ones),
|
||||||
|
p1/p5/p25/p50/p75/p95/p99, iqr, min/max, outliers and distribution_type.
|
||||||
|
"""
|
||||||
|
rows = []
|
||||||
|
for info in (columns or []):
|
||||||
|
num = info.get("numeric") if isinstance(info, dict) else None
|
||||||
|
if not num:
|
||||||
|
continue
|
||||||
|
name = _cell(info.get("name"))
|
||||||
|
cells = [name]
|
||||||
|
for key, _hdr in _NUM_STATS:
|
||||||
|
v = num.get("count" if key == "count" else key)
|
||||||
|
if key == "count":
|
||||||
|
v = num.get("count", info.get("count"))
|
||||||
|
if key == "distribution_type":
|
||||||
|
cells.append(_cell(v))
|
||||||
|
else:
|
||||||
|
cells.append(_fmt_num(v) if v is not None else "")
|
||||||
|
rows.append(cells)
|
||||||
|
if not rows:
|
||||||
|
return ""
|
||||||
|
header = ["Columna"] + [hdr for _k, hdr in _NUM_STATS]
|
||||||
|
lines = ["### Estadísticos numéricos completos (describe)",
|
||||||
|
"",
|
||||||
|
"| " + " | ".join(header) + " |",
|
||||||
|
"| " + " | ".join(["---"] * len(header)) + " |"]
|
||||||
|
for cells in rows:
|
||||||
|
lines.append("| " + " | ".join(cells) + " |")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _app_reexpression(columns: list) -> str:
|
||||||
|
"""Loss #3 — the concrete recommended re-expression per column.
|
||||||
|
|
||||||
|
Names the transform (log1p/sqrt/yeo-johnson/none) instead of a vague
|
||||||
|
"consider re-expressing", with the ladder power, reason and alternatives.
|
||||||
|
"""
|
||||||
|
rows = []
|
||||||
|
for info in (columns or []):
|
||||||
|
rx = info.get("reexpression") if isinstance(info, dict) else None
|
||||||
|
if not rx or not isinstance(rx, dict):
|
||||||
|
continue
|
||||||
|
rec = model._safe_str(rx.get("recommended")).strip()
|
||||||
|
if not rec:
|
||||||
|
continue
|
||||||
|
alts = rx.get("alternatives") or []
|
||||||
|
alt_txt = ", ".join(
|
||||||
|
model._safe_str(a.get("transform")) for a in alts
|
||||||
|
if isinstance(a, dict) and a.get("transform")) or "—"
|
||||||
|
rows.append([
|
||||||
|
_cell(info.get("name")), _cell(rec),
|
||||||
|
_fmt_num(rx.get("ladder_power")) if rx.get("ladder_power") is not None else "",
|
||||||
|
_cell(rx.get("reason")), _cell(alt_txt),
|
||||||
|
])
|
||||||
|
if not rows:
|
||||||
|
return ""
|
||||||
|
lines = ["### Re-expresión recomendada (escalera de Tukey)",
|
||||||
|
"",
|
||||||
|
"| Columna | Recomendada | Potencia | Razón | Alternativas |",
|
||||||
|
"| --- | --- | --- | --- | --- |"]
|
||||||
|
for r in rows:
|
||||||
|
lines.append("| " + " | ".join(r) + " |")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _app_kmeans_scores(kmeans: dict) -> str:
|
||||||
|
"""Loss #4 — KMeans silhouette + inertia per k (justifies the chosen k)."""
|
||||||
|
scores = list(kmeans.get("scores_by_k", []) or [])
|
||||||
|
if not scores:
|
||||||
|
return ""
|
||||||
|
best_k = kmeans.get("best_k")
|
||||||
|
lines = ["#### KMeans — selección de k (`scores_by_k`)",
|
||||||
|
"",
|
||||||
|
"| k | Silhouette | Inercia | Elegido |",
|
||||||
|
"| --- | --- | --- | --- |"]
|
||||||
|
for s in scores:
|
||||||
|
if not isinstance(s, dict):
|
||||||
|
continue
|
||||||
|
k = s.get("k")
|
||||||
|
chosen = "✓" if best_k is not None and k == best_k else ""
|
||||||
|
lines.append(
|
||||||
|
f"| {_fmt_num(k)} | {_fmt_num(s.get('silhouette'))} "
|
||||||
|
f"| {_fmt_num(s.get('inertia'))} | {chosen} |")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _app_normality(normality: dict) -> str:
|
||||||
|
"""Loss #5 — each normality test's statistic next to its p-value."""
|
||||||
|
if not isinstance(normality, dict) or not normality:
|
||||||
|
return ""
|
||||||
|
lines = ["#### Tests de normalidad (estadístico + p-value)",
|
||||||
|
"",
|
||||||
|
("| Columna | n | JB stat | JB p | D'Agostino stat | D'Agostino p "
|
||||||
|
"| Shapiro stat | Shapiro p | ¿Normal? |"),
|
||||||
|
"| --- | --- | --- | --- | --- | --- | --- | --- | --- |"]
|
||||||
|
any_row = False
|
||||||
|
for col, res in normality.items():
|
||||||
|
if not isinstance(res, dict):
|
||||||
|
continue
|
||||||
|
jb = res.get("jarque_bera") or {}
|
||||||
|
da = res.get("dagostino") or {}
|
||||||
|
sh = res.get("shapiro") or {}
|
||||||
|
is_norm = "sí" if res.get("is_normal") else "no"
|
||||||
|
lines.append(
|
||||||
|
f"| {_cell(col)} | {_fmt_num(res.get('n')) if res.get('n') is not None else ''} "
|
||||||
|
f"| {_fmt_num(jb.get('stat'))} | {_fmt_num(jb.get('p'))} "
|
||||||
|
f"| {_fmt_num(da.get('stat'))} | {_fmt_num(da.get('p'))} "
|
||||||
|
f"| {_fmt_num(sh.get('stat'))} | {_fmt_num(sh.get('p'))} | {is_norm} |")
|
||||||
|
any_row = True
|
||||||
|
return "\n".join(lines) if any_row else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _profile_appendix(profile: dict) -> str:
|
||||||
|
"""Build the full-data appendix from a TableProfile dict (additive).
|
||||||
|
|
||||||
|
Returns a Markdown ``## Apéndice`` section with one sub-table per loss the
|
||||||
|
human chapters drop, or ``""`` when the profile carries none of them. Never
|
||||||
|
raises: a missing/oddly-shaped section is skipped, not fatal.
|
||||||
|
"""
|
||||||
|
if not isinstance(profile, dict):
|
||||||
|
return ""
|
||||||
|
sections: list = []
|
||||||
|
try:
|
||||||
|
corr = profile.get("correlations") or {}
|
||||||
|
seg = _app_correlations(corr) if isinstance(corr, dict) else ""
|
||||||
|
if seg:
|
||||||
|
sections.append(seg)
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
columns = profile.get("columns") or []
|
||||||
|
seg = _app_numeric_describe(columns)
|
||||||
|
if seg:
|
||||||
|
sections.append(seg)
|
||||||
|
seg = _app_reexpression(columns)
|
||||||
|
if seg:
|
||||||
|
sections.append(seg)
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
models = profile.get("models") or {}
|
||||||
|
if isinstance(models, dict):
|
||||||
|
model_segs = []
|
||||||
|
seg = _app_kmeans_scores(models.get("kmeans") or {})
|
||||||
|
if seg:
|
||||||
|
model_segs.append(seg)
|
||||||
|
seg = _app_normality(models.get("normality") or {})
|
||||||
|
if seg:
|
||||||
|
model_segs.append(seg)
|
||||||
|
if model_segs:
|
||||||
|
sections.append(
|
||||||
|
"### Modelos — detalle\n\n" + "\n\n".join(model_segs))
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
if not sections:
|
||||||
|
return ""
|
||||||
|
intro = ("Volcado completo de los datos que el motor computó y que los "
|
||||||
|
"capítulos (pensados para lectura humana / PDF) resumen. "
|
||||||
|
"Pensado para que un LLM reconstruya el análisis entero.")
|
||||||
|
return ("## Apéndice — Datos completos del perfil\n\n"
|
||||||
|
f"*{intro}*\n\n" + "\n\n".join(sections))
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
# Entry point.
|
# Entry point.
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
@@ -437,6 +715,18 @@ def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
|
|||||||
segments.append(seg)
|
segments.append(seg)
|
||||||
chapters_meta.append({"id": ch.id, "version": ch.version})
|
chapters_meta.append({"id": ch.id, "version": ch.version})
|
||||||
|
|
||||||
|
# Full-data appendix: dump everything the profile holds that the human
|
||||||
|
# chapters drop (additive — the .md ends up with more than the PDF/PPTX).
|
||||||
|
# Emitted only when a profile is supplied via meta['profile']; never fatal.
|
||||||
|
try:
|
||||||
|
appendix = _profile_appendix(meta.get("profile"))
|
||||||
|
except Exception as e: # noqa: BLE001
|
||||||
|
appendix = ""
|
||||||
|
notes.append(f"apéndice de perfil omitido: {e}")
|
||||||
|
if appendix:
|
||||||
|
segments.append("---")
|
||||||
|
segments.append(appendix)
|
||||||
|
|
||||||
content = "\n\n".join(segments) + "\n"
|
content = "\n\n".join(segments) + "\n"
|
||||||
note = f"{len(content)} caracteres"
|
note = f"{len(content)} caracteres"
|
||||||
if notes:
|
if notes:
|
||||||
|
|||||||
@@ -261,7 +261,15 @@ def render_automatic_eda(
|
|||||||
md_path = None
|
md_path = None
|
||||||
if emit_md:
|
if emit_md:
|
||||||
md_path = os.path.join(out_dir, base + ".md")
|
md_path = os.path.join(out_dir, base + ".md")
|
||||||
rmd = render_automatic_eda_markdown(prof, md_path, meta) or {}
|
# El Markdown es la salida MÁS completa: además del documento por
|
||||||
|
# capítulos (compartido con PDF/PPTX) vuelca un apéndice con TODOS los
|
||||||
|
# datos numéricos del perfil (matriz de asociación completa, describe
|
||||||
|
# con skew/kurtosis/percentiles, re-expresiones, scores_by_k de
|
||||||
|
# KMeans, estadísticos de normalidad). Se le pasa el `prof` vía
|
||||||
|
# meta['profile']; un meta propio evita alterar el de PDF/PPTX.
|
||||||
|
md_meta = dict(meta)
|
||||||
|
md_meta["profile"] = prof
|
||||||
|
rmd = render_automatic_eda_markdown(prof, md_path, md_meta) or {}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
|
|||||||
Reference in New Issue
Block a user