merge: capitulo AutomaticEDA correlacion (verificado met)

2026-06-30 15:15:39 +02:00
parent 415154d9a3 03f3dca823
commit ba162ab301
2 changed files with 527 additions and 0 deletions
@@ -0,0 +1,352 @@
 """Correlation chapter — association matrix plus top positive/negative pairs.
 Builds the CORRELACION chapter of an AutomaticEDA document from a TableProfile.
 It renders exactly what the user asked for:
 1. A correlation/association **matrix** (heatmap) reconstructed from the evaluated
   pairs, signed for numeric-numeric pairs (Pearson/Spearman, ``[-1, 1]``) and as
   magnitude for the mixed-type metrics (Cramér's V, correlation ratio, mutual
   information, ``[0, 1]``). Labels are ordered by total connectivity so strong
   associations cluster together instead of being scattered alphabetically.
 2. The **TOP positive** pairs and the **TOP negative** pairs as two separate
   tables. Only numeric-numeric metrics carry a sign, so negative pairs are by
   construction Pearson/Spearman; positive pairs may use any method.
 3. The methods legend and the multiple-testing (FDR) summary, so the reader sees
   how many pairs survive the correction.
 4. A spuriousness caveat when the profile flags level-based correlations on
   non-stationary series (Granger–Newbold).
 All data comes from ``profile['correlations']`` — the output of the ``eda`` group
 function ``association_matrix`` (optionally enriched by ``profile_table``). The
 chapter never recomputes any statistic; it only lays the existing values out as
 format-independent blocks. The renderers paginate tables (repeating the header)
 and scale the heatmap to fit entirely, so nothing is ever cut.
 Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
 """
 from __future__ import annotations
 import math
 from .. import model
 # Chapter identity. CHAPTER_VERSION follows the "x.y.z" form required by the
 # module contract; the three values are handed to model.Chapter by
 # build_correlacion.
 CHAPTER_VERSION = "1.0.0"
 CHAPTER_ID = "correlacion"
 CHAPTER_TITLE = "Correlación"
 # Methods whose value carries a sign (direction). Everything else is a magnitude
 # in [0, 1] and therefore only ever contributes to the positive side. The names
 # are matched as substrings of the lower-cased method, so a combined label such
 # as "pearson/spearman" also counts as signed.
 _SIGNED_METHODS = ("pearson", "spearman")
 # Cap the heatmap to the most-connected variables so it stays legible on a phone
 # screen / a slide. The renderer would scale a bigger matrix to fit, but the
 # cells become unreadable; we instead show the top-N and say so.
 _MAX_MATRIX_LABELS = 16
 # How many pairs to show in each of the top-positive / top-negative tables.
 _TOP_N = 10
 def _is_num(v) -> bool:
    """Tell whether ``v`` is a usable number.
    Accepts ints and finite floats; rejects booleans (even though ``bool`` is an
    ``int`` subclass), NaN, +/-inf and every non-numeric type.
    """
    if isinstance(v, bool) or not isinstance(v, (int, float)):
        return False
    if isinstance(v, float):
        # Finite means neither NaN nor an infinity.
        return math.isfinite(v)
    return True
 def _fmt_val(value, decimals: int = 2) -> str:
    """Format an association value as a compact, explicitly signed string.
    Args:
        value: the association value; anything ``_is_num`` rejects (None, bool,
            NaN, inf, non-numbers) is rendered as an em dash.
        decimals: number of digits after the decimal point.
    Returns:
        ``"+0.48"`` / ``"-0.78"`` style text, or an unsigned zero (``"0.00"``)
        when the value rounds to zero at the requested precision.
    """
    if not _is_num(value):
        return "—"
    text = f"{float(value):+.{decimals}f}"
    # A value that rounds to zero has no direction: drop the sign whatever the
    # precision is (the previous literal check only worked for two decimals).
    if float(text) == 0.0:
        return text[1:]
    return text
 def _fmt_p(value) -> str:
    """Render an adjusted p-value with three decimals.
    Non-numeric input becomes an em dash; anything below 0.001 is shown as the
    threshold ``"<0.001"`` rather than as a row of zeros.
    """
    if _is_num(value):
        p = float(value)
        return "<0.001" if p < 0.001 else f"{p:.3f}"
    return "—"
 def _is_signed(pair: dict) -> bool:
    """Tell whether the pair was measured with a directional (signed) method.
    The method name is lower-cased and searched for each entry of
    ``_SIGNED_METHODS`` as a substring; a missing method counts as unsigned.
    """
    name = str(pair.get("method") or "").lower()
    for token in _SIGNED_METHODS:
        if token in name:
            return True
    return False
 def _significant(pair: dict) -> bool:
    """Tell whether the pair counts as significant for display purposes.
    A pair is significant when its ``significant`` flag is exactly ``True``. A
    pair carrying neither a flag nor a ``p_value`` had no applicable test; it is
    not penalised and is reported as significant too. Everything else is not.
    """
    flag = pair.get("significant")
    if flag is True:
        return True
    if flag is not None:
        # An explicit non-True verdict (False or any other value) is a rejection.
        return False
    # No verdict at all: acceptable only when there was no test to correct.
    return pair.get("p_value") is None
 def _label(pair: dict) -> str:
    """Return the display label of a pair, e.g. ``'alcohol ↔ density'``."""
    left = model._safe_str(pair.get("a"))
    right = model._safe_str(pair.get("b"))
    return f"{left} ↔ {right}"
 def _split_top(pairs: list, top_n: int = _TOP_N):
    """Rank the evaluated pairs into a top-positive and a top-negative list.
    Entries that are not dicts or whose ``value`` is not a finite number are
    ignored. Positive: every pair with value > 0, strongest first. Negative:
    only pairs measured with a signed method and value < 0, most negative
    first. Each list is cut to ``top_n`` entries.
    Returns:
        ``(positive, negative)`` as two lists of the original pair dicts.
    """
    def _value_of(pair):
        return float(pair.get("value", 0.0))
    usable = [
        p for p in pairs
        if isinstance(p, dict) and _is_num(p.get("value"))
    ]
    positive = sorted(
        (p for p in usable if p["value"] > 0),
        key=_value_of,
        reverse=True,
    )
    negative = sorted(
        (p for p in usable if p["value"] < 0 and _is_signed(p)),
        key=_value_of,
    )
    return positive[:top_n], negative[:top_n]
 def _top_table(pairs: list, title: str):
    """Turn a ranked list of pairs into a ``model.DataTable``.
    Returns ``None`` for an empty list so the caller can skip the section.
    Columns: pair label, method, signed value, adjusted p-value, significance.
    """
    if not pairs:
        return None
    def _row(pair):
        method_name = model._safe_str(pair.get("method")) or "—"
        verdict = "sí" if _significant(pair) else "no"
        return [
            _label(pair),
            method_name,
            _fmt_val(pair.get("value")),
            _fmt_p(pair.get("p_value_adjusted")),
            verdict,
        ]
    return model.DataTable(
        header=["Par", "Método", "Valor", "p (FDR)", "Sig."],
        rows=[_row(pair) for pair in pairs],
        title=title,
    )
 def _ordered_labels(pairs: list):
    """Choose the heatmap axis labels, most connected variable first.
    A variable's connectivity is the sum of ``abs(value)`` over every usable
    pair it takes part in (as ``a`` or as ``b``). Non-dict entries, non-finite
    values and ``None`` names are ignored.
    Returns:
        ``(labels, trimmed)``: at most ``_MAX_MATRIX_LABELS`` names, and whether
        the cap left any variable out. ``([], False)`` when nothing is usable.
    """
    totals = {}
    for pair in pairs:
        if isinstance(pair, dict) and _is_num(pair.get("value")):
            weight = abs(float(pair["value"]))
            for name in (pair.get("a"), pair.get("b")):
                if name is not None:
                    totals[name] = totals.get(name, 0.0) + weight
    # The sort is stable, so ties keep the order in which names first appeared.
    ranked = sorted(totals, key=totals.__getitem__, reverse=True)
    return ranked[:_MAX_MATRIX_LABELS], len(ranked) > _MAX_MATRIX_LABELS
 def _matrix_figure(pairs: list, labels: list):
    """Return a lazy ``model.Figure`` with the association heatmap, or None.
    Args:
        pairs: evaluated pairs; entries that are not dicts, have a non-finite
            ``value`` or reference a variable outside ``labels`` are skipped.
        labels: variable names for both axes, in display order.
    Returns:
        ``None`` when fewer than two labels are available; otherwise a Figure
        whose ``make`` callable builds the matplotlib figure on demand.
    numpy and matplotlib are imported inside ``make``, so importing this module
    never requires them and any plotting failure surfaces only when the figure
    is actually requested (presumably by the renderer), not while the chapter
    is being built.
    """
    if len(labels) < 2:
        return None
    index = {name: i for i, name in enumerate(labels)}
    def make():
        import matplotlib
        import numpy as np
        from matplotlib.figure import Figure
        n = len(labels)
        # NaN marks "pair not evaluated"; the diagonal is a variable with itself.
        grid = np.full((n, n), np.nan, dtype=float)
        np.fill_diagonal(grid, 1.0)
        for pair in pairs:
            if not isinstance(pair, dict):
                continue
            a = pair.get("a")
            b = pair.get("b")
            value = pair.get("value")
            if a not in index or b not in index or not _is_num(value):
                continue
            v = float(value)
            # Mixed-type magnitudes are non-negative; keep them as-is on [0, 1].
            ia, ib = index[a], index[b]
            grid[ia, ib] = v
            grid[ib, ia] = v
        masked = np.ma.masked_invalid(grid)
        fig = Figure(figsize=(6.2, 5.6))
        ax = fig.add_subplot(111)
        # "RdBu" runs red (low) -> blue (high): with vmin=-1 / vmax=1 negatives
        # are red and positives blue, exactly what the caption promises. The
        # reversed "RdBu_r" used before painted the opposite of the caption.
        cmap = matplotlib.colormaps["RdBu"].copy()
        cmap.set_bad(color="#eeeeee")
        im = ax.imshow(masked, cmap=cmap, vmin=-1.0, vmax=1.0, aspect="auto")
        # Tick labels are clipped to 14 characters so long names fit the axes.
        short = [str(s)[:14] for s in labels]
        ax.set_xticks(range(n))
        ax.set_yticks(range(n))
        ax.set_xticklabels(short, rotation=90, fontsize=7)
        ax.set_yticklabels(short, fontsize=7)
        # Annotate cells only when the matrix is small enough to stay legible.
        if n <= 8:
            for i in range(n):
                for j in range(n):
                    cell = grid[i, j]
                    if _is_num(cell):
                        # Plain format: "-" for negatives, no "+" for positives.
                        ax.text(j, i, f"{cell:.2f}",
                                ha="center", va="center", fontsize=6,
                                color="#222222")
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04,
                     label="asociación (signo en num-num)")
        fig.tight_layout()
        return fig
    return model.Figure(make=make,
                        caption="Matriz de asociación. Azul = positiva, rojo = "
                                "negativa (sólo num-num lleva signo); gris = par "
                                "no evaluado.")
 def _methods_block(corr: dict):
    """Return the methods legend as a ``model.KVTable``, or None if absent.
    ``corr['methods_legend']`` must be a non-empty dict; each key/value pair
    becomes one row after passing through ``model._safe_str``.
    """
    legend = corr.get("methods_legend")
    if isinstance(legend, dict) and legend:
        rows = []
        for name, meaning in legend.items():
            rows.append((model._safe_str(name), model._safe_str(meaning)))
        return model.KVTable(rows=rows, title="Métodos de asociación")
    return None
 def _fdr_text(corr: dict) -> str | None:
    """Summarise the multiple-testing (FDR) correction in one line.
    Reads ``corr['multiple_testing']`` (keys ``method``, ``alpha``, ``n_tests``,
    ``n_rejected``). Every field is optional: the alpha is mentioned only when
    numeric, and the count sentence only when ``n_tests`` is numeric.
    Returns:
        The summary text, or ``None`` when the block is missing, empty or not a
        dict.
    """
    mt = corr.get("multiple_testing")
    if not isinstance(mt, dict) or not mt:
        return None
    method = model._safe_str(mt.get("method")).upper() or "FDR"
    head = f"Corrección por comparaciones múltiples ({method}"
    alpha = mt.get("alpha")
    if _is_num(alpha):
        head += f", α={float(alpha):g}"
    head += ")."
    parts = [head]
    n_tests = mt.get("n_tests")
    if _is_num(n_tests):
        n_rej = mt.get("n_rejected")
        # Both figures are counts: coerce both to int so a float-typed count
        # (e.g. 5.0) is not printed with a decimal next to an integer total.
        rej = int(n_rej) if _is_num(n_rej) else "—"
        parts.append(
            f"De {int(n_tests)} pares con test, {rej} siguen siendo "
            f"significativos tras la corrección.")
    return " ".join(parts)
 def build_correlacion(profile: dict, ctx: dict):
    """Build the Correlation Chapter, or None if there is nothing to show.
    Reads ``profile['correlations']`` (the ``association_matrix`` output) and
    lays it out as: intro, association heatmap, top positive / top negative
    tables, spuriousness caveat, FDR summary and methods legend.
    Args:
        profile: the table profile; ``None`` or a non-dict yields ``None``.
        ctx: accepted for the chapter contract but not consumed (presentation
            metadata is inherited from the document).
    Returns:
        A ``model.Chapter``, or ``None`` when the profile has no correlations
        block, no pairs, or no pair usable for either the heatmap or the
        tables, so the chapter is omitted instead of showing an empty section.
    """
    # A missing or malformed profile means "no chapter", never an exception.
    if not isinstance(profile, dict):
        return None
    corr = profile.get("correlations")
    if not isinstance(corr, dict):
        return None
    pairs = corr.get("pairs")
    if not isinstance(pairs, list) or not pairs:
        return None
    # Work out the substantive content first: the intro, legend and FDR summary
    # only make sense next to a matrix or a table.
    labels, trimmed = _ordered_labels(pairs)
    fig = _matrix_figure(pairs, labels)
    positive, negative = _split_top(pairs, _TOP_N)
    pos_table = _top_table(positive, f"Top {len(positive)} positivas")
    neg_table = _top_table(negative, f"Top {len(negative)} negativas")
    if fig is None and pos_table is None and neg_table is None:
        return None
    blocks: list = []
    # Intro: what this chapter shows and how to read the sign.
    blocks.append(model.Markdown(text=(
        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
        "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
        "entre categóricas; razón de correlación num-categórica; información mutua "
        "como medida común no lineal). Sólo las correlaciones **num-num** tienen "
        "dirección: por eso los pares **negativos** son siempre num-num.")))
    # 1) Association matrix (heatmap).
    if fig is not None:
        blocks.append(model.Heading(text="Matriz de asociación", level=2))
        blocks.append(fig)
        if trimmed:
            blocks.append(model.Note(text=(
                f"Se muestran las {len(labels)} variables más conectadas de la "
                "matriz para mantenerla legible; el resto de pares siguen en las "
                "tablas de abajo.")))
    # 2) Top positive / top negative pairs.
    if pos_table is not None:
        blocks.append(model.Heading(text="Pares más correlacionados (positivos)",
                                    level=2))
        blocks.append(pos_table)
    if neg_table is not None:
        blocks.append(model.Heading(text="Pares más correlacionados (negativos)",
                                    level=2))
        blocks.append(neg_table)
    elif pos_table is not None:
        # No signed-negative pairs at all: say so honestly rather than omit.
        blocks.append(model.Note(text=(
            "No se han hallado correlaciones negativas significativas entre "
            "columnas numéricas.")))
    # 3) Spuriousness caveat for level-based correlations (Granger–Newbold).
    # An explicit caveat text from the profile wins over the generic warning.
    caveat = corr.get("levels_caveat")
    if isinstance(caveat, str) and caveat.strip():
        blocks.append(model.Note(text=caveat.strip()))
    elif corr.get("levels_possible_spurious"):
        blocks.append(model.Note(text=(
            "Aviso: algunas correlaciones se calcularon sobre niveles de series "
            "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
            "sobre los retornos/diferencias antes de interpretarlas.")))
    # 4) FDR summary + methods legend.
    fdr_text = _fdr_text(corr)
    if fdr_text:
        blocks.append(model.Markdown(text=fdr_text))
    methods = _methods_block(corr)
    if methods is not None:
        blocks.append(model.Heading(text="Métodos y leyenda", level=2))
        blocks.append(methods)
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,175 @@
 """Tests for the CORRELACION chapter — DoD: golden + edges + error/anti-cut.
 Self-contained: builds a synthetic TableProfile carrying a ``correlations`` block
 shaped exactly like ``association_matrix`` output (no DuckDB), so the suite is
 fast and deterministic. Verifies that the chapter emits the association-matrix
 figure plus separate top-positive / top-negative tables with the right pairs,
 that it returns None when the profile has no pairs, that a None/empty profile
 does not raise, and that a wide matrix with long labels renders to PDF *and* PPTX
 without cutting anything.
 """
 import os
 import re
 import tempfile
 from pypdf import PdfReader
 from datascience.automatic_eda.chapters.correlacion import (
    CHAPTER_VERSION,
    build_correlacion,
 )
 from datascience.automatic_eda.model import DataTable, Figure
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
 def _pair(a, b, value, method, padj, sig, p=0.0001):
    """Build one numeric-numeric pair record shaped like the profile's pairs.
    ``extra['mi']`` is filled with half the absolute value as a stand-in.
    """
    record = dict(a=a, b=b, a_type="numeric", b_type="numeric")
    record.update(method=method, value=value, extra={"mi": abs(value) * 0.5})
    record.update(p_value=p, p_value_adjusted=padj, significant=sig)
    return record
 def _profile() -> dict:
    """Return a synthetic wine-like profile mixing signed and unsigned pairs."""
    signed_specs = [
        ("alcohol", "quality", 0.48, 0.0005),
        ("density", "alcohol", -0.78, 0.0001),
        ("ph", "fixed_acidity", -0.68, 0.0002),
        ("sulphates", "quality", 0.25, 0.03),
    ]
    pairs = [
        _pair(a, b, value, "pearson/spearman", padj, True)
        for a, b, value, padj in signed_specs
    ]
    # Unsigned mixed-type metrics: only ever positive, never in the neg table.
    pairs.append({
        "a": "region", "b": "type", "a_type": "categorical",
        "b_type": "categorical", "method": "cramers_v", "value": 0.55,
        "extra": {"mi": 0.3}, "p_value": 0.001, "p_value_adjusted": 0.004,
        "significant": True,
    })
    strong = [p for p in pairs if abs(p["value"]) >= 0.5]
    correlations = {
        "pairs": pairs,
        "strong": strong,
        "methods_legend": {
            "pearson": "num-num lineal (Pearson r), [-1, 1]",
            "cramers_v": "cat-cat simétrica (Cramér's V), [0, 1]",
        },
        "multiple_testing": {"method": "bh", "alpha": 0.05,
                             "n_tests": 5, "n_rejected": 5},
    }
    return {
        "table": "wine",
        "source": "/data/wine.csv",
        "n_rows": 1599,
        "n_cols": 12,
        "correlations": correlations,
    }
 def _pdf_text(path: str) -> str:
    """Extract the text of every page of a PDF as one whitespace-collapsed string."""
    reader = PdfReader(path)
    chunks = [(page.extract_text() or "") for page in reader.pages]
    return re.sub(r"\s+", " ", "".join(chunks))
 def test_golden_chapter_tiene_matriz_y_top_positivos_y_negativos():
    """Golden path: the chapter carries the heatmap and both ranked tables."""
    chapter = build_correlacion(_profile(), {})
    assert chapter is not None
    assert chapter.id == "correlacion"
    assert chapter.version == CHAPTER_VERSION
    # Association matrix heatmap, emitted as a lazy figure.
    assert "figure" in [blk.kind for blk in chapter.blocks]
    figures = [blk for blk in chapter.blocks if isinstance(blk, Figure)]
    assert figures and figures[0].make is not None
    # Top positive + top negative tables.
    tables = [blk for blk in chapter.blocks if isinstance(blk, DataTable)]
    assert len(tables) >= 2
    cells = []
    for table in tables:
        for row in table.rows:
            cells.extend(str(cell) for cell in row)
    flat = " ".join(cells)
    # Strongest positive present and signed +, strongest negative present and -.
    assert "alcohol" in flat
    assert "quality" in flat
    assert "+0.48" in flat
    assert "density" in flat
    assert "-0.78" in flat
 def test_golden_render_pdf_y_pptx_muestran_lo_exigido():
    """Golden path end to end: both renderers include the chapter and its data."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "corr.pdf")
        pptx_path = os.path.join(workdir, "corr.pptx")
        pdf_report = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA — wine"})
        pptx_report = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA — wine"})
        assert pdf_report["path"] == pdf_path
        assert pdf_report["n_pages"] >= 1
        assert pptx_report["path"] == pptx_path
        assert pptx_report["n_slides"] >= 1
        for report in (pdf_report, pptx_report):
            assert "correlacion" in [c["id"] for c in report["chapters"]]
        text = _pdf_text(pdf_path)
        # The requirement: matrix + top positive/negative pairs, all visible.
        assert "Correlaci" in text  # chapter title (accents may vary in extract).
        for name in ("density", "alcohol", "quality"):
            assert name in text
        for figure in ("0.78", "0.48"):
            assert figure in text
        # Both signs surfaced as separate sections.
        lowered = text.lower()
        assert "positiv" in lowered
        assert "negativ" in lowered
 def test_edge_sin_pares_devuelve_none():
    """Missing key, empty pairs and wrong types all yield None, never an error."""
    unusable_profiles = (
        {"table": "x"},
        {"correlations": {}},
        {"correlations": {"pairs": []}},
        {"correlations": {"pairs": "nope"}},
    )
    for candidate in unusable_profiles:
        assert build_correlacion(candidate, {}) is None
    assert build_correlacion(None, None) is None
    assert build_correlacion({}, {}) is None
 def test_edge_solo_positivos_emite_nota_sin_tabla_negativa():
    """With no negative pair, only one table is emitted plus an explicit note."""
    categorical_pair = {
        "a": "c", "b": "d", "a_type": "categorical",
        "b_type": "categorical", "method": "cramers_v", "value": 0.7,
        "extra": {"mi": 0.4}, "p_value": 0.001,
        "p_value_adjusted": 0.003, "significant": True,
    }
    numeric_pair = _pair("a", "b", 0.6, "pearson/spearman", 0.001, True)
    profile = {"correlations": {"pairs": [numeric_pair, categorical_pair]}}
    chapter = build_correlacion(profile, {})
    assert chapter is not None
    # Only the positive table.
    tables = [blk for blk in chapter.blocks if isinstance(blk, DataTable)]
    assert len(tables) == 1
    # Honest "no negative correlations" note.
    note_texts = [blk.text for blk in chapter.blocks if blk.kind == "note"]
    assert "negativas" in " ".join(note_texts)
 def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
    """Wide matrix + long labels: trimmed heatmap, intact table cells, clean render.
    20 ring-connected numeric variables plus one long-named pair exceed the
    label cap, so the "top-N most connected" note must appear, and both
    renderers must lay the chapter out without raising.
    """
    long_a = "concentracion_de_dioxido_de_azufre_libre"
    long_b = "concentracion_de_dioxido_de_azufre_total"
    n_vars = 20
    ring = [
        _pair(f"variable_numerica_larga_{i:02d}",
              f"variable_numerica_larga_{(i + 1) % n_vars:02d}",
              0.55 - i * 0.02, "pearson/spearman", 0.01, True)
        for i in range(n_vars)
    ]
    pairs = [_pair(long_a, long_b, -0.72, "pearson/spearman", 0.0001, True),
             *ring]
    testing = {"method": "bh", "alpha": 0.05,
               "n_tests": len(pairs), "n_rejected": len(pairs)}
    profile = {"correlations": {"pairs": pairs, "multiple_testing": testing}}
    chapter = build_correlacion(profile, {})
    assert chapter is not None
    # A "showing top-N most connected" note appears when the matrix is trimmed.
    note_texts = [blk.text for blk in chapter.blocks if blk.kind == "note"]
    assert "más conectadas" in " ".join(note_texts)
    # Anti-cut guarantee at the block level: the long pair reaches the renderer
    # whole (the block never truncates); the renderer then wraps the cell inside
    # its column. Both long labels are present, intact, in a table cell.
    cells = [
        str(cell)
        for blk in chapter.blocks if isinstance(blk, DataTable)
        for row in blk.rows
        for cell in row
    ]
    assert any(long_a in cell and long_b in cell for cell in cells)
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "wide.pdf")
        pptx_path = os.path.join(workdir, "wide.pptx")
        pdf_report = render_automatic_eda_pdf(
            profile, pdf_path, {"write_manifest": False})
        pptx_report = render_automatic_eda_pptx(
            profile, pptx_path, {"write_manifest": False})
        # Both renderers lay the wide chapter out without raising and produce a
        # non-empty document (nothing dropped, just wrapped/scaled to fit).
        assert pdf_report["path"] == pdf_path
        assert os.path.exists(pdf_path)
        assert pdf_report["n_pages"] >= 1
        assert pptx_report["path"] == pptx_path
        assert os.path.exists(pptx_path)
        assert pptx_report["n_slides"] >= 1
        # A short, unbreakable fragment of the long label survives the wrap.
        assert "azufre" in _pdf_text(pdf_path)