diff --git a/python/functions/datascience/automatic_eda/chapters/correlacion.py b/python/functions/datascience/automatic_eda/chapters/correlacion.py new file mode 100644 index 00000000..22b6eb0c --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/correlacion.py @@ -0,0 +1,352 @@ +"""Correlation chapter — association matrix plus top positive/negative pairs. + +Builds the CORRELACION chapter of an AutomaticEDA document from a TableProfile. +It renders exactly what the user asked for: + +1. A correlation/association **matrix** (heatmap) reconstructed from the evaluated + pairs, signed for numeric-numeric pairs (Pearson/Spearman, ``[-1, 1]``) and as + magnitude for the mixed-type metrics (Cramér's V, correlation ratio, mutual + information, ``[0, 1]``). Labels are ordered by total connectivity so strong + associations cluster together instead of being scattered alphabetically. +2. The **TOP positive** pairs and the **TOP negative** pairs as two separate + tables. Only numeric-numeric metrics carry a sign, so negative pairs are by + construction Pearson/Spearman; positive pairs may use any method. +3. The methods legend and the multiple-testing (FDR) summary, so the reader sees + how many pairs survive the correction. +4. A spuriousness caveat when the profile flags level-based correlations on + non-stationary series (Granger–Newbold). + +All data comes from ``profile['correlations']`` — the output of the ``eda`` group +function ``association_matrix`` (optionally enriched by ``profile_table``). The +chapter never recomputes any statistic; it only lays the existing values out as +format-independent blocks. The renderers paginate tables (repeating the header) +and scale the heatmap to fit entirely, so nothing is ever cut. + +Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +import math + +from .. 
import model + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "correlacion" +CHAPTER_TITLE = "Correlación" + +# Methods whose value carries a sign (direction). Everything else is a magnitude +# in [0, 1] and therefore only ever contributes to the positive side. +_SIGNED_METHODS = ("pearson", "spearman") + +# Cap the heatmap to the most-connected variables so it stays legible on a phone +# screen / a slide. The renderer would scale a bigger matrix to fit, but the +# cells become unreadable; we instead show the top-N and say so. +_MAX_MATRIX_LABELS = 16 + +# How many pairs to show in each of the top-positive / top-negative tables. +_TOP_N = 10 + + +def _is_num(v) -> bool: + """True for a real, finite int/float (not bool, not NaN/inf).""" + return ( + isinstance(v, (int, float)) + and not isinstance(v, bool) + and not (isinstance(v, float) and (math.isnan(v) or math.isinf(v))) + ) + + +def _fmt_val(value, decimals: int = 2) -> str: + """Format an association value compactly, signed, with a fixed width feel.""" + if not _is_num(value): + return "—" + text = f"{float(value):+.{decimals}f}" + # Strip a trailing -0.00 / +0.00 into a clean 0.00 for readability. + if text in ("+0.00", "-0.00"): + return "0.00" + return text + + +def _fmt_p(value) -> str: + """Format an adjusted p-value; tiny values collapse to a '<' threshold.""" + if not _is_num(value): + return "—" + p = float(value) + if p < 0.001: + return "<0.001" + return f"{p:.3f}" + + +def _is_signed(pair: dict) -> bool: + """True if the pair's method reports a directional (signed) value.""" + method = str(pair.get("method") or "").lower() + return any(m in method for m in _SIGNED_METHODS) + + +def _significant(pair: dict) -> bool: + """True if the pair is significant after FDR (or has no test to correct).""" + if pair.get("significant") is True: + return True + # Pairs without an applicable test (p_value None) are not penalised: they are + # admitted on magnitude alone upstream, so treat missing as "not rejected". 
+ return pair.get("p_value") is None and pair.get("significant") is None + + +def _label(pair: dict) -> str: + """Human label for a pair, e.g. 'alcohol ↔ density'.""" + return f"{model._safe_str(pair.get('a'))} ↔ {model._safe_str(pair.get('b'))}" + + +def _split_top(pairs: list, top_n: int = _TOP_N): + """Split evaluated pairs into ranked top-positive and top-negative lists. + + Positive: any pair with a positive value, ranked by value descending. + Negative: only signed (numeric-numeric) pairs with a negative value, ranked + by value ascending (most negative first). Non-finite values are dropped. + """ + positive = [] + negative = [] + for pair in pairs: + if not isinstance(pair, dict): + continue + value = pair.get("value") + if not _is_num(value): + continue + if value > 0: + positive.append(pair) + elif value < 0 and _is_signed(pair): + negative.append(pair) + positive.sort(key=lambda p: float(p.get("value", 0.0)), reverse=True) + negative.sort(key=lambda p: float(p.get("value", 0.0))) + return positive[:top_n], negative[:top_n] + + +def _top_table(pairs: list, title: str): + """Build a DataTable for a list of pairs, or None if there are none.""" + if not pairs: + return None + header = ["Par", "Método", "Valor", "p (FDR)", "Sig."] + rows = [] + for pair in pairs: + method = model._safe_str(pair.get("method")) or "—" + rows.append([ + _label(pair), + method, + _fmt_val(pair.get("value")), + _fmt_p(pair.get("p_value_adjusted")), + "sí" if _significant(pair) else "no", + ]) + return model.DataTable(header=header, rows=rows, title=title) + + +def _ordered_labels(pairs: list): + """Pick and order the matrix labels by total connectivity (descending). + + Returns the list of variable names to place on the axes, capped at + ``_MAX_MATRIX_LABELS`` (the most-connected ones), plus a boolean saying + whether the cap trimmed anything. 
+ """ + strength = {} + for pair in pairs: + if not isinstance(pair, dict): + continue + value = pair.get("value") + if not _is_num(value): + continue + mag = abs(float(value)) + for key in ("a", "b"): + name = pair.get(key) + if name is None: + continue + strength[name] = strength.get(name, 0.0) + mag + if not strength: + return [], False + ordered = sorted(strength, key=lambda n: strength[n], reverse=True) + trimmed = len(ordered) > _MAX_MATRIX_LABELS + return ordered[:_MAX_MATRIX_LABELS], trimmed + + +def _matrix_figure(pairs: list, labels: list): + """Return a Figure (lazy) with the signed association heatmap, or None. + + The matplotlib figure is built lazily inside ``make`` so importing this + module never requires matplotlib and a malformed plot degrades to nothing + instead of aborting the chapter. + """ + if len(labels) < 2: + return None + + index = {name: i for i, name in enumerate(labels)} + + def make(): + import numpy as np + from matplotlib.figure import Figure + + n = len(labels) + grid = np.full((n, n), np.nan, dtype=float) + for i in range(n): + grid[i, i] = 1.0 + for pair in pairs: + if not isinstance(pair, dict): + continue + a = pair.get("a") + b = pair.get("b") + value = pair.get("value") + if a not in index or b not in index or not _is_num(value): + continue + v = float(value) + # Mixed-type magnitudes are non-negative; keep them as-is on [0, 1]. 
+ ia, ib = index[a], index[b] + grid[ia, ib] = v + grid[ib, ia] = v + + import matplotlib + + masked = np.ma.masked_invalid(grid) + fig = Figure(figsize=(6.2, 5.6)) + ax = fig.add_subplot(111) + cmap = matplotlib.colormaps["RdBu_r"].copy() + cmap.set_bad(color="#eeeeee") + im = ax.imshow(masked, cmap=cmap, vmin=-1.0, vmax=1.0, aspect="auto") + ax.set_xticks(range(n)) + ax.set_yticks(range(n)) + short = [str(s)[:14] for s in labels] + ax.set_xticks(range(n)) + ax.set_xticklabels(short, rotation=90, fontsize=7) + ax.set_yticklabels(short, fontsize=7) + # Annotate cells only when the matrix is small enough to stay legible. + if n <= 8: + for i in range(n): + for j in range(n): + cell = grid[i, j] + if _is_num(cell): + ax.text(j, i, f"{cell:+.2f}".replace("+", "") if cell < 0 + else f"{cell:.2f}", + ha="center", va="center", fontsize=6, + color="#222222") + fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, + label="asociación (signo en num-num)") + fig.tight_layout() + return fig + + return model.Figure(make=make, + caption="Matriz de asociación. 
Azul = positiva, rojo = " + "negativa (sólo num-num lleva signo); gris = par " + "no evaluado.") + + +def _methods_block(corr: dict): + """Build a KVTable with the legend of the methods actually present.""" + legend = corr.get("methods_legend") + if not isinstance(legend, dict) or not legend: + return None + rows = [(model._safe_str(k), model._safe_str(v)) for k, v in legend.items()] + return model.KVTable(rows=rows, title="Métodos de asociación") + + +def _fdr_text(corr: dict) -> str | None: + """One-line summary of the multiple-testing (FDR) correction, or None.""" + mt = corr.get("multiple_testing") + if not isinstance(mt, dict) or not mt: + return None + method = model._safe_str(mt.get("method")).upper() or "FDR" + alpha = mt.get("alpha") + n_tests = mt.get("n_tests") + n_rej = mt.get("n_rejected") + parts = [f"Corrección por comparaciones múltiples ({method}"] + if _is_num(alpha): + parts[0] += f", α={float(alpha):g}" + parts[0] += ")." + if _is_num(n_tests): + rej = n_rej if _is_num(n_rej) else "—" + parts.append( + f"De {int(n_tests)} pares con test, {rej} siguen siendo " + f"significativos tras la corrección.") + return " ".join(parts) + + +def build_correlacion(profile: dict, ctx: dict): + """Build the Correlation Chapter, or None if there are no pairs to show. + + Reads ``profile['correlations']`` (the ``association_matrix`` output). Returns + ``None`` when the dataset has fewer than two associable columns (no evaluated + pairs), so the chapter is omitted instead of showing an empty section. Never + raises: every access is defensive. + + ctx keys consumed: none specific (presentation metadata is inherited from the + document). The chapter reads everything it needs from the profile. 
+ """ + profile = profile or {} + ctx = ctx or {} + + corr = profile.get("correlations") + if not isinstance(corr, dict): + return None + pairs = corr.get("pairs") + if not isinstance(pairs, list) or not pairs: + return None + + blocks: list = [] + + # Intro: what this chapter shows and how to read the sign. + blocks.append(model.Markdown(text=( + "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a " + "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V " + "entre categóricas; razón de correlación num-categórica; información mutua " + "como medida común no lineal). Sólo las correlaciones **num-num** tienen " + "dirección: por eso los pares **negativos** son siempre num-num."))) + + # 1) Association matrix (heatmap). + labels, trimmed = _ordered_labels(pairs) + fig = _matrix_figure(pairs, labels) + if fig is not None: + blocks.append(model.Heading(text="Matriz de asociación", level=2)) + blocks.append(fig) + if trimmed: + blocks.append(model.Note(text=( + f"Se muestran las {len(labels)} variables más conectadas de la " + "matriz para mantenerla legible; el resto de pares siguen en las " + "tablas de abajo."))) + + # 2) Top positive / top negative pairs. + positive, negative = _split_top(pairs, _TOP_N) + pos_table = _top_table(positive, f"Top {len(positive)} positivas") + neg_table = _top_table(negative, f"Top {len(negative)} negativas") + if pos_table is not None: + blocks.append(model.Heading(text="Pares más correlacionados (positivos)", + level=2)) + blocks.append(pos_table) + if neg_table is not None: + blocks.append(model.Heading(text="Pares más correlacionados (negativos)", + level=2)) + blocks.append(neg_table) + elif pos_table is not None: + # No signed-negative pairs at all: say so honestly rather than omit. + blocks.append(model.Note(text=( + "No se han hallado correlaciones negativas significativas entre " + "columnas numéricas."))) + + # 3) Spuriousness caveat for level-based correlations (Granger–Newbold). 
+ caveat = corr.get("levels_caveat") + if isinstance(caveat, str) and caveat.strip(): + blocks.append(model.Note(text=caveat.strip())) + elif corr.get("levels_possible_spurious"): + blocks.append(model.Note(text=( + "Aviso: algunas correlaciones se calcularon sobre niveles de series " + "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas " + "sobre los retornos/diferencias antes de interpretarlas."))) + + # 4) FDR summary + methods legend. + fdr_text = _fdr_text(corr) + if fdr_text: + blocks.append(model.Markdown(text=fdr_text)) + methods = _methods_block(corr) + if methods is not None: + blocks.append(model.Heading(text="Métodos y leyenda", level=2)) + blocks.append(methods) + + if not blocks: + return None + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/correlacion_test.py b/python/functions/datascience/automatic_eda/chapters/correlacion_test.py new file mode 100644 index 00000000..88ddc726 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/correlacion_test.py @@ -0,0 +1,175 @@ +"""Tests for the CORRELACION chapter — DoD: golden + edges + error/anti-cut. + +Self-contained: builds a synthetic TableProfile carrying a ``correlations`` block +shaped exactly like ``association_matrix`` output (no DuckDB), so the suite is +fast and deterministic. Verifies that the chapter emits the association-matrix +figure plus separate top-positive / top-negative tables with the right pairs, +that it returns None when the profile has no pairs, that a None/empty profile +does not raise, and that a wide matrix with long labels renders to PDF *and* PPTX +without cutting anything. 
# File: python/functions/datascience/automatic_eda/chapters/correlacion_test.py
"""Test suite for the CORRELACION chapter: golden path, edge cases, anti-cut.

Every test works on a hand-built TableProfile whose ``correlations`` block has
the same shape as ``association_matrix`` output, so no DuckDB is involved and
the suite stays fast and deterministic. Covered behaviour:

* the chapter emits the association-matrix figure plus separate top-positive
  and top-negative tables holding the expected pairs;
* a profile without pairs yields ``None``, and a ``None``/empty profile does
  not raise;
* a wide matrix with long labels renders to PDF *and* PPTX with nothing cut.
"""

import os
import re
import tempfile

from pypdf import PdfReader

from datascience.automatic_eda.chapters.correlacion import (
    CHAPTER_VERSION,
    build_correlacion,
)
from datascience.automatic_eda.model import DataTable, Figure
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx


def _pair(a, b, value, method, padj, sig, p=0.0001):
    """Return one numeric-numeric pair record shaped like the profiler's."""
    record = {"a": a, "b": b}
    record["a_type"] = "numeric"
    record["b_type"] = "numeric"
    record["method"] = method
    record["value"] = value
    record["extra"] = {"mi": abs(value) * 0.5}
    record["p_value"] = p
    record["p_value_adjusted"] = padj
    record["significant"] = sig
    return record


def _cat_pair(a, b, value, mi, padj):
    """Return one categorical-categorical (Cramér's V) pair record."""
    return {
        "a": a,
        "b": b,
        "a_type": "categorical",
        "b_type": "categorical",
        "method": "cramers_v",
        "value": value,
        "extra": {"mi": mi},
        "p_value": 0.001,
        "p_value_adjusted": padj,
        "significant": True,
    }


def _profile() -> dict:
    """Return a synthetic wine-like profile mixing signed and unsigned pairs."""
    signed = "pearson/spearman"
    pairs = [
        _pair("alcohol", "quality", 0.48, signed, 0.0005, True),
        _pair("density", "alcohol", -0.78, signed, 0.0001, True),
        _pair("ph", "fixed_acidity", -0.68, signed, 0.0002, True),
        _pair("sulphates", "quality", 0.25, signed, 0.03, True),
        # Unsigned mixed-type metric: positive only, never in the negative table.
        _cat_pair("region", "type", 0.55, 0.3, 0.004),
    ]
    strong = [entry for entry in pairs if abs(entry["value"]) >= 0.5]
    correlations = {
        "pairs": pairs,
        "strong": strong,
        "methods_legend": {
            "pearson": "num-num lineal (Pearson r), [-1, 1]",
            "cramers_v": "cat-cat simétrica (Cramér's V), [0, 1]",
        },
        "multiple_testing": {
            "method": "bh",
            "alpha": 0.05,
            "n_tests": 5,
            "n_rejected": 5,
        },
    }
    return {
        "table": "wine",
        "source": "/data/wine.csv",
        "n_rows": 1599,
        "n_cols": 12,
        "correlations": correlations,
    }


def _pdf_text(path: str) -> str:
    """Return the PDF's extracted text with whitespace runs collapsed."""
    chunks = [(page.extract_text() or "") for page in PdfReader(path).pages]
    return re.sub(r"\s+", " ", "".join(chunks))


def _tables(chapter):
    """Return the DataTable blocks of a chapter, in order."""
    return [blk for blk in chapter.blocks if isinstance(blk, DataTable)]


def _note_text(chapter) -> str:
    """Return the text of every note block, joined by single spaces."""
    return " ".join(blk.text for blk in chapter.blocks if blk.kind == "note")


def test_golden_chapter_tiene_matriz_y_top_positivos_y_negativos():
    chapter = build_correlacion(_profile(), {})
    assert chapter is not None
    assert chapter.id == "correlacion"
    assert chapter.version == CHAPTER_VERSION

    # The association-matrix heatmap is present and built lazily.
    assert "figure" in [blk.kind for blk in chapter.blocks]
    figures = [blk for blk in chapter.blocks if isinstance(blk, Figure)]
    assert figures and figures[0].make is not None

    # One table for the positive side, one for the negative side.
    tables = _tables(chapter)
    assert len(tables) >= 2
    flat = " ".join(
        str(cell) for table in tables for row in table.rows for cell in row
    )
    # Strongest positive is shown with "+", strongest negative with "-".
    assert "alcohol" in flat and "quality" in flat
    assert "+0.48" in flat
    assert "density" in flat and "-0.78" in flat


def test_golden_render_pdf_y_pptx_muestran_lo_exigido():
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "corr.pdf")
        pptx_path = os.path.join(workdir, "corr.pptx")
        pdf_out = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA — wine"})
        pptx_out = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA — wine"})
        assert pdf_out["path"] == pdf_path and pdf_out["n_pages"] >= 1
        assert pptx_out["path"] == pptx_path and pptx_out["n_slides"] >= 1
        assert "correlacion" in [c["id"] for c in pdf_out["chapters"]]
        assert "correlacion" in [c["id"] for c in pptx_out["chapters"]]
        text = _pdf_text(pdf_path)
        # Requirement: matrix + top positive/negative pairs, all visible.
        assert "Correlaci" in text  # accents may vary in the extraction.
        assert "density" in text and "alcohol" in text and "quality" in text
        assert "0.78" in text and "0.48" in text
        # Both signs are surfaced as separate sections.
        lowered = text.lower()
        assert "positiv" in lowered and "negativ" in lowered


def test_edge_sin_pares_devuelve_none():
    # Missing key, empty pairs and wrong types all yield None instead of raising.
    assert build_correlacion({"table": "x"}, {}) is None
    assert build_correlacion({"correlations": {}}, {}) is None
    assert build_correlacion({"correlations": {"pairs": []}}, {}) is None
    assert build_correlacion({"correlations": {"pairs": "nope"}}, {}) is None
    assert build_correlacion(None, None) is None
    assert build_correlacion({}, {}) is None


def test_edge_solo_positivos_emite_nota_sin_tabla_negativa():
    only_positive = [
        _pair("a", "b", 0.6, "pearson/spearman", 0.001, True),
        _cat_pair("c", "d", 0.7, 0.4, 0.003),
    ]
    chapter = build_correlacion({"correlations": {"pairs": only_positive}}, {})
    assert chapter is not None
    assert len(_tables(chapter)) == 1  # positive table only.
    # The chapter says explicitly that no negative correlations were found.
    assert "negativas" in _note_text(chapter)


def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
    # 20 numeric variables with long names: the matrix is trimmed to the top-N
    # and both renderers must lay the chapter out without raising, keeping a
    # long label intact.
    long_a = "concentracion_de_dioxido_de_azufre_libre"
    long_b = "concentracion_de_dioxido_de_azufre_total"
    pairs = [_pair(long_a, long_b, -0.72, "pearson/spearman", 0.0001, True)]
    for i in range(20):
        left = f"variable_numerica_larga_{i:02d}"
        right = f"variable_numerica_larga_{(i + 1) % 20:02d}"
        pairs.append(
            _pair(left, right, 0.55 - i * 0.02, "pearson/spearman", 0.01, True))
    profile = {
        "correlations": {
            "pairs": pairs,
            "multiple_testing": {
                "method": "bh",
                "alpha": 0.05,
                "n_tests": len(pairs),
                "n_rejected": len(pairs),
            },
        },
    }
    chapter = build_correlacion(profile, {})
    assert chapter is not None
    # Trimming the matrix must be announced with a "top-N most connected" note.
    assert "más conectadas" in _note_text(chapter)
    # Anti-cut guarantee at the block level: the long pair reaches the renderer
    # whole (the block never truncates); wrapping is the renderer's job. Both
    # long labels must sit, intact, in one table cell.
    cells = [
        str(cell)
        for table in _tables(chapter)
        for row in table.rows
        for cell in row
    ]
    assert any(long_a in cell and long_b in cell for cell in cells)
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "wide.pdf")
        pptx_path = os.path.join(workdir, "wide.pptx")
        pdf_out = render_automatic_eda_pdf(
            profile, pdf_path, {"write_manifest": False})
        pptx_out = render_automatic_eda_pptx(
            profile, pptx_path, {"write_manifest": False})
        # Both renderers produce a non-empty document (nothing dropped, just
        # wrapped or scaled to fit).
        assert pdf_out["path"] == pdf_path
        assert os.path.exists(pdf_path) and pdf_out["n_pages"] >= 1
        assert pptx_out["path"] == pptx_path
        assert os.path.exists(pptx_path) and pptx_out["n_slides"] >= 1
        # A short, unbreakable fragment of the long label survives the wrap.
        assert "azufre" in _pdf_text(pdf_path)