merge: capitulo AutomaticEDA correlacion (verificado met)
This commit is contained in:
@@ -0,0 +1,352 @@
|
|||||||
|
"""Correlation chapter — association matrix plus top positive/negative pairs.
|
||||||
|
|
||||||
|
Builds the CORRELACION chapter of an AutomaticEDA document from a TableProfile.
|
||||||
|
It renders exactly what the user asked for:
|
||||||
|
|
||||||
|
1. A correlation/association **matrix** (heatmap) reconstructed from the evaluated
|
||||||
|
pairs, signed for numeric-numeric pairs (Pearson/Spearman, ``[-1, 1]``) and as
|
||||||
|
magnitude for the mixed-type metrics (Cramér's V, correlation ratio, mutual
|
||||||
|
information, ``[0, 1]``). Labels are ordered by total connectivity so strong
|
||||||
|
associations cluster together instead of being scattered alphabetically.
|
||||||
|
2. The **TOP positive** pairs and the **TOP negative** pairs as two separate
|
||||||
|
tables. Only numeric-numeric metrics carry a sign, so negative pairs are by
|
||||||
|
construction Pearson/Spearman; positive pairs may use any method.
|
||||||
|
3. The methods legend and the multiple-testing (FDR) summary, so the reader sees
|
||||||
|
how many pairs survive the correction.
|
||||||
|
4. A spuriousness caveat when the profile flags level-based correlations on
|
||||||
|
non-stationary series (Granger–Newbold).
|
||||||
|
|
||||||
|
All data comes from ``profile['correlations']`` — the output of the ``eda`` group
|
||||||
|
function ``association_matrix`` (optionally enriched by ``profile_table``). The
|
||||||
|
chapter never recomputes any statistic; it only lays the existing values out as
|
||||||
|
format-independent blocks. The renderers paginate tables (repeating the header)
|
||||||
|
and scale the heatmap to fit entirely, so nothing is ever cut.
|
||||||
|
|
||||||
|
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
|
||||||
|
from .. import model
|
||||||
|
|
||||||
|
# Version string stamped on the Chapter returned by ``build_correlacion``.
CHAPTER_VERSION = "1.0.0"
# Identifier and display title passed to the Chapter returned by
# ``build_correlacion``.
CHAPTER_ID = "correlacion"
CHAPTER_TITLE = "Correlación"

# Methods whose value carries a sign (direction). Everything else is a magnitude
# in [0, 1] and therefore only ever contributes to the positive side.
# ``_is_signed`` matches these as case-insensitive substrings of the pair's
# method string, so a combined name such as "pearson/spearman" also counts.
_SIGNED_METHODS = ("pearson", "spearman")

# Cap the heatmap to the most-connected variables so it stays legible on a phone
# screen / a slide. The renderer would scale a bigger matrix to fit, but the
# cells become unreadable; we instead show the top-N and say so.
_MAX_MATRIX_LABELS = 16

# How many pairs to show in each of the top-positive / top-negative tables.
_TOP_N = 10
|
||||||
|
|
||||||
|
|
||||||
|
def _is_num(v) -> bool:
|
||||||
|
"""True for a real, finite int/float (not bool, not NaN/inf)."""
|
||||||
|
return (
|
||||||
|
isinstance(v, (int, float))
|
||||||
|
and not isinstance(v, bool)
|
||||||
|
and not (isinstance(v, float) and (math.isnan(v) or math.isinf(v)))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_val(value, decimals: int = 2) -> str:
    """Format an association value as a compact, explicitly signed string.

    Args:
        value: The association value. Anything ``_is_num`` rejects (None,
            bool, NaN, inf, non-numbers) is rendered as an em dash.
        decimals: Number of digits after the decimal point.

    Returns:
        ``"—"`` for unusable values; otherwise the value with a leading ``+``
        or ``-``, except that anything that rounds to zero is shown unsigned
        (``"0.00"``) for readability.
    """
    if not _is_num(value):
        return "—"
    text = f"{float(value):+.{decimals}f}"
    # The "+" format flag guarantees exactly one leading sign character.
    unsigned = text[1:]
    # A rounded zero is made only of "0" and "." whatever ``decimals`` is;
    # comparing against the literal "+0.00" only worked for decimals == 2.
    if not unsigned.strip("0."):
        return unsigned
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_p(value) -> str:
    """Render an adjusted p-value for a table cell.

    Unusable values (as judged by ``_is_num``) become an em dash, values below
    0.001 collapse to the threshold string ``"<0.001"``, and everything else
    is printed with three decimals.
    """
    if _is_num(value):
        prob = float(value)
        return "<0.001" if prob < 0.001 else f"{prob:.3f}"
    return "—"
|
||||||
|
|
||||||
|
|
||||||
|
def _is_signed(pair: dict) -> bool:
    """Report whether the pair's method yields a directional (signed) value.

    The pair's ``"method"`` entry is lower-cased and searched for each name in
    ``_SIGNED_METHODS`` as a substring; a missing or falsy method counts as
    unsigned.
    """
    raw = pair.get("method")
    name = str(raw if raw else "").lower()
    for token in _SIGNED_METHODS:
        if token in name:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _significant(pair: dict) -> bool:
|
||||||
|
"""True if the pair is significant after FDR (or has no test to correct)."""
|
||||||
|
if pair.get("significant") is True:
|
||||||
|
return True
|
||||||
|
# Pairs without an applicable test (p_value None) are not penalised: they are
|
||||||
|
# admitted on magnitude alone upstream, so treat missing as "not rejected".
|
||||||
|
return pair.get("p_value") is None and pair.get("significant") is None
|
||||||
|
|
||||||
|
|
||||||
|
def _label(pair: dict) -> str:
    """Build the display label of a pair, e.g. ``'alcohol ↔ density'``.

    Both endpoints (``"a"`` and ``"b"``) go through ``model._safe_str`` before
    being joined with a double-headed arrow.
    """
    left = model._safe_str(pair.get("a"))
    right = model._safe_str(pair.get("b"))
    return f"{left} ↔ {right}"
|
||||||
|
|
||||||
|
|
||||||
|
def _split_top(pairs: list, top_n: int = _TOP_N):
    """Rank the evaluated pairs into top-positive and top-negative lists.

    Entries that are not dicts, or whose ``"value"`` is not a finite number,
    are ignored. Positive pairs are any pairs with a value above zero, sorted
    from largest to smallest. Negative pairs are restricted to signed methods
    (see ``_is_signed``) with a value below zero, sorted most-negative first.
    Zero-valued pairs land in neither list.

    Returns:
        A ``(positive, negative)`` tuple, each list cut to ``top_n`` entries.
    """
    def _value_of(item):
        return float(item.get("value", 0.0))

    usable = [
        item for item in pairs
        if isinstance(item, dict) and _is_num(item.get("value"))
    ]
    rising = [item for item in usable if item.get("value") > 0]
    falling = [
        item for item in usable
        if item.get("value") < 0 and _is_signed(item)
    ]
    # sorted() is stable, so ties keep their original relative order.
    best_positive = sorted(rising, key=_value_of, reverse=True)
    best_negative = sorted(falling, key=_value_of)
    return best_positive[:top_n], best_negative[:top_n]
|
||||||
|
|
||||||
|
|
||||||
|
def _top_table(pairs: list, title: str):
|
||||||
|
"""Build a DataTable for a list of pairs, or None if there are none."""
|
||||||
|
if not pairs:
|
||||||
|
return None
|
||||||
|
header = ["Par", "Método", "Valor", "p (FDR)", "Sig."]
|
||||||
|
rows = []
|
||||||
|
for pair in pairs:
|
||||||
|
method = model._safe_str(pair.get("method")) or "—"
|
||||||
|
rows.append([
|
||||||
|
_label(pair),
|
||||||
|
method,
|
||||||
|
_fmt_val(pair.get("value")),
|
||||||
|
_fmt_p(pair.get("p_value_adjusted")),
|
||||||
|
"sí" if _significant(pair) else "no",
|
||||||
|
])
|
||||||
|
return model.DataTable(header=header, rows=rows, title=title)
|
||||||
|
|
||||||
|
|
||||||
|
def _ordered_labels(pairs: list):
|
||||||
|
"""Pick and order the matrix labels by total connectivity (descending).
|
||||||
|
|
||||||
|
Returns the list of variable names to place on the axes, capped at
|
||||||
|
``_MAX_MATRIX_LABELS`` (the most-connected ones), plus a boolean saying
|
||||||
|
whether the cap trimmed anything.
|
||||||
|
"""
|
||||||
|
strength = {}
|
||||||
|
for pair in pairs:
|
||||||
|
if not isinstance(pair, dict):
|
||||||
|
continue
|
||||||
|
value = pair.get("value")
|
||||||
|
if not _is_num(value):
|
||||||
|
continue
|
||||||
|
mag = abs(float(value))
|
||||||
|
for key in ("a", "b"):
|
||||||
|
name = pair.get(key)
|
||||||
|
if name is None:
|
||||||
|
continue
|
||||||
|
strength[name] = strength.get(name, 0.0) + mag
|
||||||
|
if not strength:
|
||||||
|
return [], False
|
||||||
|
ordered = sorted(strength, key=lambda n: strength[n], reverse=True)
|
||||||
|
trimmed = len(ordered) > _MAX_MATRIX_LABELS
|
||||||
|
return ordered[:_MAX_MATRIX_LABELS], trimmed
|
||||||
|
|
||||||
|
|
||||||
|
def _matrix_figure(pairs: list, labels: list):
|
||||||
|
"""Return a Figure (lazy) with the signed association heatmap, or None.
|
||||||
|
|
||||||
|
The matplotlib figure is built lazily inside ``make`` so importing this
|
||||||
|
module never requires matplotlib and a malformed plot degrades to nothing
|
||||||
|
instead of aborting the chapter.
|
||||||
|
"""
|
||||||
|
if len(labels) < 2:
|
||||||
|
return None
|
||||||
|
|
||||||
|
index = {name: i for i, name in enumerate(labels)}
|
||||||
|
|
||||||
|
def make():
|
||||||
|
import numpy as np
|
||||||
|
from matplotlib.figure import Figure
|
||||||
|
|
||||||
|
n = len(labels)
|
||||||
|
grid = np.full((n, n), np.nan, dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
grid[i, i] = 1.0
|
||||||
|
for pair in pairs:
|
||||||
|
if not isinstance(pair, dict):
|
||||||
|
continue
|
||||||
|
a = pair.get("a")
|
||||||
|
b = pair.get("b")
|
||||||
|
value = pair.get("value")
|
||||||
|
if a not in index or b not in index or not _is_num(value):
|
||||||
|
continue
|
||||||
|
v = float(value)
|
||||||
|
# Mixed-type magnitudes are non-negative; keep them as-is on [0, 1].
|
||||||
|
ia, ib = index[a], index[b]
|
||||||
|
grid[ia, ib] = v
|
||||||
|
grid[ib, ia] = v
|
||||||
|
|
||||||
|
import matplotlib
|
||||||
|
|
||||||
|
masked = np.ma.masked_invalid(grid)
|
||||||
|
fig = Figure(figsize=(6.2, 5.6))
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
|
cmap = matplotlib.colormaps["RdBu_r"].copy()
|
||||||
|
cmap.set_bad(color="#eeeeee")
|
||||||
|
im = ax.imshow(masked, cmap=cmap, vmin=-1.0, vmax=1.0, aspect="auto")
|
||||||
|
ax.set_xticks(range(n))
|
||||||
|
ax.set_yticks(range(n))
|
||||||
|
short = [str(s)[:14] for s in labels]
|
||||||
|
ax.set_xticks(range(n))
|
||||||
|
ax.set_xticklabels(short, rotation=90, fontsize=7)
|
||||||
|
ax.set_yticklabels(short, fontsize=7)
|
||||||
|
# Annotate cells only when the matrix is small enough to stay legible.
|
||||||
|
if n <= 8:
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
cell = grid[i, j]
|
||||||
|
if _is_num(cell):
|
||||||
|
ax.text(j, i, f"{cell:+.2f}".replace("+", "") if cell < 0
|
||||||
|
else f"{cell:.2f}",
|
||||||
|
ha="center", va="center", fontsize=6,
|
||||||
|
color="#222222")
|
||||||
|
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04,
|
||||||
|
label="asociación (signo en num-num)")
|
||||||
|
fig.tight_layout()
|
||||||
|
return fig
|
||||||
|
|
||||||
|
return model.Figure(make=make,
|
||||||
|
caption="Matriz de asociación. Azul = positiva, rojo = "
|
||||||
|
"negativa (sólo num-num lleva signo); gris = par "
|
||||||
|
"no evaluado.")
|
||||||
|
|
||||||
|
|
||||||
|
def _methods_block(corr: dict):
|
||||||
|
"""Build a KVTable with the legend of the methods actually present."""
|
||||||
|
legend = corr.get("methods_legend")
|
||||||
|
if not isinstance(legend, dict) or not legend:
|
||||||
|
return None
|
||||||
|
rows = [(model._safe_str(k), model._safe_str(v)) for k, v in legend.items()]
|
||||||
|
return model.KVTable(rows=rows, title="Métodos de asociación")
|
||||||
|
|
||||||
|
|
||||||
|
def _fdr_text(corr: dict) -> str | None:
|
||||||
|
"""One-line summary of the multiple-testing (FDR) correction, or None."""
|
||||||
|
mt = corr.get("multiple_testing")
|
||||||
|
if not isinstance(mt, dict) or not mt:
|
||||||
|
return None
|
||||||
|
method = model._safe_str(mt.get("method")).upper() or "FDR"
|
||||||
|
alpha = mt.get("alpha")
|
||||||
|
n_tests = mt.get("n_tests")
|
||||||
|
n_rej = mt.get("n_rejected")
|
||||||
|
parts = [f"Corrección por comparaciones múltiples ({method}"]
|
||||||
|
if _is_num(alpha):
|
||||||
|
parts[0] += f", α={float(alpha):g}"
|
||||||
|
parts[0] += ")."
|
||||||
|
if _is_num(n_tests):
|
||||||
|
rej = n_rej if _is_num(n_rej) else "—"
|
||||||
|
parts.append(
|
||||||
|
f"De {int(n_tests)} pares con test, {rej} siguen siendo "
|
||||||
|
f"significativos tras la corrección.")
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def build_correlacion(profile: dict, ctx: dict):
    """Build the Correlation chapter, or None if there is nothing to show.

    Args:
        profile: Table profile; only ``profile["correlations"]`` is read
            (its ``pairs``, ``levels_caveat``, ``levels_possible_spurious``,
            ``multiple_testing`` and ``methods_legend`` entries).
        ctx: Accepted for the ``build_<id>(profile, ctx)`` contract; no key
            is consumed by this chapter.

    Returns:
        A ``model.Chapter`` made of an intro paragraph followed by, when
        available: the association heatmap (plus a note if it was capped),
        the top-positive and top-negative tables (or a note that there are
        no negatives), the spuriousness caveat, the FDR summary and the
        methods legend. Returns None when ``profile`` or its ``correlations``
        block is not a dict, when ``pairs`` is not a non-empty list, or when
        none of the sections above could be produced — so the chapter is
        omitted rather than rendered with only its intro.
    """
    # Any non-dict profile (None, list, str...) simply yields no chapter;
    # calling .get on it would otherwise raise.
    if not isinstance(profile, dict):
        return None
    corr = profile.get("correlations")
    if not isinstance(corr, dict):
        return None
    pairs = corr.get("pairs")
    if not isinstance(pairs, list) or not pairs:
        return None

    # Sections are collected apart from the intro so that "nothing to show"
    # can actually be detected below.
    content: list = []

    # 1) Association matrix (heatmap).
    labels, trimmed = _ordered_labels(pairs)
    fig = _matrix_figure(pairs, labels)
    if fig is not None:
        content.append(model.Heading(text="Matriz de asociación", level=2))
        content.append(fig)
        if trimmed:
            content.append(model.Note(text=(
                f"Se muestran las {len(labels)} variables más conectadas de la "
                "matriz para mantenerla legible; el resto de pares siguen en las "
                "tablas de abajo.")))

    # 2) Top positive / top negative pairs.
    positive, negative = _split_top(pairs, _TOP_N)
    pos_table = _top_table(positive, f"Top {len(positive)} positivas")
    neg_table = _top_table(negative, f"Top {len(negative)} negativas")
    if pos_table is not None:
        content.append(model.Heading(text="Pares más correlacionados (positivos)",
                                     level=2))
        content.append(pos_table)
    if neg_table is not None:
        content.append(model.Heading(text="Pares más correlacionados (negativos)",
                                     level=2))
        content.append(neg_table)
    elif pos_table is not None:
        # No signed-negative pairs at all: say so honestly rather than omit.
        content.append(model.Note(text=(
            "No se han hallado correlaciones negativas significativas entre "
            "columnas numéricas.")))

    # 3) Spuriousness caveat for level-based correlations (Granger–Newbold).
    # A caveat text supplied by the profile wins over the generic warning.
    caveat = corr.get("levels_caveat")
    if isinstance(caveat, str) and caveat.strip():
        content.append(model.Note(text=caveat.strip()))
    elif corr.get("levels_possible_spurious"):
        content.append(model.Note(text=(
            "Aviso: algunas correlaciones se calcularon sobre niveles de series "
            "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
            "sobre los retornos/diferencias antes de interpretarlas.")))

    # 4) FDR summary + methods legend.
    fdr_text = _fdr_text(corr)
    if fdr_text:
        content.append(model.Markdown(text=fdr_text))
    methods = _methods_block(corr)
    if methods is not None:
        content.append(model.Heading(text="Métodos y leyenda", level=2))
        content.append(methods)

    if not content:
        # Every pair was unusable and the profile carries no caveat, FDR
        # summary or legend: omit the chapter instead of emitting a lone intro.
        return None

    # Intro: what this chapter shows and how to read the sign.
    intro = model.Markdown(text=(
        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
        "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
        "entre categóricas; razón de correlación num-categórica; información mutua "
        "como medida común no lineal). Sólo las correlaciones **num-num** tienen "
        "dirección: por eso los pares **negativos** son siempre num-num."))
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=[intro, *content])
|
||||||
@@ -0,0 +1,175 @@
|
|||||||
|
"""Tests for the CORRELACION chapter — DoD: golden + edges + error/anti-cut.
|
||||||
|
|
||||||
|
Self-contained: builds a synthetic TableProfile carrying a ``correlations`` block
|
||||||
|
shaped exactly like ``association_matrix`` output (no DuckDB), so the suite is
|
||||||
|
fast and deterministic. Verifies that the chapter emits the association-matrix
|
||||||
|
figure plus separate top-positive / top-negative tables with the right pairs,
|
||||||
|
that it returns None when the profile has no pairs, that a None/empty profile
|
||||||
|
does not raise, and that a wide matrix with long labels renders to PDF *and* PPTX
|
||||||
|
without cutting anything.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
from datascience.automatic_eda.chapters.correlacion import (
|
||||||
|
CHAPTER_VERSION,
|
||||||
|
build_correlacion,
|
||||||
|
)
|
||||||
|
from datascience.automatic_eda.model import DataTable, Figure
|
||||||
|
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||||
|
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||||
|
|
||||||
|
|
||||||
|
def _pair(a, b, value, method, padj, sig, p=0.0001):
|
||||||
|
return {
|
||||||
|
"a": a, "b": b, "a_type": "numeric", "b_type": "numeric",
|
||||||
|
"method": method, "value": value, "extra": {"mi": abs(value) * 0.5},
|
||||||
|
"p_value": p, "p_value_adjusted": padj, "significant": sig,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _profile() -> dict:
    """Return a synthetic wine-like profile with signed and unsigned pairs.

    Four numeric-numeric pairs (two positive, two negative) plus one
    categorical Cramér's V pair, together with a methods legend and a
    multiple-testing summary.
    """
    signed_specs = [
        ("alcohol", "quality", 0.48, 0.0005),
        ("density", "alcohol", -0.78, 0.0001),
        ("ph", "fixed_acidity", -0.68, 0.0002),
        ("sulphates", "quality", 0.25, 0.03),
    ]
    pairs = [
        _pair(left, right, value, "pearson/spearman", padj, True)
        for left, right, value, padj in signed_specs
    ]
    # Unsigned mixed-type metrics: only ever positive, never in the neg table.
    pairs.append({
        "a": "region", "b": "type", "a_type": "categorical",
        "b_type": "categorical", "method": "cramers_v", "value": 0.55,
        "extra": {"mi": 0.3}, "p_value": 0.001, "p_value_adjusted": 0.004,
        "significant": True,
    })
    strong = [item for item in pairs if abs(item["value"]) >= 0.5]
    legend = {
        "pearson": "num-num lineal (Pearson r), [-1, 1]",
        "cramers_v": "cat-cat simétrica (Cramér's V), [0, 1]",
    }
    correction = {"method": "bh", "alpha": 0.05, "n_tests": 5, "n_rejected": 5}
    return {
        "table": "wine",
        "source": "/data/wine.csv",
        "n_rows": 1599,
        "n_cols": 12,
        "correlations": {
            "pairs": pairs,
            "strong": strong,
            "methods_legend": legend,
            "multiple_testing": correction,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_text(path: str) -> str:
    """Extract all text of the PDF at ``path`` with whitespace collapsed.

    Page texts are concatenated (pages without text contribute nothing) and
    every run of whitespace is replaced by a single space.
    """
    reader = PdfReader(path)
    chunks = [page.extract_text() or "" for page in reader.pages]
    return re.sub(r"\s+", " ", "".join(chunks))
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_chapter_tiene_matriz_y_top_positivos_y_negativos():
    """Golden path: the chapter carries the heatmap and both top tables."""
    chapter = build_correlacion(_profile(), {})
    assert chapter is not None
    assert chapter.id == "correlacion"
    assert chapter.version == CHAPTER_VERSION

    # Association matrix heatmap, exposed as a lazy figure.
    assert "figure" in [block.kind for block in chapter.blocks]
    figures = [block for block in chapter.blocks if isinstance(block, Figure)]
    assert figures and figures[0].make is not None

    # Top positive + top negative tables.
    tables = [block for block in chapter.blocks if isinstance(block, DataTable)]
    assert len(tables) >= 2
    cells = [str(cell) for table in tables for row in table.rows for cell in row]
    flat = " ".join(cells)
    # Strongest positive present and signed +, strongest negative present and -.
    for needle in ("alcohol", "quality", "+0.48", "density", "-0.78"):
        assert needle in flat
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_render_pdf_y_pptx_muestran_lo_exigido():
    """Golden path end to end: both renderers include the chapter content."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "corr.pdf")
        pptx_path = os.path.join(workdir, "corr.pptx")
        pdf_report = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA — wine"})
        pptx_report = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA — wine"})

        assert pdf_report["path"] == pdf_path
        assert pdf_report["n_pages"] >= 1
        assert pptx_report["path"] == pptx_path
        assert pptx_report["n_slides"] >= 1
        assert "correlacion" in [c["id"] for c in pdf_report["chapters"]]
        assert "correlacion" in [c["id"] for c in pptx_report["chapters"]]

        text = _pdf_text(pdf_path)
        # The requirement: matrix + top positive/negative pairs, all visible.
        # Chapter title first (accents may vary in the extracted text).
        assert "Correlaci" in text
        for needle in ("density", "alcohol", "quality", "0.78", "0.48"):
            assert needle in text
        # Both signs surfaced as separate sections.
        lowered = text.lower()
        assert "positiv" in lowered
        assert "negativ" in lowered
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_sin_pares_devuelve_none():
    """No correlations key, empty pairs and wrong types yield None, not errors."""
    cases = [
        ({"table": "x"}, {}),
        ({"correlations": {}}, {}),
        ({"correlations": {"pairs": []}}, {}),
        ({"correlations": {"pairs": "nope"}}, {}),
        (None, None),
        ({}, {}),
    ]
    for profile, ctx in cases:
        assert build_correlacion(profile, ctx) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_solo_positivos_emite_nota_sin_tabla_negativa():
    """Only positive pairs: one table plus an explicit 'no negatives' note."""
    categorical_pair = {
        "a": "c", "b": "d", "a_type": "categorical",
        "b_type": "categorical", "method": "cramers_v", "value": 0.7,
        "extra": {"mi": 0.4}, "p_value": 0.001,
        "p_value_adjusted": 0.003, "significant": True,
    }
    numeric_pair = _pair("a", "b", 0.6, "pearson/spearman", 0.001, True)
    profile = {"correlations": {"pairs": [numeric_pair, categorical_pair]}}

    chapter = build_correlacion(profile, {})
    assert chapter is not None
    # Only the positive table.
    tables = [block for block in chapter.blocks if isinstance(block, DataTable)]
    assert len(tables) == 1
    # Honest "no negative correlations" note.
    note_texts = [block.text for block in chapter.blocks if block.kind == "note"]
    assert "negativas" in " ".join(note_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
    """Wide matrix with long labels: trimmed heatmap, nothing cut in tables.

    Twenty long-named numeric variables force the matrix to be capped to the
    top-N, and both renderers must lay the chapter out without raising while
    keeping a long label intact.
    """
    long_a = "concentracion_de_dioxido_de_azufre_libre"
    long_b = "concentracion_de_dioxido_de_azufre_total"
    pairs = [_pair(long_a, long_b, -0.72, "pearson/spearman", 0.0001, True)]
    pairs.extend(
        _pair(f"variable_numerica_larga_{i:02d}",
              f"variable_numerica_larga_{(i + 1) % 20:02d}",
              0.55 - i * 0.02, "pearson/spearman", 0.01, True)
        for i in range(20)
    )
    correction = {"method": "bh", "alpha": 0.05,
                  "n_tests": len(pairs), "n_rejected": len(pairs)}
    profile = {"correlations": {"pairs": pairs, "multiple_testing": correction}}

    chapter = build_correlacion(profile, {})
    assert chapter is not None
    # A "showing top-N most connected" note appears when the matrix is trimmed.
    note_texts = [block.text for block in chapter.blocks if block.kind == "note"]
    assert "más conectadas" in " ".join(note_texts)

    # Anti-cut guarantee at the block level: the long pair reaches the renderer
    # whole (the block never truncates); the renderer then wraps the cell inside
    # its column. Both long labels are present, intact, in a table cell.
    tables = [block for block in chapter.blocks if isinstance(block, DataTable)]
    cells = [str(cell) for table in tables for row in table.rows for cell in row]
    assert any(long_a in cell and long_b in cell for cell in cells)

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "wide.pdf")
        pptx_path = os.path.join(workdir, "wide.pptx")
        pdf_report = render_automatic_eda_pdf(
            profile, pdf_path, {"write_manifest": False})
        pptx_report = render_automatic_eda_pptx(
            profile, pptx_path, {"write_manifest": False})
        # Both renderers lay the wide chapter out without raising and produce a
        # non-empty document (nothing dropped, just wrapped/scaled to fit).
        assert pdf_report["path"] == pdf_path
        assert os.path.exists(pdf_path)
        assert pdf_report["n_pages"] >= 1
        assert pptx_report["path"] == pptx_path
        assert os.path.exists(pptx_path)
        assert pptx_report["n_slides"] >= 1
        # A short, unbreakable fragment of the long label survives the wrap.
        assert "azufre" in _pdf_text(pdf_path)
|
||||||
Reference in New Issue
Block a user