3be188a921
Añade el parámetro profile_level a render_automatic_eda como preset de consumo CPU/LLM que mapea a los flags existentes (run_models, run_series, run_llm, sample). Tres niveles: - lite (bajo consumo): run_llm=False, run_series=False, sample=2000 y modelos limitados a PCA + normalidad, SIN KMeans ni IsolationForest (lo caro en CPU). Para un vistazo rápido y barato. - standard (default): comportamiento histórico — modelos completos, serie, sin LLM. - full: standard + narrativa LLM por capítulo. Precedencia: un flag explícito del caller (run_llm=..., run_models=..., etc.) siempre prima sobre el default que fija el preset; el preset solo aplica al parámetro que se deja en None. Cableado del modo lite sin tocar profile_table (lo tocan otros agentes en paralelo): profile_table NO corre los modelos (evita pagar KMeans + IsolationForest); este pipeline los corre con run_eda_models(run_kmeans=False, run_isolation=False) reusando ctx['raw_numeric'], y quita raw_numeric del ctx para que el capítulo modelos no reproyecte clusters KMeans en vivo (project_clusters_2d). geo_points ya queda derivado, así que geospatial no se afecta. Cambio aditivo y retro-compatible: sin profile_level el comportamiento es idéntico al de v1.0.0 (standard). Tests nuevos cubren lite/standard, la precedencia flag-sobre-preset, y la equivalencia del default con el histórico. Bump 1.0.0 -> 1.1.0 + growth log en el .md. Skill /eda documenta --lite/--full. Verificación: golden lite/standard/full sobre titanic — lite 4.8s (PCA+norm, sin KMeans/iso/LLM/serie), standard 7.8s (modelos completos), full 38.3s (+LLM). Suite render_automatic_eda + automatic_eda: 96 passed. fn index sin error. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
259 lines
10 KiB
Python
259 lines
10 KiB
Python
"""Test del pipeline render_automatic_eda — EDA completo a PDF + PPTX.
|
|
|
|
Self-contained: crea un DuckDB temporal pequeño con categóricas + fecha + lat/lon
|
|
+ varias numéricas, corre el pipeline (sin LLM) y verifica que emite PDF y PPTX
|
|
con páginas/slides, manifest, y que los capítulos dependientes de ctx quedan
|
|
POBLADOS (sin la nota de degradación).
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
# Put the `python/functions` root (two directories above this test file) on
# sys.path so the absolute `pipelines.*` import further down resolves.
_HERE = os.path.dirname(os.path.abspath(__file__))
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, os.pardir, os.pardir))
if _FUNCTIONS not in sys.path:
    # Prepend so this checkout is searched before other sys.path entries.
    sys.path.insert(0, _FUNCTIONS)
|
|
|
|
import duckdb # noqa: E402
|
|
|
|
from pipelines.render_automatic_eda import render_automatic_eda # noqa: E402
|
|
|
|
|
|
def _make_db(path):
    """Create a small DuckDB file at ``path`` holding a single ``sales`` table.

    The table gets 180 deterministic rows, one per day starting 2024-01-01,
    with columns ``d`` (date), ``region`` and ``channel`` (categoricals cycled
    by row index), ``amount`` and ``units`` (numerics) and ``lat``/``lon``
    (a fixed centre per region plus a small index-based offset).

    The connection is closed in a ``finally`` clause so that a failing
    statement does not leave the database file open.
    """
    from datetime import date, timedelta

    regions = ["norte", "sur", "este"]
    channels = ["web", "tienda"]
    centers = {"norte": (43.0, -3.0), "sur": (37.0, -5.0), "este": (39.5, -0.4)}
    d0 = date(2024, 1, 1)

    # Build the rows first: this is pure computation and needs no connection.
    rows = []
    for i in range(180):
        r = regions[i % 3]
        ch = channels[i % 2]
        clat, clon = centers[r]
        rows.append((
            d0 + timedelta(days=i), r, ch,
            round(100 + (i % 7) * 13.5 + (5 if ch == "web" else 0), 2),
            10 + (i % 5),
            round(clat + (i % 3) * 0.1, 4),
            round(clon + (i % 4) * 0.1, 4),
        ))

    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE sales (d DATE, region VARCHAR, channel VARCHAR, "
            "amount DOUBLE, units INTEGER, lat DOUBLE, lon DOUBLE)"
        )
        con.executemany("INSERT INTO sales VALUES (?,?,?,?,?,?,?)", rows)
    finally:
        # Always release the file handle, even if DDL or the insert raised.
        con.close()
|
|
|
|
|
|
def test_pipeline_emits_pdf_and_pptx_with_chapters(tmp_path):
    """Run the pipeline without LLM and check both output files plus manifest.

    Asserts an ``ok`` status, that the PDF and PPTX paths exist on disk, that
    the reported page and slide counts are positive, and that a manifest file
    was written.
    """
    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "sales",
        run_models=True,
        run_series=True,
        run_llm=False,
        out_dir=out_dir,
        basename="test_sales",
    )
    assert result["status"] == "ok", result.get("error")

    # Both formats produced.
    for path_key in ("pdf_path", "pptx_path"):
        assert result[path_key] and os.path.exists(result[path_key])

    # A missing/None count is treated as zero and therefore fails the check.
    for count_key in ("n_pages", "n_slides"):
        assert (result[count_key] or 0) > 0

    # Per-chapter manifest written by the pipeline.
    assert result["manifest_path"] and os.path.exists(result["manifest_path"])
|
|
|
|
|
|
def test_pipeline_chapters_populated_not_degraded(tmp_path):
    """The four ctx-dependent chapters must be listed in the manifest.

    Runs the pipeline on the ``sales`` fixture with models and series enabled
    (no LLM) and checks that ``modelos``, ``timeseries``, ``geospatial`` and
    ``agregacion`` all appear under the manifest's ``chapters`` entry.
    """
    import json

    db_file = str(tmp_path / "sales.duckdb")
    _make_db(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "sales",
        run_models=True,
        run_series=True,
        run_llm=False,
        out_dir=out_dir,
        basename="t2",
    )
    assert result["status"] == "ok"

    # Load the manifest the pipeline reported and inspect its chapter listing.
    with open(result["manifest_path"], encoding="utf-8") as handle:
        manifest = json.load(handle)
    chapters = manifest.get("chapters") or {}

    expected_ids = ("modelos", "timeseries", "geospatial", "agregacion")
    for cid in expected_ids:
        assert cid in chapters, f"capítulo {cid} ausente del manifest: {list(chapters)}"
|
|
|
|
|
|
def test_pipeline_bad_db_degrades_without_raising(tmp_path):
    """A nonexistent DB/table must yield an error result, not an exception."""
    missing_db = str(tmp_path / "nope.duckdb")
    out_dir = str(tmp_path / "o")

    result = render_automatic_eda(missing_db, "ghost", out_dir=out_dir)

    # The failure is reported through the result dict.
    assert result["status"] == "error"
    assert "error" in result
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# profile_level: preset de bajo consumo CPU/LLM.
|
|
# --------------------------------------------------------------------------- #
|
|
def _make_db_models(path):
    """Create a DuckDB file at ``path`` with a ``pts`` table of 3 gaussian clusters.

    The table has 150 rows with a date ``d``, a group label ``grp`` and three
    continuous numeric columns ``x1``/``x2``/``x3`` drawn around three fixed
    centres (unit standard deviation, values rounded to 4 decimals).

    Per the original author's note, the ``sales`` fixture from ``_make_db``
    leaves only one usable model column after feature selection, which is not
    enough for PCA/KMeans/IsolationForest (they need at least two); this
    fixture provides three continuous numerics so the full mode can run the
    multivariate models.

    The noise comes from a private ``random.Random(42)`` instance, so the data
    is reproducible without reseeding the process-global RNG. The connection
    is closed in a ``finally`` clause so a failing statement does not leave
    the database file open.
    """
    import random
    from datetime import date, timedelta

    # Dedicated generator: same sequence as random.seed(42) followed by
    # random.gauss, but with no side effect on the shared module-level RNG.
    rng = random.Random(42)
    centers = [(0.0, 0.0, 0.0), (10.0, 10.0, 10.0), (20.0, 5.0, 15.0)]
    d0 = date(2024, 1, 1)

    rows = []
    for i in range(150):
        cx, cy, cz = centers[i % 3]
        # Tuple elements evaluate left to right, so the draw order is x1, x2, x3.
        rows.append((
            d0 + timedelta(days=i), f"g{i % 3}",
            round(cx + rng.gauss(0, 1.0), 4),
            round(cy + rng.gauss(0, 1.0), 4),
            round(cz + rng.gauss(0, 1.0), 4),
        ))

    con = duckdb.connect(path)
    try:
        con.execute(
            "CREATE TABLE pts (d DATE, grp VARCHAR, x1 DOUBLE, x2 DOUBLE, x3 DOUBLE)"
        )
        con.executemany("INSERT INTO pts VALUES (?,?,?,?,?)", rows)
    finally:
        # Always release the file handle, even if DDL or the insert raised.
        con.close()
|
|
|
|
|
|
def test_profile_level_lite_skips_expensive_models(tmp_path):
    """lite: the models block carries PCA + normality but no KMeans/IsolationForest.

    Runs the pipeline with ``profile_level="lite"`` on the clustered fixture
    and asserts that the returned profile has ``pca`` and ``normality``
    populated, ``kmeans`` and ``outliers`` set to None, and ``llm`` and
    ``series`` set to None. It also checks that the ``modelos`` chapter is
    still listed in the manifest.
    """
    import json

    db_file = str(tmp_path / "pts.duckdb")
    _make_db_models(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "pts",
        profile_level="lite",
        out_dir=out_dir,
        basename="lite",
    )
    assert result["status"] == "ok", result.get("error")

    profile = result["profile"] or {}
    models = profile.get("models") or {}

    # Cheap models are present...
    assert models.get("pca") is not None, "lite debe traer PCA"
    assert models.get("normality") is not None, "lite debe traer normalidad"
    # ...while the CPU-heavy ones, the LLM layer and the series are absent.
    assert models.get("kmeans") is None, "lite NO debe ejecutar KMeans"
    assert models.get("outliers") is None, "lite NO debe ejecutar IsolationForest"
    assert profile.get("llm") is None, "lite NO debe llamar al LLM"
    assert profile.get("series") is None, "lite NO debe calcular serie"

    # The models chapter is still listed in the manifest.
    with open(result["manifest_path"], encoding="utf-8") as handle:
        manifest = json.load(handle)
    assert "modelos" in (manifest.get("chapters") or {})
|
|
|
|
|
|
def test_profile_level_standard_runs_full_models(tmp_path):
    """standard: full models (KMeans + IsolationForest) and the series are present."""
    db_file = str(tmp_path / "pts.duckdb")
    _make_db_models(db_file)
    out_dir = str(tmp_path / "out")

    result = render_automatic_eda(
        db_file,
        "pts",
        profile_level="standard",
        out_dir=out_dir,
        basename="std",
    )
    assert result["status"] == "ok", result.get("error")

    profile = result["profile"] or {}
    models = profile.get("models") or {}

    assert models.get("pca") is not None
    assert models.get("kmeans") is not None, "standard debe ejecutar KMeans"
    assert models.get("outliers") is not None, "standard debe ejecutar IsolationForest"
    assert profile.get("series") is not None, "standard calcula serie"
|
|
|
|
|
|
def _patch_pipeline_internals(monkeypatch, captured):
    """Stub the pipeline's collaborators for flag-resolution tests.

    Replaces ``profile_table``, ``build_eda_render_ctx`` and the PDF/PPTX
    renderers on the ``pipelines.render_automatic_eda`` module with
    lightweight fakes. The fakes record into ``captured`` the keyword
    arguments of interest (``run_llm``, ``run_models``, ``run_series``,
    ``sample`` and ``base_ctx``), so tests can check how the flags were
    resolved without running the real EDA.
    """
    import pipelines.render_automatic_eda as mod

    def fake_profile_table(db_path, table, **kw):
        # Record the cost-related flags exactly as the pipeline forwarded them.
        for flag in ("run_llm", "run_models", "run_series", "sample"):
            captured[flag] = kw.get(flag)
        return {"status": "ok", "profile": {"columns": []}}

    def fake_ctx(db_path, table, prof, **kw):
        captured["base_ctx"] = kw.get("base_ctx")
        return {}

    def fake_pdf(*args, **kwargs):
        return {"path": "x.pdf", "n_pages": 1, "manifest_path": "m.json"}

    def fake_pptx(*args, **kwargs):
        return {"path": "x.pptx", "n_slides": 1}

    replacements = {
        "profile_table": fake_profile_table,
        "build_eda_render_ctx": fake_ctx,
        "render_automatic_eda_pdf": fake_pdf,
        "render_automatic_eda_pptx": fake_pptx,
    }
    for attr_name, stub in replacements.items():
        monkeypatch.setattr(mod, attr_name, stub)
|
|
|
|
|
|
def test_explicit_flag_overrides_preset(monkeypatch):
    """Precedence: ``profile_level='lite'`` with explicit ``run_llm=True`` keeps the LLM on.

    The caller's explicit flag must win over the preset default. This is
    checked both on the ``run_llm`` value forwarded to ``profile_table`` and
    on the ``run_cluster_llm`` entry of the ``base_ctx`` handed to the
    render-context builder.
    """
    captured = {}
    _patch_pipeline_internals(monkeypatch, captured)

    # Start from an empty capture dict before invoking the pipeline.
    captured.clear()
    render_automatic_eda("db", "t", profile_level="lite", run_llm=True)

    assert captured["run_llm"] is True, "flag explícito debe primar sobre preset lite"
    base_ctx = captured["base_ctx"] or {}
    assert base_ctx.get("run_cluster_llm") is True
|
|
|
|
|
|
def test_full_preset_enables_llm(monkeypatch):
    """full: the preset resolves ``run_llm=True`` and sets ``run_cluster_llm`` in the ctx."""
    captured = {}
    _patch_pipeline_internals(monkeypatch, captured)

    # Start from an empty capture dict before invoking the pipeline.
    captured.clear()
    render_automatic_eda("db", "t", profile_level="full")

    assert captured["run_llm"] is True
    base_ctx = captured["base_ctx"] or {}
    assert base_ctx.get("run_cluster_llm") is True
|
|
|
|
|
|
def test_no_profile_level_defaults_to_standard(monkeypatch):
    """Backward compatibility: no ``profile_level`` and no flags gives the standard defaults.

    Expects ``run_models`` True, ``run_series`` True, ``run_llm`` False and
    ``sample`` 5000 to reach ``profile_table``. The original author describes
    these as the defaults the pipeline had before ``profile_level`` existed.
    """
    captured = {}
    _patch_pipeline_internals(monkeypatch, captured)

    # Start from an empty capture dict before invoking the pipeline.
    captured.clear()
    render_automatic_eda("db", "t")  # no profile_level and no cost flags

    # Identity checks: the flags must be real booleans, not merely truthy/falsy.
    expected_flags = (
        ("run_models", True),
        ("run_series", True),
        ("run_llm", False),
    )
    for flag, expected in expected_flags:
        assert captured[flag] is expected
    assert captured["sample"] == 5000
|
|
|
|
|
|
def test_lite_preset_defaults(monkeypatch):
    """lite defaults: ``run_llm``/``run_series`` are False and ``sample`` drops to 2000."""
    captured = {}
    _patch_pipeline_internals(monkeypatch, captured)

    # Start from an empty capture dict before invoking the pipeline.
    captured.clear()
    render_automatic_eda("db", "t", profile_level="lite")

    for flag in ("run_llm", "run_series"):
        assert captured[flag] is False
    assert captured["sample"] == 2000
|