feat(eda): generadores de datasets sintéticos Faker que ejercitan el AutomaticEDA

Añade dos funciones impuras dict-no-throw, deterministas por seed, al dominio datascience (grupo eda):

- generate_synthetic_eda_table: una tabla DuckDB de 19 columnas (numéricas correlacionadas + outliers, categóricas desbalanceadas, texto largo multi-idioma es/en/fr, fecha DATE, lat/lon válidas, PII email/iban/phone/uuid, nulos con patrón MCAR/MAR co-ocurrentes). Activa 14 capítulos del motor AutomaticEDA (num_distr, cat_distr, text_distr, calidad, missingness, correlacion, relaciones, modelos, timeseries, geospatial, agregacion, glosario + portada/overview).
- generate_synthetic_eda_folder: 3 CSV relacionados (customers/orders/reviews) con FK customer detectable por containment, para el EDA de carpeta multi-tabla.

Determinismo vía Faker.seed_instance + numpy.random.default_rng.

Tests: 16 passed (incluye determinismo por hash, rangos lat/lon, co-nulos income/spending, mediana palabras review >=20, phone formato internacional, FK containment).

Añade faker (40.27.0) a python/pyproject.toml + uv.lock.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 21:25:31 +02:00
24 changed files with 884 additions and 2591 deletions
@@ -77,8 +77,12 @@ from .add_pdf_internal_links import add_pdf_internal_links
 from .suggest_intratable_fk_candidates import suggest_intratable_fk_candidates
 from .render_paper_pdf import render_paper_pdf
 from .draw_join_graph_figure import draw_join_graph_figure
+from .generate_synthetic_eda_table import generate_synthetic_eda_table
+from .generate_synthetic_eda_folder import generate_synthetic_eda_folder

 __all__ = [
+    "generate_synthetic_eda_table",
+    "generate_synthetic_eda_folder",
    "render_paper_pdf",
    "draw_join_graph_figure",
    "suggest_intratable_fk_candidates",
@@ -1,109 +0,0 @@
-"""Tests del filtro `only` de build_document (selección de capítulos).
-
-Verifican que:
-  - only=None mantiene el comportamiento histórico (todos los capítulos).
-  - only=[ids] restringe el CUERPO a esos ids, pero portada (primera) y glosario
-    (última) están SIEMPRE presentes.
-  - only=[] produce el documento mínimo (solo portada + glosario).
-  - la selección también viaja por la clave reservada ctx['_only_chapters']
-    (el canal que usan los renderers, que llaman build_document sin `only`), y
-    esa clave nunca se filtra a los capítulos.
-"""
-
-import os
-import sys
-
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))  # python/functions
-if _FUNCTIONS not in sys.path:
-    sys.path.insert(0, _FUNCTIONS)
-
-from datascience.automatic_eda import build_document  # noqa: E402
-
-
-def _profile_with_cat_and_num():
-    """Perfil mínimo que hace construir cat_distr y num_distr (cuerpo no vacío)."""
-    return {
-        "table": "ventas", "n_rows": 120, "n_cols": 2, "quality_score": 91,
-        "duplicate_pct": 1.5, "null_cell_pct": 0.8,
-        "columns": [
-            {"name": "region", "inferred_type": "categorical",
-             "categorical": {
-                 "top": [{"value": "norte", "count": 50, "pct": 0.42},
-                         {"value": "sur", "count": 40, "pct": 0.33},
-                         {"value": "este", "count": 30, "pct": 0.25}],
-                 "mode": "norte", "n_distinct": 3, "entropy": 1.55,
-                 "imbalance": 0.1}},
-            {"name": "importe", "inferred_type": "numeric",
-             "numeric": {"mean": 50.0, "median": 48.0, "std": 10.0,
-                         "min": 10, "max": 99, "iqr": 15,
-                         "histogram": [{"lo": 0, "hi": 50, "count": 40},
-                                       {"lo": 50, "hi": 100, "count": 80}]}},
-        ],
-    }
-
-
-def test_only_none_is_full_document():
-    """Retro-compat: sin `only`, salen todos los capítulos aplicables."""
-    chs = build_document(_profile_with_cat_and_num(), ctx={"dataset_name": "v"})
-    ids = [c.id for c in chs]
-    assert ids[0] == "portada"
-    assert ids[-1] == "glosario"
-    # El cuerpo trae las distribuciones (cat/num), no solo portada+glosario.
-    assert "num_distr" in ids
-    assert "cat_distr" in ids
-
-
-def test_only_restricts_body_but_keeps_cover_and_glossary():
-    # cat_distr registra el término "entropía" en el glosario, así que el
-    # glosario (destino del término clicable) aparece — demuestra el contrato
-    # "portada primera + capítulo + glosario última".
-    chs = build_document(_profile_with_cat_and_num(),
-                         ctx={"dataset_name": "v"}, only=["cat_distr"])
-    ids = [c.id for c in chs]
-    assert ids[0] == "portada", f"portada no es la primera: {ids}"
-    assert ids[-1] == "glosario", f"glosario no es la última: {ids}"
-    assert "cat_distr" in ids
-    # num_distr quedó fuera de la selección.
-    assert "num_distr" not in ids
-
-
-def test_only_empty_yields_minimal_document():
-    # only=[] -> cuerpo vacío. La portada está siempre; el glosario solo aparece
-    # si algún capítulo registró términos (patrón preexistente: glosario vacío se
-    # omite). Sin cuerpo no hay términos → documento mínimo = solo portada.
-    chs = build_document(_profile_with_cat_and_num(),
-                         ctx={"dataset_name": "v"}, only=[])
-    ids = [c.id for c in chs]
-    assert ids == ["portada"], \
-        f"only=[] debe dar el documento mínimo (solo portada), no {ids}"
-
-
-def test_selection_via_reserved_ctx_key():
-    """La selección viaja por ctx['_only_chapters'] cuando no se pasa `only`."""
-    chs = build_document(_profile_with_cat_and_num(),
-                         ctx={"dataset_name": "v",
-                              "_only_chapters": ["cat_distr"]})
-    ids = [c.id for c in chs]
-    assert "cat_distr" in ids
-    assert "num_distr" not in ids
-    assert ids[0] == "portada" and ids[-1] == "glosario"
-
-
-def test_explicit_only_arg_wins_over_ctx_key():
-    """Si se pasan ambos, el argumento `only` manda sobre la clave del ctx."""
-    chs = build_document(_profile_with_cat_and_num(),
-                         ctx={"dataset_name": "v",
-                              "_only_chapters": ["cat_distr"]},
-                         only=["num_distr"])
-    ids = [c.id for c in chs]
-    assert "num_distr" in ids
-    assert "cat_distr" not in ids
-
-
-def test_reserved_key_not_leaked_to_caller_ctx():
-    """build_document no muta el ctx del caller (copia interna)."""
-    ctx = {"dataset_name": "v", "_only_chapters": ["num_distr"]}
-    build_document(_profile_with_cat_and_num(), ctx=ctx)
-    # La clave reservada sigue en el dict del caller (no se mutó su copia).
-    assert ctx["_only_chapters"] == ["num_distr"]
@@ -1,205 +0,0 @@
-"""chapter_deps — mapa central de dependencias de cómputo por capítulo del EDA.
-
-Fuente de verdad ÚNICA de qué necesita cada capítulo de ``CHAPTER_ORDER`` para
-computarse COMPLETO (sin caer en su rama degradada "datos insuficientes"). Lo
-consume el pipeline ``render_automatic_eda`` cuando se le pide renderizar un
-SUBCONJUNTO de capítulos (kwarg ``only_chapters``): antes de perfilar, resuelve
-los requisitos de los capítulos pedidos y activa SOLO el cómputo que esos
-capítulos necesitan, de modo que un capítulo suelto siempre llegue poblado y a la
-vez no se malgaste CPU/LLM en piezas que ningún capítulo pedido usa.
-
-Diseño: el mapa es CENTRAL (este módulo), NO una constante por capítulo. Así se
-evita tocar los ``chapters/<id>.py`` (cada agente es dueño de su capítulo) y se
-elimina el riesgo de colisión entre ramas. Si un capítulo cambia lo que lee del
-``profile``/``ctx``, se actualiza ESTE mapa — es donde el motor mira.
-
-Dos clases de dependencia, derivadas inspeccionando qué lee cada capítulo:
-
-  - ``profile_flags``: flags de coste de ``profile_table`` que hay que ACTIVAR
-    para que el ``profile`` traiga el bloque que el capítulo lee. Son los caros:
-      * ``run_models``  -> ``profile['models']`` (KMeans/IsolationForest/PCA).
-        Lo leen ``outliers`` (fallback del multivariante) y ``modelos``.
-      * ``run_series``  -> ``profile['series']`` (análisis de serie temporal).
-        Lo lee ``timeseries``.
-      * ``run_llm``     -> ``profile['llm']`` (interpretación del modelo).
-        Lo lee ``analisis_llm``.
-
-  - ``ctx``: etiquetas de las piezas de DATOS CRUDOS que construye
-    ``build_eda_render_ctx`` y que el capítulo lee del ``ctx``. Si la lista está
-    vacía, el capítulo no necesita datos crudos y el pipeline puede saltarse
-    ``build_eda_render_ctx`` por completo cuando ningún capítulo pedido los pide.
-    Etiquetas y claves reales que mapean (ver ``CTX_LABEL_TO_KEYS``):
-      * ``head_rows``      -> ``ctx['head_rows']``      (overview: df.head real).
-      * ``raw_numeric``    -> ``ctx['raw_numeric']``    (outliers/modelos/
-        correlacion/missingness/geospatial: muestra numérica alineada por fila).
-      * ``timeseries_raw`` -> ``ctx['timeseries_raw']`` (timeseries: serie cruda).
-      * ``geo_points``     -> ``ctx['geo_points']`` (+ ``raw_numeric``)
-        (geospatial: lat/lon).
-      * ``db_path_table``  -> ``ctx['db_path']`` + ``ctx['table']`` (agregacion/
-        text_distr/missingness/relaciones: push-down de queries propias).
-
-``portada`` y ``glosario`` NO son opcionales: el pipeline los incluye SIEMPRE
-(la portada resume el documento y el glosario es el destino de los términos
-clicables), así que aquí se declaran sin requisitos de cómputo.
-
-Todas las funciones de este módulo son PURAS (no I/O, deterministas): se prestan
-a test unitario directo.
-"""
-
-from __future__ import annotations
-
-# Mapa central. Una entrada por id de CHAPTER_ORDER. ``profile_flags`` lista los
-# flags de coste a activar; ``ctx`` las etiquetas de datos crudos que lee. Las
-# claves vacías significan "no necesita ese tipo de dependencia".
-CHAPTER_DEPS = {
-    # Portada y glosario: SIEMPRE presentes, sin cómputo propio (la portada lee
-    # el document_summary que arma build_document; el glosario lee los términos
-    # que el resto registró). Se declaran para que el mapa cubra CHAPTER_ORDER
-    # entero y la validación los reconozca.
-    "portada":      {"profile_flags": [], "ctx": []},
-    "overview":     {"profile_flags": [], "ctx": ["head_rows"]},
-    "analisis_llm": {"profile_flags": ["run_llm"], "ctx": []},
-    "num_distr":    {"profile_flags": [], "ctx": []},
-    "cat_distr":    {"profile_flags": [], "ctx": []},
-    # text_distr empuja su propia query de texto (no usa raw_numeric); necesita
-    # db_path/table en el ctx para hacerlo.
-    "text_distr":   {"profile_flags": [], "ctx": ["db_path_table"]},
-    "calidad":      {"profile_flags": [], "ctx": []},
-    # missingness lee la muestra numérica cruda (co-ocurrencia de ausencias) y
-    # puede empujar una query de patrón de nulos con db_path/table.
-    "missingness":  {"profile_flags": [], "ctx": ["raw_numeric", "db_path_table"]},
-    # outliers corre IsolationForest EN VIVO sobre ctx['raw_numeric']; run_models
-    # asegura además el fallback profile['models']['outliers'] si el ctx faltara.
-    "outliers":     {"profile_flags": ["run_models"], "ctx": ["raw_numeric"]},
-    "correlacion":  {"profile_flags": [], "ctx": ["raw_numeric"]},
-    "relaciones":   {"profile_flags": [], "ctx": ["db_path_table"]},
-    "modelos":      {"profile_flags": ["run_models"], "ctx": ["raw_numeric"]},
-    "timeseries":   {"profile_flags": ["run_series"], "ctx": ["timeseries_raw"]},
-    "geospatial":   {"profile_flags": [], "ctx": ["geo_points", "raw_numeric"]},
-    "agregacion":   {"profile_flags": [], "ctx": ["db_path_table"]},
-    "glosario":     {"profile_flags": [], "ctx": []},
-}
-
-# Capítulos que el documento incluye SIEMPRE, independientemente de only_chapters.
-ALWAYS_PRESENT = ("portada", "glosario")
-
-# Flags de coste reconocidos (el orden no importa; se devuelven como set).
-KNOWN_PROFILE_FLAGS = ("run_models", "run_series", "run_llm")
-
-# Mapeo de cada etiqueta de ctx a las claves REALES que produce
-# build_eda_render_ctx. ``db_path_table`` es especial: db_path/table siempre se
-# ponen para un backend válido y son inofensivos, por eso no se podan nunca (no
-# aparecen en DATA_CTX_KEYS). El resto (head_rows/raw_numeric/timeseries_raw/
-# geo_points) son las piezas de datos podables.
-CTX_LABEL_TO_KEYS = {
-    "head_rows":      {"head_rows"},
-    "raw_numeric":    {"raw_numeric"},
-    "timeseries_raw": {"timeseries_raw"},
-    "geo_points":     {"geo_points", "raw_numeric"},
-    "db_path_table":  set(),  # db_path/table siempre presentes; nunca se podan.
-}
-
-# Claves de datos crudos del ctx que se pueden podar cuando ningún capítulo
-# pedido las necesita (las que cuestan muestreo). db_path/table NO entran aquí.
-DATA_CTX_KEYS = ("head_rows", "raw_numeric", "timeseries_raw", "geo_points")
-
-
-def _as_id_list(chapter_ids):
-    """Normaliza la entrada a una lista de ids string, defensiva. None -> []."""
-    if chapter_ids is None:
-        return []
-    if isinstance(chapter_ids, str):
-        return [chapter_ids]
-    return [c for c in chapter_ids if isinstance(c, str)]
-
-
-def validate_chapter_ids(chapter_ids, order):
-    """Separa los ids pedidos en válidos y desconocidos respecto a ``order``.
-
-    Args:
-        chapter_ids: lista (o str) de ids de capítulo pedidos.
-        order: lista canónica de ids válidos (CHAPTER_ORDER).
-
-    Returns:
-        dict ``{"valid": [...], "unknown": [...]}`` preservando el orden de
-        aparición de la entrada. Función pura.
-    """
-    valid_set = set(order or [])
-    valid, unknown = [], []
-    for cid in _as_id_list(chapter_ids):
-        (valid if cid in valid_set else unknown).append(cid)
-    return {"valid": valid, "unknown": unknown}
-
-
-def resolve_requirements(chapter_ids):
-    """Une los requisitos de cómputo de los capítulos pedidos.
-
-    Es el corazón de la resolución de dependencias: dado el subconjunto de
-    capítulos a renderizar, devuelve TODO lo que hay que activar/construir para
-    que esos capítulos lleguen COMPLETOS, y solo eso.
-
-    Los capítulos ``ALWAYS_PRESENT`` (portada/glosario) se añaden implícitamente
-    porque el pipeline siempre los incluye; como no tienen requisitos, no alteran
-    el resultado, pero se contemplan para que el conjunto sea coherente.
-
-    Args:
-        chapter_ids: lista (o str) de ids de capítulo. Ids desconocidos se
-            ignoran silenciosamente (la validación estricta es de quien llama).
-            None o lista vacía -> requisitos vacíos.
-
-    Returns:
-        dict ``{"profile_flags": set[str], "ctx_keys": set[str]}`` donde
-        ``ctx_keys`` son las ETIQUETAS de ctx (no las claves reales). Función
-        pura.
-    """
-    ids = set(_as_id_list(chapter_ids)) | set(ALWAYS_PRESENT)
-    profile_flags = set()
-    ctx_keys = set()
-    for cid in ids:
-        dep = CHAPTER_DEPS.get(cid)
-        if not isinstance(dep, dict):
-            continue
-        for f in dep.get("profile_flags", []) or []:
-            if f in KNOWN_PROFILE_FLAGS:
-                profile_flags.add(f)
-        for k in dep.get("ctx", []) or []:
-            ctx_keys.add(k)
-    return {"profile_flags": profile_flags, "ctx_keys": ctx_keys}
-
-
-def resolve_profile_flags(chapter_ids):
-    """Atajo: solo el set de profile_flags a activar para los capítulos pedidos.
-
-    Función pura. Devuelve un set ⊆ KNOWN_PROFILE_FLAGS.
-    """
-    return resolve_requirements(chapter_ids)["profile_flags"]
-
-
-def needs_render_ctx(chapter_ids):
-    """True si algún capítulo pedido necesita datos crudos del ctx.
-
-    Cuando es False, el pipeline puede saltarse ``build_eda_render_ctx`` entero
-    (ahorro real de CPU/I/O): los capítulos pedidos no leen ninguna pieza de
-    datos crudos. Función pura.
-    """
-    return bool(resolve_requirements(chapter_ids)["ctx_keys"])
-
-
-def resolve_ctx_data_keys(chapter_ids):
-    """Claves REALES de datos del ctx a CONSERVAR para los capítulos pedidos.
-
-    Traduce las etiquetas de ctx a las claves concretas que produce
-    ``build_eda_render_ctx`` (head_rows/raw_numeric/timeseries_raw/geo_points).
-    El pipeline poda del ctx las claves de datos que NO estén en este set, para
-    que un capítulo suelto no arrastre piezas de datos que no usa. db_path/table
-    nunca se podan (no aparecen aquí). Función pura.
-
-    Returns:
-        set[str] subconjunto de DATA_CTX_KEYS.
-    """
-    req = resolve_requirements(chapter_ids)
-    keep = set()
-    for label in req["ctx_keys"]:
-        keep |= CTX_LABEL_TO_KEYS.get(label, set())
-    # Solo claves de datos podables (db_path/table se gestionan aparte).
-    return {k for k in keep if k in DATA_CTX_KEYS}
@@ -1,160 +0,0 @@
-"""Tests del mapa central de dependencias por capítulo (chapter_deps).
-
-Todas las funciones bajo prueba son PURAS (sin I/O): se ejercitan directamente
-sin DuckDB ni renderizado. Cubren la resolución de requisitos (golden + edges),
-la validación de ids y los helpers de eficiencia (qué cómputo se salta).
-"""
-
-import os
-import sys
-
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))  # python/functions
-if _FUNCTIONS not in sys.path:
-    sys.path.insert(0, _FUNCTIONS)
-
-from datascience.automatic_eda.chapter_deps import (  # noqa: E402
-    ALWAYS_PRESENT,
-    CHAPTER_DEPS,
-    DATA_CTX_KEYS,
-    needs_render_ctx,
-    resolve_ctx_data_keys,
-    resolve_profile_flags,
-    resolve_requirements,
-    validate_chapter_ids,
-)
-from datascience.automatic_eda.chapters_registry import CHAPTER_ORDER  # noqa: E402
-
-
-# --------------------------------------------------------------------------- #
-# El mapa cubre CHAPTER_ORDER entero (sin huecos ni claves de más).
-# --------------------------------------------------------------------------- #
-def test_chapter_deps_covers_every_chapter_in_order():
-    assert set(CHAPTER_DEPS) == set(CHAPTER_ORDER), (
-        "CHAPTER_DEPS debe declarar exactamente los ids de CHAPTER_ORDER")
-    # Cada entrada tiene la forma esperada.
-    for cid, dep in CHAPTER_DEPS.items():
-        assert isinstance(dep.get("profile_flags"), list), cid
-        assert isinstance(dep.get("ctx"), list), cid
-
-
-# --------------------------------------------------------------------------- #
-# resolve_requirements — golden: outliers exige run_models + raw_numeric.
-# --------------------------------------------------------------------------- #
-def test_resolve_outliers_requires_run_models_and_raw_numeric():
-    req = resolve_requirements(["outliers"])
-    assert "run_models" in req["profile_flags"]
-    assert "raw_numeric" in req["ctx_keys"]
-    assert "run_series" not in req["profile_flags"]
-    assert "run_llm" not in req["profile_flags"]
-
-
-def test_resolve_timeseries_requires_run_series():
-    req = resolve_requirements(["timeseries"])
-    assert req["profile_flags"] == {"run_series"}
-    assert "timeseries_raw" in req["ctx_keys"]
-
-
-def test_resolve_analisis_llm_requires_run_llm():
-    assert resolve_requirements(["analisis_llm"])["profile_flags"] == {"run_llm"}
-
-
-def test_resolve_union_of_several_chapters():
-    req = resolve_requirements(["outliers", "timeseries", "analisis_llm"])
-    assert req["profile_flags"] == {"run_models", "run_series", "run_llm"}
-
-
-# --------------------------------------------------------------------------- #
-# Eficiencia: capítulos que NO necesitan flags caros no los activan.
-# --------------------------------------------------------------------------- #
-def test_resolve_geospatial_needs_no_cost_flags():
-    """geospatial sale de geo_points/raw_numeric del ctx, NO de los modelos."""
-    req = resolve_requirements(["geospatial"])
-    assert req["profile_flags"] == set(), \
-        "geospatial no debe activar run_models/run_series/run_llm"
-    assert "geo_points" in req["ctx_keys"]
-
-
-def test_resolve_correlacion_needs_raw_numeric_but_no_models():
-    req = resolve_requirements(["correlacion"])
-    assert req["profile_flags"] == set()
-    assert "raw_numeric" in req["ctx_keys"]
-
-
-def test_always_present_chapters_add_no_requirements():
-    """portada y glosario están siempre, pero no arrastran cómputo."""
-    for cid in ALWAYS_PRESENT:
-        req = resolve_requirements([cid])
-        assert req["profile_flags"] == set()
-        assert req["ctx_keys"] == set()
-
-
-def test_resolve_profile_flags_shortcut():
-    assert resolve_profile_flags(["modelos"]) == {"run_models"}
-    assert resolve_profile_flags(["num_distr"]) == set()
-
-
-# --------------------------------------------------------------------------- #
-# needs_render_ctx — cuándo se puede saltar build_eda_render_ctx por completo.
-# --------------------------------------------------------------------------- #
-def test_needs_render_ctx_true_when_chapter_reads_raw_data():
-    assert needs_render_ctx(["outliers"]) is True
-    assert needs_render_ctx(["agregacion"]) is True  # db_path/table push-down
-    assert needs_render_ctx(["timeseries"]) is True
-
-
-def test_needs_render_ctx_false_for_purely_aggregated_chapters():
-    """num_distr / cat_distr / calidad solo leen el profile agregado."""
-    assert needs_render_ctx(["num_distr"]) is False
-    assert needs_render_ctx(["cat_distr", "calidad"]) is False
-
-
-# --------------------------------------------------------------------------- #
-# resolve_ctx_data_keys — poda: qué claves de DATOS conservar (db_path/table no).
-# --------------------------------------------------------------------------- #
-def test_resolve_ctx_data_keys_outliers_keeps_only_raw_numeric():
-    assert resolve_ctx_data_keys(["outliers"]) == {"raw_numeric"}
-
-
-def test_resolve_ctx_data_keys_geospatial_keeps_geo_and_numeric():
-    assert resolve_ctx_data_keys(["geospatial"]) == {"geo_points", "raw_numeric"}
-
-
-def test_resolve_ctx_data_keys_aggregation_keeps_nothing_prunable():
-    """agregacion usa db_path/table (siempre presentes), 0 claves podables."""
-    assert resolve_ctx_data_keys(["agregacion"]) == set()
-
-
-def test_resolve_ctx_data_keys_subset_of_data_keys():
-    keep = resolve_ctx_data_keys(["overview", "timeseries", "geospatial"])
-    assert keep <= set(DATA_CTX_KEYS)
-    assert {"head_rows", "timeseries_raw", "geo_points", "raw_numeric"} == keep
-
-
-# --------------------------------------------------------------------------- #
-# validate_chapter_ids — separa válidos de desconocidos preservando orden.
-# --------------------------------------------------------------------------- #
-def test_validate_separates_known_and_unknown():
-    out = validate_chapter_ids(["outliers", "nope", "timeseries", "ghost"],
-                               CHAPTER_ORDER)
-    assert out["valid"] == ["outliers", "timeseries"]
-    assert out["unknown"] == ["nope", "ghost"]
-
-
-def test_validate_all_known():
-    out = validate_chapter_ids(["portada", "glosario"], CHAPTER_ORDER)
-    assert out["unknown"] == []
-
-
-# --------------------------------------------------------------------------- #
-# Robustez: entradas raras nunca lanzan.
-# --------------------------------------------------------------------------- #
-def test_resolve_handles_none_and_empty():
-    assert resolve_requirements(None)["profile_flags"] == set()
-    assert resolve_requirements([])["profile_flags"] == set()
-    # ids desconocidos se ignoran silenciosamente en la resolución.
-    assert resolve_requirements(["no_existe"])["ctx_keys"] == set()
-
-
-def test_resolve_accepts_single_string():
-    assert resolve_requirements("outliers")["profile_flags"] == {"run_models"}
@@ -1,593 +0,0 @@
-"""Outliers chapter (OUTLIERS) — univariate + multivariate atypical values.
-
-Today the analysis of atypical values is scattered across the document: the
-NUM DISTR chapter mentions the per-column outlier count inside each distribution
-figure, and the MODELOS chapter runs Isolation Forest as one of several cheap
-models. This chapter gathers and deepens the whole outlier story in a single
-place, with its interpretation: an [[term:outlier]]outlier[[/term]] is **not
-necessarily an error** — it can be a legitimate, extreme but real observation —
-so the reading is exploratory (what to look at), never confirmatory (what to
-delete).
-
-Sections, in order:
-
-1. **Resumen univariante por columna** — for every numeric column, the number
-   and percentage of atypical values by two complementary criteria: Tukey's
-   1.5·IQR rule ([[term:tukey_fence]]vallas de Tukey[[/term]]) and the
-   [[term:zscore]]z-score[[/term]] rule (|z| > 3). The most contaminated columns
-   are flagged. The fences come from the pure registry function
-   ``build_boxplot_stats`` (derived from the profile percentiles); the per-column
-   counts use the raw sample in ``ctx['raw_numeric']`` when available (the exact
-   count), degrading to the profile's own z-score counts otherwise.
-2. **Boxplots** — a single figure with the Tukey boxplots of the most
-   contaminated columns (box, whiskers and atypical points), delegated to the
-   reusable registry helper ``build_boxplots_figure``.
-3. **Multivariante (filas anómalas)** — rows that are atypical considering ALL
-   columns at once, via the registry function ``isolation_forest_outliers``: the
-   count and percentage of anomalous rows, the most anomalous rows with their
-   score, and the dimensions that make each one rare (top columns by |z|, via
-   ``summarize_outlier_dims``). Run live on ``ctx['raw_numeric']`` (the same
-   numeric columns ``summarize_outlier_dims`` uses, so the row indexing stays
-   coherent and the dimension breakdown is correct); falls back to the
-   precomputed ``profile['models']['outliers']`` only when no raw sample is
-   available (e.g. the lite preset), where no per-row breakdown is shown.
-4. **Interpretación** — outlier ≠ error: how to tell a data-entry error from a
-   genuine extreme value, and what to do (inspect, winsorize, or re-express —
-   linking to the Tukey re-expression the profile already computes).
-
-The chapter activates whenever the table has at least one numeric column; with
-no numeric column it returns ``None`` and disappears from the document.
-
-Reads everything defensively (``.get``) and never raises: every registry
-delegation is imported lazily and degraded to an honest note on any failure.
-
-Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
-"""
-
-from __future__ import annotations
-
-from .. import model
-
-CHAPTER_VERSION = "1.0.0"
-CHAPTER_ID = "outliers"
-CHAPTER_TITLE = "Valores atípicos"
-
-# z-score threshold for the univariate z rule: |z| > 3 flags a value ~3 standard
-# deviations from the mean (≈99.7% of a normal distribution lies within ±3σ).
-_Z_THRESH = 3.0
-# How many columns to draw in the boxplots figure (most contaminated first) and
-# how many anomalous rows to list in the multivariate table.
-_TOP_BOX = 12
-_TOP_ROWS = 12
-# Cap on the raw atypical values passed as boxplot fliers, so a heavy-tailed
-# column does not flood the figure with thousands of points.
-_MAX_FLIERS = 200
-# How many columns flagged as "most contaminated" in the summary note.
-_TOP_FLAGGED = 3
-
-# Glossary terms this chapter explains (contract §11.1). Registered in the shared
-# collector and marked clickable on first appearance. ``isolation_forest`` and
-# ``zscore`` may also be registered by the MODELOS chapter — ``add`` is
-# idempotent (first definition wins), so registering them here is harmless and
-# keeps this chapter self-contained when MODELOS does not render.
-_TERM_DEFS = {
-    "outlier": (
-        "Valor atípico (outlier)",
-        "Una observación que se aparta mucho del grueso de los datos. Un atípico "
-        "NO es necesariamente un error: puede ser un fallo de medida o de "
-        "registro, pero también un dato real extremo (un cliente que gasta diez "
-        "veces la media, un día de ventas excepcional). Por eso se señalan para "
-        "revisarlos, no para borrarlos automáticamente.",
-    ),
-    "tukey_fence": (
-        "Vallas de Tukey (1,5·IQR)",
-        "Regla clásica para marcar atípicos a partir de los cuartiles: se calcula "
-        "el rango intercuartílico IQR = P75 − P25 y se trazan dos vallas, una "
-        "inferior en P25 − 1,5·IQR y otra superior en P75 + 1,5·IQR. Los valores "
-        "que caen fuera de esas vallas se consideran atípicos. Es robusta porque "
-        "se apoya en la mediana y los cuartiles, no en la media.",
-    ),
-    "zscore": (
-        "z-score (puntuación típica)",
-        "Mide a cuántas desviaciones típicas está un valor de la media de su "
-        "columna: z = (valor − media) / desviación típica. Un |z| grande (aquí > "
-        "3) señala un valor alejado del centro. A diferencia de las vallas de "
-        "Tukey, el z-score usa media y desviación, así que es más sensible a la "
-        "presencia de los propios atípicos.",
-    ),
-    "isolation_forest": (
-        "Isolation Forest (anomalías multivariantes)",
-        "Algoritmo de detección de anomalías que considera TODAS las columnas a "
-        "la vez: construye árboles que parten el espacio con cortes aleatorios y "
-        "mide cuántos cortes hacen falta para aislar cada fila. Las filas raras "
-        "se aíslan con muy pocos cortes y se marcan como atípicas según un umbral "
-        "de contaminación. Detecta combinaciones de valores poco frecuentes que "
-        "ninguna columna por separado revelaría.",
-    ),
-}
-
-
-# --------------------------------------------------------------------------- #
-# Lazy registry delegations (each degrades to None / no-op on any failure).
-# --------------------------------------------------------------------------- #
-def _load_build_boxplot_stats():
-    try:
-        from datascience.build_boxplot_stats import build_boxplot_stats
-        return build_boxplot_stats
-    except Exception:  # noqa: BLE001
-        return None
-
-
-def _load_detect_outliers():
-    # detect_outliers lives in the monolithic ``datascience.datascience`` module
-    # (file_path datascience.py), not in its own submodule — try both shapes.
-    try:
-        from datascience.datascience import detect_outliers
-        return detect_outliers
-    except Exception:  # noqa: BLE001
-        try:
-            from datascience import detect_outliers
-            return detect_outliers
-        except Exception:  # noqa: BLE001
-            return None
-
-
-def _load_isolation_forest():
-    try:
-        from datascience.isolation_forest_outliers import isolation_forest_outliers
-        return isolation_forest_outliers
-    except Exception:  # noqa: BLE001
-        return None
-
-
-def _load_summarize_dims():
-    try:
-        from datascience.summarize_outlier_dims import summarize_outlier_dims
-        return summarize_outlier_dims
-    except Exception:  # noqa: BLE001
-        return None
-
-
-# --------------------------------------------------------------------------- #
-# Defensive formatters (own copy: the chapter never imports siblings).
-# --------------------------------------------------------------------------- #
-def _fmt_num(value, decimals: int = 3) -> str:
-    if value is None:
-        return "—"
-    if isinstance(value, bool):
-        return "sí" if value else "no"
-    if isinstance(value, int):
-        return f"{value:,}".replace(",", ".")
-    if isinstance(value, float):
-        if value != value:  # NaN
-            return "—"
-        if value in (float("inf"), float("-inf")):
-            return str(value)
-        text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
-        return text if text else "0"
-    return model._safe_str(value)
-
-
-def _fmt_int(value) -> str:
-    if value is None:
-        return "—"
-    try:
-        return f"{int(round(float(value))):,}".replace(",", ".")
-    except (TypeError, ValueError):
-        return model._safe_str(value)
-
-
-def _fmt_pct(value, decimals: int = 2) -> str:
-    """Format an already-0-100 value as a percentage. None -> placeholder."""
-    if value is None:
-        return "—"
-    try:
-        return f"{float(value):.{decimals}f}%"
-    except (TypeError, ValueError):
-        return model._safe_str(value)
-
-
-def _term(mark: bool, key: str, text: str) -> str:
-    return f"[[term:{key}]]{text}[[/term]]" if mark else text
-
-
-def _is_dict(v) -> bool:
-    return isinstance(v, dict)
-
-
-# --------------------------------------------------------------------------- #
-# Profile reads.
-# --------------------------------------------------------------------------- #
-def _numeric_columns(profile: dict) -> list:
-    """Return [(name, numeric_dict)] for numeric columns with usable stats."""
-    out = []
-    for col in profile.get("columns") or []:
-        if not isinstance(col, dict):
-            continue
-        if col.get("inferred_type") != "numeric":
-            continue
-        num = col.get("numeric")
-        if not isinstance(num, dict) or not num:
-            continue
-        if num.get("mean") is None and num.get("median") is None:
-            continue
-        out.append((col.get("name") or "(columna)", num))
-    return out
-
-
-def _clean_values(raw):
-    """Return the finite float values of a raw column list (drop None/NaN/inf)."""
-    if not isinstance(raw, (list, tuple)):
-        return None
-    vals = []
-    for v in raw:
-        if v is None or isinstance(v, bool):
-            continue
-        try:
-            f = float(v)
-        except (TypeError, ValueError):
-            continue
-        if f != f or f in (float("inf"), float("-inf")):
-            continue
-        vals.append(f)
-    return vals
-
-
-# --------------------------------------------------------------------------- #
-# Per-column univariate summary.
-# --------------------------------------------------------------------------- #
-def _univariate_row(name, numeric, raw_vals, box_fn, detect_fn):
-    """Compute one univariate summary row + boxplot inputs for a column.
-
-    Returns a dict with the table cells and, when raw values are available, the
-    exact Tukey/z counts and the list of atypical (flier) values; otherwise it
-    degrades to the profile's own z-score counts and the fence flags.
-    """
-    box = {}
-    if box_fn is not None:
-        try:
-            box = box_fn(numeric) or {}
-        except Exception:  # noqa: BLE001
-            box = {}
-    lf = box.get("lower_fence")
-    uf = box.get("upper_fence")
-
-    vals = _clean_values(raw_vals)
-    n_tukey = pct_tukey = None
-    n_z = pct_z = None
-    low_extreme = high_extreme = None
-    fliers = []
-    contamination = None  # metric used to rank columns (prefer Tukey %).
-
-    if vals:
-        n = len(vals)
-        tukey_out = []
-        for v in vals:
-            below = (lf is not None and v < lf)
-            above = (uf is not None and v > uf)
-            if below or above:
-                tukey_out.append(v)
-        n_tukey = len(tukey_out)
-        pct_tukey = 100.0 * n_tukey / n if n else None
-        if tukey_out:
-            low_extreme = min(tukey_out)
-            high_extreme = max(tukey_out)
-            fliers = tukey_out[:_MAX_FLIERS]
-        # z-score rule via the registry function (returns parallel bools).
-        if detect_fn is not None:
-            try:
-                flags = detect_fn(vals, _Z_THRESH) or []
-                n_z = int(sum(1 for b in flags if b))
-                pct_z = 100.0 * n_z / n if n else None
-            except Exception:  # noqa: BLE001
-                n_z = pct_z = None
-        contamination = pct_tukey
-    else:
-        # Degrade: no raw sample for this column. The profile's own outlier
-        # count/pct come from the z-score block (build_boxplot_stats note); the
-        # Tukey count is unknown, only the fence flags are.
-        n_z = numeric.get("n_outliers")
-        pct_z = numeric.get("outlier_pct")
-        if box.get("has_low_outliers") and box.get("min") is not None:
-            low_extreme = box.get("min")
-        if box.get("has_high_outliers") and box.get("max") is not None:
-            high_extreme = box.get("max")
-        contamination = pct_z if isinstance(pct_z, (int, float)) else None
-
-    # Compact "extremos atípicos" cell: down/up arrows for the low/high tail.
-    extremes = []
-    if low_extreme is not None:
-        extremes.append(f"↓ {_fmt_num(low_extreme)}")
-    if high_extreme is not None:
-        extremes.append(f"↑ {_fmt_num(high_extreme)}")
-    extremes_cell = "  ".join(extremes) if extremes else "—"
-
-    return {
-        "name": model._safe_str(name),
-        "n_tukey": n_tukey,
-        "pct_tukey": pct_tukey,
-        "n_z": n_z,
-        "pct_z": pct_z,
-        "lower_fence": lf,
-        "upper_fence": uf,
-        "extremes": extremes_cell,
-        "box": box,
-        "fliers": fliers,
-        "has_raw": bool(vals),
-        "contamination": contamination if isinstance(contamination, (int, float)) else -1.0,
-    }
-
-
-def _univariate_table(rows: list) -> model.DataTable:
-    header = ["Columna", "Atípicos Tukey", "% Tukey", "Atípicos z", "% z",
-              "Valla inf.", "Valla sup.", "Extremos atípicos"]
-    table_rows = []
-    for r in rows:
-        table_rows.append([
-            r["name"],
-            _fmt_int(r["n_tukey"]) if r["n_tukey"] is not None else "—",
-            _fmt_pct(r["pct_tukey"]) if r["pct_tukey"] is not None else "—",
-            _fmt_int(r["n_z"]) if r["n_z"] is not None else "—",
-            _fmt_pct(r["pct_z"]) if r["pct_z"] is not None else "—",
-            _fmt_num(r["lower_fence"]),
-            _fmt_num(r["upper_fence"]),
-            r["extremes"],
-        ])
-    return model.DataTable(
-        header=header, rows=table_rows,
-        title="Valores atípicos por columna",
-        note="Tukey = fuera de las vallas 1,5·IQR · z = |z-score| > 3 · "
-             "ordenado de más a menos contaminada")
-
-
-# --------------------------------------------------------------------------- #
-# Multivariate (Isolation Forest) section.
-# --------------------------------------------------------------------------- #
-def _resolve_multivariate(profile: dict, ctx: dict, raw_numeric):
-    """Return (outliers_dict_or_None, source).
-
-    Prefers a LIVE Isolation Forest over ``raw_numeric`` so the detector and
-    ``summarize_outlier_dims`` use EXACTLY the same numeric columns and the same
-    valid-row indexing — otherwise the precomputed ``profile['models']
-    ['outliers']`` (run by MODELOS over a possibly different column subset) would
-    yield ``row_index`` values that no longer point at the rows
-    ``summarize_outlier_dims`` reconstructs, mislabelling the "dimensions that
-    make each row rare". Falls back to the precomputed block when no raw sample
-    is available (e.g. the lite preset drops ``raw_numeric``)."""
-    if _is_dict(raw_numeric) and raw_numeric:
-        iso = _load_isolation_forest()
-        if iso is not None:
-            try:
-                out = iso(raw_numeric)
-                if _is_dict(out) and out.get("n_outliers") is not None and out.get("n_rows_used"):
-                    return out, "live"
-            except Exception:  # noqa: BLE001
-                pass
-    # Fallback: the model the MODELOS chapter already computed (no raw sample to
-    # recompute against, so no per-row dimension breakdown either).
-    models = profile.get("models") if _is_dict(profile.get("models")) else {}
-    pre = models.get("outliers") if _is_dict(models) else None
-    if _is_dict(pre) and pre.get("n_outliers") is not None and pre.get("n_rows_used"):
-        return pre, "precomputed"
-    return None, "none"
-
-
-def _multivariate_blocks(outliers: dict, raw_numeric, mark: bool) -> list:
-    isof = _term(mark, "isolation_forest", "**Isolation Forest**")
-    blocks = [
-        model.Heading(text="Filas atípicas (multivariante)", level=2),
-        model.Markdown(text=(
-            f"Hasta aquí cada columna se ha mirado por separado. {isof} busca "
-            "filas raras considerando **todas las columnas a la vez**: una fila "
-            "puede ser normal en cada variable y aun así ser atípica por la "
-            "**combinación** de sus valores (p. ej. una edad baja con una tarifa "
-            "muy alta). La tabla resume cuántas filas se marcaron y el umbral de "
-            "decisión.")),
-        model.KVTable(rows=[
-            ("Filas analizadas", _fmt_int(outliers.get("n_rows_used"))),
-            ("Columnas consideradas", _fmt_int(outliers.get("n_features"))),
-            ("Filas atípicas", _fmt_int(outliers.get("n_outliers"))),
-            ("% filas atípicas", _fmt_pct(outliers.get("outlier_pct"))),
-            ("Umbral de decisión", _fmt_num(outliers.get("threshold"), 4)),
-        ], title="Anomalías multivariantes"),
-    ]
-
-    rows_in = outliers.get("outlier_rows") or []
-    if not rows_in:
-        return blocks
-
-    # Enrich each anomalous row with the dimensions that make it rare, when the
-    # raw sample is available (summarize_outlier_dims reconstructs the same
-    # valid-row indexing as isolation_forest_outliers).
-    dims_by_row = {}
-    if _is_dict(raw_numeric) and raw_numeric:
-        summ = _load_summarize_dims()
-        if summ is not None:
-            try:
-                enriched = summ(raw_numeric, rows_in, top_k=3) or []
-                for e in enriched:
-                    if _is_dict(e) and e.get("row_index") is not None:
-                        dims_by_row[e.get("row_index")] = e.get("dims") or []
-            except Exception:  # noqa: BLE001
-                dims_by_row = {}
-
-    has_dims = bool(dims_by_row)
-    header = ["Fila (entre válidas)", "Score"]
-    if has_dims:
-        header.append("Dimensiones que la hacen rara (col = valor, z)")
-    table_rows = []
-    for r in rows_in[:_TOP_ROWS]:
-        if not _is_dict(r):
-            continue
-        ridx = r.get("row_index")
-        cells = [_fmt_int(ridx), _fmt_num(r.get("score"), 4)]
-        if has_dims:
-            dims = dims_by_row.get(ridx) or []
-            parts = []
-            for d in dims:
-                if not _is_dict(d):
-                    continue
-                parts.append(
-                    f"{model._safe_str(d.get('col'))} = {_fmt_num(d.get('value'))} "
-                    f"(z {_fmt_num(d.get('z'), 2)})")
-            cells.append("; ".join(parts) if parts else "—")
-        table_rows.append(cells)
-
-    if table_rows:
-        shown = len(table_rows)
-        total = outliers.get("n_outliers")
-        note = "las filas más anómalas primero (score más bajo = más rara)"
-        if isinstance(total, int) and total > shown:
-            note += f" — top {shown} de {total}"
-        if not has_dims:
-            note += (" · no se pudo recuperar la muestra cruda para explicar las "
-                     "dimensiones de cada fila")
-        blocks.append(model.DataTable(
-            header=header, rows=table_rows,
-            title="Filas más atípicas", note=note))
-    return blocks
-
-
-# --------------------------------------------------------------------------- #
-# Interpretation section.
-# --------------------------------------------------------------------------- #
-def _interpretation_block(mark: bool) -> model.Markdown:
-    outlier = _term(mark, "outlier", "atípico")
-    text = (
-        f"**Un {outlier} no es necesariamente un error.** Conviene distinguir "
-        "dos casos antes de actuar:\n\n"
-        "- **Error de dato** (medida, registro o unidad equivocada): una edad de "
-        "200 años, un importe negativo donde no puede haberlo, un decimal "
-        "desplazado. Estos sí se corrigen o se eliminan, idealmente en el origen.\n"
-        "- **Dato real extremo**: una observación legítima de la cola de la "
-        "distribución (un cliente que gasta mucho más, una tarifa de lujo, un día "
-        "de ventas excepcional). Borrarla sesga el análisis y oculta información "
-        "valiosa.\n\n"
-        "**Qué hacer.** Primero, **revisar** los valores señalados arriba contra "
-        "su origen para decidir cuál de los dos casos es. Si son errores, "
-        "corregirlos. Si son datos reales que distorsionan medias y modelos, hay "
-        "alternativas a borrarlos: **winsorizar** (recortar los extremos a un "
-        "percentil), o **re-expresar** la variable (por ejemplo una "
-        "transformación logarítmica o la escalera de re-expresión de Tukey que "
-        "este mismo perfil ya calcula para las columnas asimétricas), que suele "
-        "domar la cola sin perder ninguna fila. La elección depende del objetivo: "
-        "esta lectura es **exploratoria** —orienta dónde mirar—, no una regla "
-        "automática de limpieza.")
-    return model.Markdown(text=text)
-
-
-# --------------------------------------------------------------------------- #
-# Entry point.
-# --------------------------------------------------------------------------- #
-def build_outliers(profile: dict, ctx: dict):
-    """Build the OUTLIERS Chapter, or None if the dataset has no numeric column."""
-    profile = profile or {}
-    ctx = ctx or {}
-    if not isinstance(profile, dict):
-        return None
-
-    numerics = _numeric_columns(profile)
-    if not numerics:
-        return None  # chapter does not apply to a dataset with no numerics.
-
-    # Register glossary terms (if a collector is present) and mark them clickable.
-    glossary = ctx.get("glossary")
-    mark = False
-    if isinstance(glossary, model.GlossaryCollector):
-        for key, (label, definition) in _TERM_DEFS.items():
-            glossary.add(key, label, definition)
-        mark = True
-
-    raw_numeric = ctx.get("raw_numeric")
-    raw_numeric = raw_numeric if isinstance(raw_numeric, dict) else {}
-
-    box_fn = _load_build_boxplot_stats()
-    detect_fn = _load_detect_outliers()
-
-    # --- Univariate summary ------------------------------------------------- #
-    uni_rows = []
-    for name, numeric in numerics:
-        uni_rows.append(_univariate_row(
-            name, numeric, raw_numeric.get(name), box_fn, detect_fn))
-    # Rank columns by contamination (Tukey % when available, else z %).
-    uni_rows.sort(key=lambda r: r.get("contamination", -1.0), reverse=True)
-
-    intro = (
-        "Este capítulo reúne en un solo sitio el análisis de los **valores "
-        "atípicos** de la tabla, que en el resto del informe aparecen dispersos. "
-        f"Un {_term(mark, 'outlier', 'atípico')} es una observación que se aparta "
-        "mucho del grueso de los datos. Cada columna numérica se evalúa con dos "
-        f"criterios complementarios: las {_term(mark, 'tukey_fence', 'vallas de Tukey')} "
-        "(fuera de P25−1,5·IQR o P75+1,5·IQR, robusto a la propia cola) y el "
-        f"{_term(mark, 'zscore', 'z-score')} (|z| > 3, sensible a la media). La "
-        "tabla está ordenada de la columna más contaminada a la menos.")
-
-    blocks = [
-        model.Heading(text=CHAPTER_TITLE, level=1),
-        model.Markdown(text=intro),
-        _univariate_table(uni_rows),
-    ]
-
-    # Flag the most contaminated columns explicitly.
-    flagged = [r["name"] for r in uni_rows
-               if r.get("contamination", -1.0) > 0][:_TOP_FLAGGED]
-    if flagged:
-        names = ", ".join(f"**{n}**" for n in flagged)
-        blocks.append(model.Markdown(text=(
-            f"Las columnas con mayor proporción de atípicos son {names}: "
-            "concentran el grueso de los valores fuera de las vallas y son las "
-            "primeras a revisar.")))
-
-    # --- Boxplots figure ---------------------------------------------------- #
-    box_entries = [
-        {"name": r["name"], "box": r["box"], "fliers": r.get("fliers")}
-        for r in uni_rows
-        if r.get("box")
-    ][:_TOP_BOX]
-    if box_entries:
-        def _boxplots_make(entries=box_entries):
-            try:
-                from datascience.build_boxplots_figure import build_boxplots_figure
-                return build_boxplots_figure(
-                    entries, title="Boxplots de Tukey por columna",
-                    max_boxes=_TOP_BOX)
-            except Exception:  # noqa: BLE001 — minimal fallback figure.
-                import matplotlib
-                matplotlib.use("Agg")
-                from matplotlib.figure import Figure
-                fig = Figure(figsize=(5.0, 2.2))
-                ax = fig.add_subplot(111)
-                ax.text(0.5, 0.5, "(boxplots no disponibles)",
-                        ha="center", va="center")
-                ax.axis("off")
-                return fig
-
-        blocks.append(model.Group(blocks=[
-            model.Heading(text="Boxplots", level=2),
-            model.Markdown(text=(
-                "Cada caja abarca del primer al tercer cuartil (P25–P75), la línea "
-                "interior es la mediana y los bigotes llegan hasta 1,5·IQR; los "
-                "puntos son los valores que caen fuera de las vallas (atípicos por "
-                "Tukey).")),
-            model.Figure(
-                make=_boxplots_make,
-                caption="Boxplots de Tukey de las columnas más contaminadas."),
-        ]))
-
-    # --- Multivariate ------------------------------------------------------- #
-    outliers, _src = _resolve_multivariate(profile, ctx, raw_numeric)
-    if outliers is not None:
-        blocks.extend(_multivariate_blocks(outliers, raw_numeric, mark))
-    else:
-        blocks.append(model.Heading(text="Filas atípicas (multivariante)", level=2))
-        blocks.append(model.Note(
-            "No se pudo analizar la anomalía multivariante: hacen falta al menos "
-            "dos columnas numéricas y la muestra cruda (o los modelos del perfil) "
-            "para correr Isolation Forest."))
-
-    # --- Interpretation ----------------------------------------------------- #
-    blocks.append(model.Heading(text="Cómo interpretar los atípicos", level=2))
-    blocks.append(_interpretation_block(mark))
-
-    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
-                         version=CHAPTER_VERSION, blocks=blocks)
@@ -1,304 +0,0 @@
-"""Tests for the OUTLIERS chapter — DoD: golden + edges + error path.
-
-Self-contained: builds synthetic ``numeric`` blocks + a raw_numeric sample (no
-DuckDB) so the suite is fast and deterministic. Verifies that the chapter emits
-the univariate per-column table, a boxplots figure, the multivariate Isolation
-Forest section and the outlier≠error interpretation; that the most contaminated
-column is ranked first; that a profile with no numeric column yields None; that
-None/empty never raises; that the glossary terms are registered; and that the
-chapter renders into both PDF and PPTX without cutting its title.
-"""
-
-import math
-import os
-import re
-import tempfile
-
-from pypdf import PdfReader
-
-from datascience.automatic_eda.chapters.outliers import (
-    build_outliers, CHAPTER_VERSION, CHAPTER_TITLE, _TERM_DEFS,
-)
-from datascience.automatic_eda import model
-from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
-from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
-
-
-def _percentile(sorted_vals, q):
-    """Linear-interpolation percentile (q in 0..1) on an already-sorted list."""
-    if not sorted_vals:
-        return None
-    if len(sorted_vals) == 1:
-        return float(sorted_vals[0])
-    pos = q * (len(sorted_vals) - 1)
-    lo = int(math.floor(pos))
-    hi = int(math.ceil(pos))
-    if lo == hi:
-        return float(sorted_vals[lo])
-    frac = pos - lo
-    return float(sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac)
-
-
-def _col_from_values(values, nbins=10):
-    """Build a ``numeric`` sub-block shaped like describe_numeric's output from a
-    concrete list of raw values, so the profile percentiles and the raw sample
-    are consistent (the boxplot fences match the crudo)."""
-    vals = [float(v) for v in values]
-    s = sorted(vals)
-    n = len(s)
-    mean = sum(vals) / n
-    var = sum((v - mean) ** 2 for v in vals) / n
-    std = math.sqrt(var)
-    median = _percentile(s, 0.5)
-    p25 = _percentile(s, 0.25)
-    p75 = _percentile(s, 0.75)
-    mn, mx = s[0], s[-1]
-    # z-score outlier count (population), what the profile's n_outliers carries.
-    n_out = sum(1 for v in vals if std > 0 and abs((v - mean) / std) > 3.0)
-    width = (mx - mn) / nbins if mx > mn else 1.0
-    hist = [{"lo": mn + i * width, "hi": mn + (i + 1) * width, "count": 1}
-            for i in range(nbins)]
-    return {
-        "min": mn, "max": mx, "mean": mean, "median": median, "std": std,
-        "p25": p25, "p50": median, "p75": p75, "iqr": (p75 - p25),
-        "n_outliers": n_out, "outlier_pct": 100.0 * n_out / n,
-        "distribution_type": "right-skewed", "histogram": hist,
-    }
-
-
-def _fare_values():
-    """A heavy-tailed column (most ~10-30, a few 200-512): clear Tukey/z outliers."""
-    base = [7.0 + (i % 25) for i in range(120)]      # bulk 7..31
-    tail = [180.0, 210.0, 263.0, 512.0]              # extreme upper tail
-    return base + tail
-
-
-def _age_values():
-    """A roughly symmetric column with one extreme low value."""
-    base = [22.0 + (i % 40) for i in range(120)]     # 22..61
-    return base + [80.0, 0.5, 74.0, 1.0]
-
-
-def _quiet_values():
-    """A clean column with no atypical values."""
-    return [50.0 + (i % 5) for i in range(124)]
-
-
-def _profile_and_ctx(with_models=True, with_raw=True):
-    fare = _fare_values()
-    age = _age_values()
-    quiet = _quiet_values()
-    cols = [
-        {"name": "Fare", "inferred_type": "numeric", "numeric": _col_from_values(fare)},
-        {"name": "Age", "inferred_type": "numeric", "numeric": _col_from_values(age)},
-        {"name": "Quiet", "inferred_type": "numeric", "numeric": _col_from_values(quiet)},
-        {"name": "Sexo", "inferred_type": "categorical",
-         "categorical": {"top": [{"value": "male", "count": 80}]}},
-    ]
-    profile = {"table": "titanic", "n_rows": len(fare), "n_cols": len(cols),
-               "columns": cols}
-    if with_models:
-        profile["models"] = {
-            "outliers": {
-                "n_outliers": 4, "outlier_pct": 3.2,
-                "outlier_rows": [
-                    {"row_index": 123, "score": -0.21},
-                    {"row_index": 121, "score": -0.15},
-                ],
-                "threshold": -0.02, "n_rows_used": 124, "n_features": 3,
-            }
-        }
-    ctx = {}
-    if with_raw:
-        ctx["raw_numeric"] = {"Fare": fare, "Age": age, "Quiet": quiet}
-    return profile, ctx
-
-
-def _pdf_text(path: str) -> str:
-    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
-    return re.sub(r"\s+", " ", txt)
-
-
-def _flatten(blocks):
-    out = []
-    for b in blocks:
-        if getattr(b, "kind", "") == "group":
-            out.extend(_flatten(getattr(b, "blocks", []) or []))
-        else:
-            out.append(b)
-    return out
-
-
-# --------------------------------------------------------------------------- #
-# Golden.
-# --------------------------------------------------------------------------- #
-def test_golden_estructura_y_secciones():
-    profile, ctx = _profile_and_ctx()
-    ctx["glossary"] = model.GlossaryCollector()
-    ch = build_outliers(profile, ctx)
-    assert ch is not None
-    assert ch.id == "outliers"
-    assert ch.version == CHAPTER_VERSION
-
-    flat = _flatten(ch.blocks)
-    kinds = [b.kind for b in flat]
-    # Title heading + univariate DataTable + boxplots Figure + multivariate
-    # KVTable + interpretation Markdown.
-    assert kinds[0] == "heading" and flat[0].text == CHAPTER_TITLE
-    tables = [b for b in flat if b.kind == "data_table"]
-    titles = [t.title for t in tables]
-    assert any(t and "atípicos por columna" in t for t in titles)
-    assert any(b.kind == "figure" for b in flat), "falta la figura de boxplots"
-    assert any(b.kind == "kv_table" for b in flat), "falta el resumen multivariante"
-
-    # The boxplots figure maker yields a real matplotlib figure (or its fallback).
-    fig = next(b for b in flat if b.kind == "figure").make()
-    assert fig is not None
-    import matplotlib.pyplot as plt
-    plt.close(fig)
-
-
-def test_golden_fare_es_la_mas_contaminada():
-    # The univariate table must rank Fare (heavy tail) first and report a
-    # non-zero Tukey percentage for it.
-    profile, ctx = _profile_and_ctx()
-    ch = build_outliers(profile, ctx)
-    table = next(b for b in _flatten(ch.blocks)
-                 if b.kind == "data_table" and b.title
-                 and "atípicos por columna" in b.title)
-    first_col = table.rows[0][0]
-    assert first_col == "Fare", f"esperaba Fare primera, fue {first_col}"
-    # % Tukey column (index 2) of the first row must be > 0.
-    pct_cell = table.rows[0][2]
-    assert pct_cell not in ("—", "0%", "0.00%"), f"% Tukey de Fare vacío: {pct_cell}"
-    # The z-score rule (detect_outliers) must actually run with raw_numeric: at
-    # least one column reports a non-empty z count/percentage (regression guard
-    # for the detect_outliers import path).
-    z_pcts = [r[4] for r in table.rows]
-    assert any(c not in ("—",) for c in z_pcts), f"columna z toda vacía: {z_pcts}"
-    z_counts = [r[3] for r in table.rows]
-    assert any(c not in ("—",) for c in z_counts), f"conteo z vacío: {z_counts}"
-
-
-def test_golden_interpretacion_outlier_no_es_error():
-    profile, ctx = _profile_and_ctx()
-    ch = build_outliers(profile, ctx)
-    md = " ".join(b.text for b in _flatten(ch.blocks) if b.kind == "markdown")
-    assert "no es necesariamente un error" in md.lower()
-    # Mentions the actionable options (winsorize / re-express).
-    assert "winsoriz" in md.lower()
-    assert "re-expres" in md.lower() or "logarítmic" in md.lower()
-
-
-def test_golden_terminos_glosario_registrados():
-    profile, ctx = _profile_and_ctx()
-    gloss = model.GlossaryCollector()
-    ctx["glossary"] = gloss
-    build_outliers(profile, ctx)
-    for key in _TERM_DEFS:
-        assert gloss.has(key), f"término '{key}' no registrado en el glosario"
-    # Terms are marked clickable in the body text.
-    md = " ".join(b.text for b in _flatten(build_outliers(profile, ctx).blocks)
-                  if b.kind == "markdown")
-    assert "[[term:outlier]]" in md and "[[term:tukey_fence]]" in md
-
-
-# --------------------------------------------------------------------------- #
-# Multivariate.
-# --------------------------------------------------------------------------- #
-def test_multivariante_live_con_raw_y_dims():
-    # With a raw sample the chapter runs Isolation Forest live (over the same
-    # columns summarize_outlier_dims uses) and lists the anomalous rows with the
-    # dimensions that make each one rare.
-    profile, ctx = _profile_and_ctx(with_models=False, with_raw=True)
-    ch = build_outliers(profile, ctx)
-    flat = _flatten(ch.blocks)
-    kv = next(b for b in flat if b.kind == "kv_table")
-    flat_kv = " ".join(f"{k} {v}" for (k, v) in kv.rows)
-    assert "Filas atípicas" in flat_kv
-    # A non-zero number of anomalous rows is reported.
-    n_cell = dict(kv.rows).get("Filas atípicas")
-    assert n_cell not in (None, "—", "0"), f"sin filas atípicas: {n_cell}"
-    # The anomalous-rows table carries the per-row dimension breakdown.
-    tbls = [b for b in flat if b.kind == "data_table" and b.title
-            and "más atípicas" in b.title]
-    assert tbls, "falta la tabla de filas más atípicas"
-    assert any("hacen rara" in h for h in tbls[0].header), \
-        f"falta la columna de dimensiones: {tbls[0].header}"
-
-
-def test_multivariante_precomputed_sin_raw():
-    # Without a raw sample the chapter falls back to profile['models']['outliers']
-    # (lite preset path); the precomputed n_outliers (4) surfaces in the KV table.
-    profile, ctx = _profile_and_ctx(with_models=True, with_raw=False)
-    ch = build_outliers(profile, ctx)
-    kv = next(b for b in _flatten(ch.blocks) if b.kind == "kv_table")
-    assert any("4" in str(v) for (k, v) in kv.rows)
-
-
-def test_multivariante_ausente_degrada_a_nota():
-    # No models and no raw sample → an honest note, never a crash.
-    profile, ctx = _profile_and_ctx(with_models=False, with_raw=False)
-    ch = build_outliers(profile, ctx)
-    assert ch is not None
-    notes = [b.text for b in _flatten(ch.blocks) if b.kind == "note"]
-    assert any("Isolation Forest" in n for n in notes)
-
-
-# --------------------------------------------------------------------------- #
-# Edges / error path.
-# --------------------------------------------------------------------------- #
-def test_edge_sin_columnas_numericas_devuelve_none():
-    prof = {"columns": [{"name": "c", "inferred_type": "categorical",
-                         "categorical": {"top": [{"value": "x", "count": 3}]}}]}
-    assert build_outliers(prof, {}) is None
-
-
-def test_edge_solo_texto_sintetico_devuelve_none():
-    # A text-only synthetic table (no numeric column) yields None (does not break).
-    prof = {"table": "notas", "n_rows": 3, "n_cols": 1,
-            "columns": [{"name": "comentario", "inferred_type": "text",
-                         "text": {"n_docs": 3}}]}
-    assert build_outliers(prof, {}) is None
-
-
-def test_edge_profile_none_y_vacio_no_revienta():
-    assert build_outliers(None, None) is None
-    assert build_outliers({}, {}) is None
-    assert build_outliers({"columns": []}, {}) is None
-
-
-def test_edge_sin_raw_numeric_degrada_a_perfil():
-    # Without raw_numeric the chapter still builds, using the profile z-score
-    # counts; the univariate table exists and Tukey counts degrade to '—'.
-    profile, ctx = _profile_and_ctx(with_models=True, with_raw=False)
-    ch = build_outliers(profile, ctx)
-    assert ch is not None
-    table = next(b for b in _flatten(ch.blocks)
-                 if b.kind == "data_table" and b.title
-                 and "atípicos por columna" in b.title)
-    # z column comes from the profile; Tukey count is unknown ('—').
-    assert all(len(r) == 8 for r in table.rows)
-
-
-# --------------------------------------------------------------------------- #
-# Anti-cut render.
-# --------------------------------------------------------------------------- #
-def test_render_pdf_y_pptx_incluyen_el_capitulo():
-    profile, ctx = _profile_and_ctx()
-    # The renderers build the whole document; the chapter is reached via the
-    # registry. Render the chapter standalone through a one-chapter document by
-    # passing the profile directly (the renderers run the full chapter registry).
-    with tempfile.TemporaryDirectory() as d:
-        pdf = os.path.join(d, "out.pdf")
-        res_pdf = render_automatic_eda_pdf(profile, pdf,
-                                           {"write_manifest": False, "ctx": ctx})
-        assert res_pdf["path"] == pdf
-        txt = _pdf_text(pdf)
-        assert CHAPTER_TITLE in txt, "el capítulo OUTLIERS no aparece en el PDF"
-        assert "Fare" in txt
-        pptx = os.path.join(d, "out.pptx")
-        res_pptx = render_automatic_eda_pptx(profile, pptx,
-                                             {"write_manifest": False, "ctx": ctx})
-        assert res_pptx["path"] == pptx
-        assert res_pptx["n_slides"] >= 1
@@ -34,7 +34,6 @@ CHAPTER_ORDER = [
    "text_distr",    # free-text / NLP distributions (non-tabular content)
    "calidad",       # data quality
    "missingness",   # missing-data patterns (co-occurrence of absences; MCAR/MAR)
-    "outliers",      # atypical values: univariate (Tukey/z) + multivariate (IsolationForest)
    "correlacion",   # correlations / associations
    "relaciones",    # key relations: declared/candidate PK + FK (inter/intra-table)
    "modelos",       # cheap models (PCA/KMeans/outliers)
@@ -73,51 +72,24 @@ def build_chapter(chapter_id: str, profile: dict, ctx: dict):
    return model.as_chapter(result)


-def build_document(profile: dict, ctx: dict = None, only: list = None) -> list:
-    """Build the ordered list of chapters for a TableProfile.
+def build_document(profile: dict, ctx: dict = None) -> list:
+    """Build the full ordered list of chapters for a TableProfile.

    Args:
        profile: the ``eda`` group TableProfile dict (may be None/empty).
        ctx: optional context dict carrying presentation metadata not present in
            the profile (dataset_name, source_origin, storage, generated_at,
            description, granularity, quality_criteria, head_rows, ...).
-        only: optional list of chapter ids to render. ``None`` (default) keeps
-            the historical behaviour — every implemented & applicable chapter in
-            canonical order. A list restricts the BODY to just those ids (in
-            canonical order), but the cover (``portada``) and glossary
-            (``glosario``) are ALWAYS included so the document stays valid and
-            the clickable terms keep a destination — so passing ``only=["x"]``
-            yields portada + x + glosario. Unknown ids are simply skipped (the
-            caller is responsible for strict validation). ``only=[]`` yields the
-            minimal document (portada + glosario only). This argument is additive
-            and backward-compatible: the signature is unchanged for existing
-            callers (default ``None``).

    Returns:
        list[Chapter] in canonical order, containing only the chapters that are
-        implemented, applicable and selected. Never raises.
+        implemented and applicable. Never raises.
    """
    if not isinstance(profile, dict):
        profile = {}
    # Copy ctx so the shared collector / summary we add do not leak to the caller.
    ctx = dict(ctx) if isinstance(ctx, dict) else {}

-    # only=None -> all body chapters (historical). only=list -> restrict body to
-    # that selection (portada/glosario are added unconditionally below). The
-    # renderers call build_document(profile, meta['ctx']) without an `only`
-    # argument, so the pipeline forwards the selection through a reserved ctx key
-    # (``_only_chapters``); an explicit `only` argument always wins. The key is
-    # popped from the local ctx copy so it never reaches the chapters.
-    if only is None:
-        _carried = ctx.pop("_only_chapters", None)
-        if isinstance(_carried, (list, tuple, set)):
-            only = list(_carried)
-    else:
-        ctx.pop("_only_chapters", None)
-    # A set makes the membership test cheap; the iteration order stays
-    # CHAPTER_ORDER. only=[] is a valid (empty) selection -> minimal document.
-    only_set = set(only) if isinstance(only, (list, tuple, set)) else None
-
    # A single glossary collector is shared by every chapter via ctx['glossary'].
    # Chapters call ctx['glossary'].add(key, label, definition) and mark in-text
    # appearances with [[term:key]]…[[/term]]; the glosario chapter renders the
@@ -133,10 +105,6 @@ def build_document(profile: dict, ctx: dict = None, only: list = None) -> list:
    for cid in CHAPTER_ORDER:
        if cid in (_PORTADA, _GLOSARIO):
            continue
-        # When a selection is given, skip body chapters outside it. portada and
-        # glosario are never filtered (handled out of this loop).
-        if only_set is not None and cid not in only_set:
-            continue
        ch = build_chapter(cid, profile, ctx)
        if ch is not None and ch.blocks:
            body.append(ch)
@@ -1,125 +0,0 @@
---
-id: build_boxplots_figure_py_datascience
-name: build_boxplots_figure
-kind: function
-lang: py
-domain: datascience
-version: "1.0.0"
-purity: impure
-signature: "def build_boxplots_figure(boxes: list, title: str = \"\", max_boxes: int = 12) -> \"matplotlib.figure.Figure\""
-description: "Construye una unica figura matplotlib con boxplots de Tukey HORIZONTALES (uno por columna) usando ax.bxp: caja Q1-Q3, bigotes hasta 1.5*IQR, linea de mediana y puntos atipicos. Consume la salida de build_boxplot_stats (un dict box por columna, leido con .get) mas una lista opcional de outliers crudos por columna; si vienen los dibuja como puntos (showfliers), si no marca solo box[min]/box[max] cuando hay outliers de cola (igual que num_distr). Dibuja como mucho max_boxes cajas (las primeras, ya ordenadas por contaminacion por el caller) y avisa de la truncacion con (mostrando N de M). Backend Agg sin pyplot global; alto adaptativo al nº de cajas. Defensiva: omite entradas invalidas y NUNCA lanza — sin cajas validas devuelve una figura placeholder (sin boxplots). Es la version small-multiples del capitulo num_distr para responder que columnas tienen mas outliers de un vistazo."
-tags: [eda, outliers, boxplot, tukey, iqr, bxp, matplotlib, figure, visualization, small-multiples, datascience, impure]
-uses_functions: []
-uses_types: []
-returns: []
-returns_optional: false
-error_type: "error_go_core"
-imports: [matplotlib]
-example: |
-  from datascience.build_boxplot_stats import build_boxplot_stats
-  from datascience.build_boxplots_figure import build_boxplots_figure
-  boxes = [
-      {"name": "ingresos", "box": build_boxplot_stats({"min": 1.0, "max": 9e3,
-          "p25": 1e3, "median": 2e3, "p75": 3e3, "n_outliers": 7}), "fliers": None},
-      {"name": "edad", "box": build_boxplot_stats({"min": 0.0, "max": 99.0,
-          "p25": 25.0, "median": 38.0, "p75": 52.0}), "fliers": None},
-  ]
-  fig = build_boxplots_figure(boxes, title="Outliers por columna", max_boxes=12)
-tested: true
-tests:
-  - "test_returns_figure_with_axes"
-  - "test_empty_list_returns_placeholder_figure"
-  - "test_invalid_box_is_skipped_not_raised"
-  - "test_all_invalid_returns_placeholder"
-  - "test_raw_fliers_are_drawn"
-  - "test_max_boxes_truncates_and_does_not_raise"
-test_file_path: "python/functions/datascience/build_boxplots_figure_test.py"
-file_path: "python/functions/datascience/build_boxplots_figure.py"
-params:
-  - name: boxes
-    desc: "Lista de dicts, cada uno {\"name\": str, \"box\": dict, \"fliers\": list|None}. box es EXACTAMENTE la salida de build_boxplot_stats (claves leidas con .get: q1, median, q3, whisker_lo, whisker_hi, min, max, has_low_outliers, has_high_outliers, lower_fence, upper_fence, n_outliers). fliers es la lista opcional de outliers crudos: si viene se dibuja como puntos; si es None/ausente solo se marcan los extremos box[min]/box[max] cuando hay outliers de cola. Entradas que no son dict, sin box dict, o sin q1/median/q3 se omiten. El caller las pasa ya ordenadas por contaminacion (la mayor primera)."
-  - name: title
-    desc: "Titulo de la figura (fig.suptitle, alineado a la izquierda). Vacio => sin titulo. Si len(boxes) > max_boxes se le anade una nota \"(mostrando N de M)\" para que la truncacion no sea silenciosa. Default \"\"."
-  - name: max_boxes
-    desc: "Numero maximo de cajas a dibujar (las primeras de la lista). Default 12. Un valor no entero o <= 0 cae a 12. Si la lista trae mas entradas, las sobrantes se descartan pero se reporta en el titulo con (mostrando N de M)."
-output: "Un matplotlib.figure.Figure (figsize 7.0 x alto adaptativo = max(2.0, 0.5*n + 1.0), dpi 150) con un unico Axes que apila boxplots horizontales de Tukey (ax.bxp, orientation=horizontal con fallback vert=False), uno por columna valida, de arriba a abajo en el orden recibido. Cada caja: relleno #9ec6df, borde/bigotes/caps #5b8aa6, mediana #2e8b57, atipicos #c0392b. Etiquetas del eje Y = nombres de columna; eje X etiquetado \"valor\". Outliers dibujados desde fliers crudos (showfliers) o, si faltan, marcados en box[min]/box[max] segun has_low/high_outliers. Si no queda ninguna caja valida (lista vacia o todas invalidas) devuelve una Figure placeholder con texto centrado \"(sin boxplots)\"; cualquier error inesperado se captura y devuelve una Figure con el mensaje de error. NUNCA lanza. El caller rasteriza/cierra la figura; la funcion no la muestra ni la guarda."
---
-
-## Ejemplo
-
-```python
-import sys, os
-sys.path.insert(0, os.path.join("python", "functions"))
-from datascience.build_boxplot_stats import build_boxplot_stats
-from datascience.build_boxplots_figure import build_boxplots_figure
-
-# Un `box` por columna numérica, derivado del sub-bloque `numeric` del profile
-# (salida de describe_numeric). El caller los pasa ya ordenados por outlier_pct.
-boxes = [
-    {
-        "name": "ingresos",
-        "box": build_boxplot_stats({
-            "min": 1.0, "max": 9000.0,
-            "p25": 1000.0, "median": 2000.0, "p75": 3000.0,
-            "n_outliers": 7,
-        }),
-        "fliers": None,  # valores crudos desconocidos -> se marca solo el extremo.
-    },
-    {
-        "name": "edad",
-        "box": build_boxplot_stats({
-            "min": 0.0, "max": 99.0,
-            "p25": 25.0, "median": 38.0, "p75": 52.0,
-        }),
-        "fliers": [88.0, 95.0, 99.0],  # outliers crudos -> se dibujan como puntos.
-    },
-]
-
-fig = build_boxplots_figure(boxes, title="Outliers por columna", max_boxes=12)
-
-# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
-fig.savefig("/tmp/boxplots.png")
-```
-
-## Cuando usarla
-
-Úsala en el capítulo de outliers de un informe EDA cuando quieras comparar de un
-vistazo *qué columnas están más contaminadas por valores atípicos*: a diferencia
-de `num_distr` (que dibuja un histograma+boxplot por columna en figuras
-separadas), aquí apilas todos los boxplots horizontales en **una sola figura**
-(small multiples). Primero deriva el `box` de cada columna con
-`build_boxplot_stats`, ordénalas por `outlier_pct` descendente, envuélvelas como
-`{"name", "box", "fliers"}` y pásaselas. Si tienes los valores crudos fuera de
-las vallas, métele la lista `fliers` y se dibujarán como puntos; si no, la
-función marca solo los extremos `min`/`max` cuando hay cola.
-
-## Gotchas
-
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
-  y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
-  para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
-  es thread-safe; esta función construye el `Figure` directamente, así que es
-  segura de llamar en bucle desde el renderer.
- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo
-  guarda. Quien la consume debe rasterizarla y luego liberarla
-  (`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes.
- **`fliers` opcional, semántica distinta.** Si pasas la lista de outliers
-  crudos se dibujan todos como puntos (`showfliers=True`). Si es `None`/ausente
-  los valores son desconocidos y solo se marca un punto en `box["min"]` /
-  `box["max"]` cuando `has_low_outliers` / `has_high_outliers` — mismo criterio
-  que `num_distr`. No inventes fliers a partir del profile: el `box` no trae los
-  valores crudos, solo si los extremos superan las vallas.
- **API de orientación de `ax.bxp`.** matplotlib reciente usa
-  `orientation="horizontal"`; las versiones antiguas usan `vert=False`. La
-  función prueba la primera y cae a la segunda en `except TypeError`, así que
-  funciona en ambas. Si `bxp` falla del todo, el Axes degrada a un texto
-  "(boxplot no disponible)" en vez de propagar.
- **Truncación visible.** `max_boxes` (default 12) limita el nº de cajas para que
-  ninguna se solape; si la lista trae más, las sobrantes se descartan pero se
-  avisa en el título con "(mostrando N de M)". Pasa las columnas ya ordenadas por
-  contaminación para que las descartadas sean las menos relevantes.
- **Defensiva, nunca lanza.** Lista vacía, entradas no-dict, sin `box`, o sin
-  `q1`/`median`/`q3` se omiten sin propagar; sin cajas válidas devuelve un
-  placeholder "(sin boxplots)" y cualquier error inesperado se captura en una
-  figura con el texto del error. No envuelvas la llamada en try/except por miedo
-  a un raise — no lo hay.
@@ -1,250 +0,0 @@
-"""Impure EDA helper: a single figure of horizontal Tukey boxplots (`eda` group).
-
-Draws, in one ``matplotlib.figure.Figure``, a stack of horizontal Tukey boxplots
-(one per column) using ``ax.bxp``: each carries its box (Q1–Q3), whiskers (up to
-1.5·IQR), the median line and its outlier points. It consumes the output of the
-pure registry function ``build_boxplot_stats`` (one ``box`` dict per column) plus
-an optional list of raw outlier values per column; it never recomputes anything.
-
-It is the "small-multiples" companion of ``num_distr`` (which draws one
-histogram+boxplot per column): here every column shares a single figure so the
-caller can show, at a glance, *which* columns are the most contaminated by
-outliers (the caller passes them already ordered by contamination).
-
-Impure because it touches matplotlib's rendering machinery. It uses the headless
-Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
-global state and is safe to call repeatedly from a report renderer. It is fully
-defensive and NEVER raises: invalid entries are skipped and, if nothing valid
-remains, it returns a placeholder figure carrying a centered "(sin boxplots)".
-"""
-
-import matplotlib
-
-matplotlib.use("Agg")
-
-from matplotlib.figure import Figure  # noqa: E402
-
-# Blue palette shared with the ``num_distr`` chapter so the report stays coherent.
-_BOX_FACE = "#9ec6df"   # box fill.
-_BOX_EDGE = "#5b8aa6"   # box / whisker / cap border.
-_MEDIAN = "#2e8b57"     # median line (sea green).
-_OUTLIER = "#c0392b"    # outlier points (soft red).
-# Muted gray for the placeholder / fallback message text.
-_MUTED_TEXT = "#5f6b7a"
-# Soft red for the error fallback message.
-_ERROR_TEXT = "#b00020"
-
-
-def _num(value):
-    """Coerce ``value`` to float defensively; None for None/bool/non-numeric/NaN."""
-    # bool is a subclass of int; a stat value is never a real bool, so treat
-    # True/False as missing instead of silently coercing to 1.0/0.0.
-    if value is None or isinstance(value, bool):
-        return None
-    try:
-        f = float(value)
-    except (TypeError, ValueError):
-        return None
-    if f != f:  # NaN guard.
-        return None
-    return f
-
-
-def _placeholder_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
-    """Return a fallback ``Figure`` carrying a single centered message."""
-    fig = Figure(figsize=(7.0, 2.4), dpi=150)
-    ax = fig.add_subplot(111)
-    ax.axis("off")
-    ax.text(
-        0.5,
-        0.5,
-        message,
-        ha="center",
-        va="center",
-        fontsize=12,
-        color=color,
-        wrap=True,
-        transform=ax.transAxes,
-    )
-    fig.tight_layout()
-    return fig
-
-
-def build_boxplots_figure(
-    boxes: list,
-    title: str = "",
-    max_boxes: int = 12,
-) -> "matplotlib.figure.Figure":
-    """Build one figure of stacked horizontal Tukey boxplots (one per column).
-
-    For each entry the function builds a ``bxp`` stats record (``med, q1, q3,
-    whislo, whishi, fliers, label``) from its ``box`` sub-dict (the output of
-    ``build_boxplot_stats``) and draws all of them as horizontal boxplots sharing
-    the X axis, top-to-bottom in the order received (the caller is expected to
-    pass them already sorted by contamination).
-
-    Outliers are shown two ways:
-
-    - If an entry carries a ``fliers`` list (the raw out-of-fence values), they
-      are drawn as red points via ``ax.bxp(..., showfliers=True)``.
-    - If ``fliers`` is ``None``/absent, the raw values are unknown, so only the
-      extremes are marked: a red point at ``box["min"]`` when
-      ``box["has_low_outliers"]`` and at ``box["max"]`` when
-      ``box["has_high_outliers"]`` (same convention as ``num_distr``).
-
-    The function is fully defensive and NEVER raises. Entries that are not dicts,
-    lack a ``box`` dict, or miss any of ``q1``/``median``/``q3`` are skipped. If
-    after filtering no valid box remains it returns a placeholder ``Figure`` with
-    a centered "(sin boxplots)"; any unexpected error is caught and turned into a
-    fallback figure carrying the error text. It always returns a ``Figure``.
-
-    Args:
-        boxes: List of dicts ``{"name": str, "box": dict, "fliers": list|None}``.
-            ``box`` is exactly the output of ``build_boxplot_stats`` (read with
-            ``.get``: ``q1, median, q3, whisker_lo, whisker_hi, min, max,
-            has_low_outliers, has_high_outliers, ...``). ``fliers`` is the
-            optional list of raw outlier values; when present they are plotted,
-            otherwise only the extremes are marked.
-        title: Figure title (``fig.suptitle``). Empty => no title. When the list
-            is longer than ``max_boxes`` a "(mostrando N de M)" note is appended.
-        max_boxes: Draw at most the first ``max_boxes`` entries (default 12). The
-            rest are dropped but their omission is surfaced in the title note, so
-            the truncation is never silent.
-
-    Returns:
-        A ``matplotlib.figure.Figure`` with a single Axes holding the horizontal
-        boxplots (height adaptive to the box count so none overlap). The caller is
-        responsible for rasterizing/closing it; this function never shows nor
-        saves it.
-    """
-    try:
-        if not isinstance(boxes, (list, tuple)) or len(boxes) == 0:
-            return _placeholder_figure("(sin boxplots)")
-
-        total = len(boxes)
-
-        # Cap the number of boxes; tolerate a non-int / non-positive max_boxes.
-        try:
-            cap = int(max_boxes)
-        except (TypeError, ValueError):
-            cap = 12
-        if cap <= 0:
-            cap = 12
-        candidates = list(boxes)[:cap]
-
-        stats_list = []        # bxp stats records, in draw order.
-        labels = []            # Y tick labels (column names).
-        manual_markers = []    # (position, box) for entries without raw fliers.
-        any_fliers = False     # whether to enable showfliers in the bxp call.
-
-        for entry in candidates:
-            if not isinstance(entry, dict):
-                continue
-            box = entry.get("box")
-            if not isinstance(box, dict):
-                continue
-
-            q1 = _num(box.get("q1"))
-            med = _num(box.get("median"))
-            q3 = _num(box.get("q3"))
-            # Without the three quartiles a boxplot cannot be drawn — skip it.
-            if q1 is None or med is None or q3 is None:
-                continue
-
-            # Whisker extremes fall back to the quartiles when missing.
-            whislo = _num(box.get("whisker_lo"))
-            whishi = _num(box.get("whisker_hi"))
-            if whislo is None:
-                whislo = q1
-            if whishi is None:
-                whishi = q3
-
-            name = entry.get("name")
-            label = "" if name is None else str(name)
-
-            position = len(stats_list) + 1  # bxp positions are 1-indexed.
-            fliers_raw = entry.get("fliers")
-            if isinstance(fliers_raw, (list, tuple)):
-                fliers = [v for v in (_num(x) for x in fliers_raw) if v is not None]
-                if fliers:
-                    any_fliers = True
-            else:
-                # Raw values unknown: draw no bxp fliers, mark min/max by hand.
-                fliers = []
-                manual_markers.append((position, box))
-
-            stats_list.append({
-                "med": med,
-                "q1": q1,
-                "q3": q3,
-                "whislo": whislo,
-                "whishi": whishi,
-                "fliers": fliers,
-                "label": label,
-            })
-            labels.append(label)
-
-        if not stats_list:
-            return _placeholder_figure("(sin boxplots)")
-
-        n = len(stats_list)
-        positions = list(range(1, n + 1))
-
-        # Height grows with the box count so none of them overlap.
-        height = max(2.0, 0.5 * n + 1.0)
-        fig = Figure(figsize=(7.0, height), dpi=150)
-        ax = fig.add_subplot(111)
-
-        bxp_kw = dict(
-            showfliers=any_fliers, widths=0.5, patch_artist=True,
-            boxprops={"facecolor": _BOX_FACE, "edgecolor": _BOX_EDGE},
-            medianprops={"color": _MEDIAN, "linewidth": 1.6},
-            whiskerprops={"color": _BOX_EDGE},
-            capprops={"color": _BOX_EDGE},
-            flierprops={"marker": "o", "markersize": 3.5,
-                        "markerfacecolor": _OUTLIER, "markeredgecolor": _OUTLIER,
-                        "linestyle": "none"})
-        try:
-            # ``orientation`` is the current API; older matplotlib uses ``vert``.
-            try:
-                ax.bxp(stats_list, positions=positions,
-                       orientation="horizontal", **bxp_kw)
-            except TypeError:
-                ax.bxp(stats_list, positions=positions, vert=False, **bxp_kw)
-        except Exception:  # noqa: BLE001 — never let bxp kill the whole figure.
-            ax.text(0.5, 0.5, "(boxplot no disponible)", ha="center",
-                    va="center", fontsize=10, color=_MUTED_TEXT,
-                    transform=ax.transAxes)
-
-        # For entries without raw fliers, mark only the out-of-fence extremes.
-        for position, box in manual_markers:
-            mn = _num(box.get("min"))
-            mx = _num(box.get("max"))
-            if box.get("has_low_outliers") and mn is not None:
-                ax.plot([mn], [position], marker="o", markersize=3.5,
-                        color=_OUTLIER, zorder=5)
-            if box.get("has_high_outliers") and mx is not None:
-                ax.plot([mx], [position], marker="o", markersize=3.5,
-                        color=_OUTLIER, zorder=5)
-
-        # Pin the Y tick labels explicitly so they work across matplotlib
-        # versions regardless of whether ``bxp`` consumed the ``label`` key.
-        ax.set_yticks(positions)
-        ax.set_yticklabels(labels, fontsize=8)
-        ax.set_xlabel("valor", fontsize=9)
-        ax.tick_params(labelsize=7)
-        ax.margins(y=0.15)
-        for spine in ("top", "right"):
-            ax.spines[spine].set_visible(False)
-
-        # Surface truncation in the title instead of silently dropping boxes.
-        note = f"(mostrando {n} de {total})" if total > cap else ""
-        heading = "  ".join(p for p in (title, note) if p)
-        if heading:
-            fig.suptitle(heading, fontsize=12, x=0.02, ha="left")
-
-        fig.tight_layout()
-        return fig
-    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
-        return _placeholder_figure(
-            f"error al dibujar boxplots: {exc}", color=_ERROR_TEXT)
@@ -1,109 +0,0 @@
-"""Tests para build_boxplots_figure (boxplots horizontales de Tukey, grupo eda).
-
-Usa el backend Agg sin display; no muestra ni guarda figuras. Cada test cierra
-explícitamente la Figure construida (matplotlib.pyplot.close) para no acumular
-estado entre tests.
-"""
-
-import matplotlib
-
-matplotlib.use("Agg")
-
-import matplotlib.pyplot as plt  # noqa: E402
-from matplotlib.figure import Figure  # noqa: E402
-
-from build_boxplots_figure import build_boxplots_figure
-
-
-def _box(name, q1, median, q3, mn, mx, low=False, high=False, fliers=None):
-    """Construye una entrada {name, box, fliers} con un box estilo build_boxplot_stats."""
-    iqr = q3 - q1
-    return {
-        "name": name,
-        "box": {
-            "q1": q1,
-            "median": median,
-            "q3": q3,
-            "iqr": iqr,
-            "lower_fence": q1 - 1.5 * iqr,
-            "upper_fence": q3 + 1.5 * iqr,
-            "whisker_lo": max(mn, q1 - 1.5 * iqr),
-            "whisker_hi": min(mx, q3 + 1.5 * iqr),
-            "min": mn,
-            "max": mx,
-            "has_low_outliers": low,
-            "has_high_outliers": high,
-            "n_outliers": 0,
-        },
-        "fliers": fliers,
-    }
-
-
-def test_returns_figure_with_axes():
-    boxes = [
-        _box("edad", 10.0, 25.0, 40.0, 1.0, 100.0, high=True),
-        _box("ingresos", 100.0, 200.0, 300.0, 50.0, 400.0),
-        _box("score", -1.0, 0.0, 1.0, -5.0, 5.0, low=True, high=True),
-    ]
-    fig = build_boxplots_figure(boxes, title="Boxplots", max_boxes=12)
-    assert isinstance(fig, Figure)
-    assert len(fig.axes) >= 1
-    # Tres cajas -> tres etiquetas en el eje Y.
-    ax = fig.axes[0]
-    assert len(ax.get_yticks()) == 3
-    plt.close(fig)
-
-
-def test_empty_list_returns_placeholder_figure():
-    fig = build_boxplots_figure([], title="vacío")
-    assert isinstance(fig, Figure)
-    assert len(fig.axes) >= 1
-    plt.close(fig)
-
-
-def test_invalid_box_is_skipped_not_raised():
-    boxes = [
-        {"name": "rota", "box": {"q1": None, "median": None, "q3": None}},
-        {"name": "sin_box"},                         # falta la clave box.
-        "no_es_dict",                                 # entrada no-dict.
-        _box("buena", 1.0, 2.0, 3.0, 0.0, 10.0, high=True),
-    ]
-    fig = build_boxplots_figure(boxes)
-    assert isinstance(fig, Figure)
-    ax = fig.axes[0]
-    # Solo la caja válida sobrevive al filtrado.
-    assert len(ax.get_yticks()) == 1
-    plt.close(fig)
-
-
-def test_all_invalid_returns_placeholder():
-    boxes = [
-        {"name": "a", "box": {"q1": None, "median": 1.0, "q3": 2.0}},
-        {"name": "b"},
-    ]
-    fig = build_boxplots_figure(boxes)
-    assert isinstance(fig, Figure)
-    assert len(fig.axes) >= 1
-    plt.close(fig)
-
-
-def test_raw_fliers_are_drawn():
-    boxes = [
-        _box("con_fliers", 10.0, 20.0, 30.0, 5.0, 200.0,
-             high=True, fliers=[150.0, 180.0, 200.0]),
-    ]
-    fig = build_boxplots_figure(boxes)
-    assert isinstance(fig, Figure)
-    assert len(fig.axes) >= 1
-    plt.close(fig)
-
-
-def test_max_boxes_truncates_and_does_not_raise():
-    boxes = [_box(f"c{i}", float(i), float(i + 1), float(i + 2),
-                  float(i - 5), float(i + 10)) for i in range(20)]
-    fig = build_boxplots_figure(boxes, title="muchos", max_boxes=5)
-    assert isinstance(fig, Figure)
-    ax = fig.axes[0]
-    # Solo se dibujan las primeras 5 cajas.
-    assert len(ax.get_yticks()) == 5
-    plt.close(fig)
@@ -0,0 +1,77 @@
+---
+name: generate_synthetic_eda_folder
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def generate_synthetic_eda_folder(out_dir: str, n_rows: int = 2000, seed: int = 42) -> dict"
+description: "Genera una carpeta con 3 CSV RELACIONADOS (customers, orders, reviews) deterministas por seed (Faker + numpy) para ejercitar el motor AutomaticEDA multi-tabla / profile_database. orders.customer_id y reviews.customer_id estan contenidos al 100% en customers.customer_id (PK uuid), de modo que la deteccion FK por containment (min_inclusion=0.9) descubre ambas relaciones. customers es la tabla padre; reutiliza helpers de generate_synthetic_eda_table (texto multi-idioma, lat/lon validas, amount con outliers). Estilo dict-no-throw: nunca lanza."
+tags: [eda, synthetic, faker, testing, fixture, datascience]
+params:
+  - name: out_dir
+    desc: "Carpeta de salida. Se crea con mkdir -p si no existe. Recibe customers.csv, orders.csv y reviews.csv."
+  - name: n_rows
+    desc: "Numero de clientes (filas de customers). orders ~= 2*n_rows filas, reviews ~= n_rows filas. Default 2000."
+  - name: seed
+    desc: "Semilla para Faker (Faker.seed) y numpy (np.random.default_rng). Mismo seed -> CSVs identicos byte a byte. Default 42."
+output: "dict (estilo dict-no-throw). En exito {status:'ok', out_dir, files:{customers,orders,reviews}, n_customers, n_orders, n_reviews, expected_relations:[{from_table,from_col,to_table,to_col}, ...], seed}. En error (sin lanzar, p.ej. n_rows<=0) {status:'error', error:str}. expected_relations declara las 2 FK orders->customers y reviews->customers (ambas por customer_id)."
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+tested: true
+tests: ["test_genera_ok_y_archivos", "test_determinismo_mismo_seed", "test_seeds_distintos_difieren", "test_fk_containment", "test_review_text_mediana_palabras", "test_n_rows_invalido"]
+test_file_path: "python/functions/datascience/generate_synthetic_eda_folder_test.py"
+file_path: "python/functions/datascience/generate_synthetic_eda_folder.py"
+---
+
+## Ejemplo
+
+```bash
+# Genera /tmp/eda_folder/{customers,orders,reviews}.csv (300 customers, seed 42)
+fn run generate_synthetic_eda_folder /tmp/eda_folder 300 42
+```
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience import generate_synthetic_eda_folder
+
+res = generate_synthetic_eda_folder("/tmp/eda_folder", n_rows=300, seed=42)
+# res["files"] -> {"customers": ".../customers.csv", "orders": ..., "reviews": ...}
+# res["expected_relations"] -> orders.customer_id y reviews.customer_id -> customers.customer_id
+# Luego perfila la carpeta/base con el grupo eda:
+#   fn run profile_database /tmp/eda_folder
+```
+
+## Cuando usarla
+
+- Cuando necesites un fixture REPRODUCIBLE multi-tabla para evaluar el EDA de carpeta/base (`profile_database`, join graph, capitulo de relaciones inter-tabla) con relaciones FK reales y detectables.
+- Cuando escribas tests de la deteccion de claves foraneas por containment: orders y reviews referencian customer_id contenido al 100% en customers (inclusion 1.0 >= min_inclusion 0.9).
+- Como contraparte multi-tabla de `generate_synthetic_eda_table` (que cubre el EDA de UNA tabla).
+
+## Gotchas
+
+- **Impura**: escribe 3 CSV a disco (`mkdir -p` de la carpeta). Sobrescribe los CSV existentes con el mismo nombre.
+- **Requiere `faker`, `numpy` y `pandas`** en el venv. Sin `faker` devuelve `{status:'error'}` (no lanza).
+- **El containment depende del orden**: customers se genera PRIMERO y orders/reviews muestrean sus `customer_id`. Si se invierte el orden, la FK deja de estar contenida y el detector no la encuentra.
+- **`signup_date`/`ts` se escriben como texto ISO en el CSV** (`YYYY-MM-DD` / `YYYY-MM-DD HH:MM:SS`): es CSV, todo es texto; el profiler los promociona a datetime al leerlos.
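+- **Determinismo dependiente del orden de llamadas**: se siembra `Faker.seed(seed)` + `np.random.default_rng(seed)` al inicio; mismo seed -> CSVs identicos byte a byte, pero agregar, quitar o reordenar llamadas a Faker/numpy dentro del generador cambia la salida para ese mismo seed.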
+- **Reutiliza helpers privados** de `generate_synthetic_eda_table` (`_make_fakers`, `_make_latlon`, `_make_reviews`, `_amount_with_outliers`): no romper esas firmas sin actualizar esta funcion.
+
+## Notas
+
+Estructura generada:
+
+| Archivo | PK | FK | Columnas clave |
+|---|---|---|---|
+| customers.csv | customer_id (uuid) | — | name, country, signup_date, latitude, longitude, email |
+| orders.csv | order_id (uuid) | customer_id -> customers | amount (lognormal + outliers), category, ts |
+| reviews.csv | review_id (uuid) | customer_id -> customers | review_text (multi-idioma, mediana palabras>=20), rating (1..5) |
+
+orders tiene ~2x las filas de customers y reviews ~1x. Todos los `customer_id` de orders
+y reviews estan contenidos en customers (containment ⊆), por lo que la deteccion FK por
+inclusion descubre las dos relaciones declaradas en `expected_relations`.
@@ -0,0 +1,177 @@
+"""generate_synthetic_eda_folder — fixture multi-tabla relacionado para el EDA de base/carpeta.
+
+Funcion impura (escribe CSVs a disco) y determinista por ``seed``: crea una
+carpeta con 3 CSV RELACIONADOS (customers, orders, reviews) cuyo contenido esta
+disenado para que el motor AutomaticEDA multi-tabla / `profile_database` detecte
+las relaciones FK por containment de valores (orders.customer_id y
+reviews.customer_id contenidos al 100% en customers.customer_id, por encima del
+``min_inclusion=0.9`` que usa la deteccion).
+
+Reutiliza los helpers de ``generate_synthetic_eda_table`` (texto multi-idioma,
+lat/lon validas, amount con outliers, listas fijas de paises/categorias) para no
+reimplementar logica.
+
+Estilo dict-no-throw del grupo `eda`: NUNCA lanza; devuelve
+``{"status": "error", "error": str}`` ante cualquier fallo.
+"""
+
+import os
+
+from .generate_synthetic_eda_table import (
+    _CATEGORIES,
+    _COUNTRIES,
+    _amount_with_outliers,
+    _make_fakers,
+    _make_latlon,
+    _make_reviews,
+)
+
+
+def generate_synthetic_eda_folder(out_dir, n_rows=2000, seed=42):
+    """Write a folder with 3 related CSVs (customers/orders/reviews).
+
+    customers is the parent table (PK ``customer_id``, a Faker uuid4). orders and
+    reviews take their ``customer_id`` by sampling from customers, so ALL their
+    values are contained in customers (inclusion 1.0 -> detectable FK).
+
+    Impure (writes to disk) and deterministic per ``seed``. Any Exception is caught.
+
+    Args:
+        out_dir: output folder. Created with ``os.makedirs(..., exist_ok=True)``.
+        n_rows: number of customers. orders has exactly 2*n_rows rows and reviews
+            exactly n_rows rows. Default 2000.
+        seed: seed for Faker and numpy. Default 42.
+
+    Returns:
+        dict in dict-no-throw style. On success::
+
+            {"status": "ok", "out_dir": ..., "files": {customers, orders, reviews},
+             "n_customers": ..., "n_orders": ..., "n_reviews": ...,
+             "expected_relations": [{from_table, from_col, to_table, to_col}, ...],
+             "seed": seed}
+
+        On error (without raising)::
+
+            {"status": "error", "error": str}
+    """
+    try:
+        import numpy as np
+        import pandas as pd
+
+        n = int(n_rows)
+        if n <= 0:
+            return {"status": "error", "error": f"n_rows debe ser > 0, dado {n_rows!r}"}
+
+        os.makedirs(out_dir, exist_ok=True)
+
+        fakers = _make_fakers(seed)
+        rng = np.random.default_rng(seed)
+
+        # ---------------- customers (parent table) ----------------
+        n_cust = n
+        customer_ids = [fakers["en_US"].uuid4() for _ in range(n_cust)]
+        names = [fakers["en_US"].name() for _ in range(n_cust)]
+        cust_country = rng.choice(_COUNTRIES, n_cust)
+        base = np.datetime64("2022-01-01")
+        signup_offsets = rng.integers(0, 730, n_cust)  # day offsets from `base`
+        signup_date = pd.to_datetime(base) + pd.to_timedelta(signup_offsets, unit="D")
+        signup_iso = [d.strftime("%Y-%m-%d") for d in signup_date]
+        lat, lon = _make_latlon(cust_country, rng)
+        cust_email = [fakers["en_US"].email() for _ in range(n_cust)]
+
+        customers = pd.DataFrame(
+            {
+                "customer_id": customer_ids,
+                "name": names,
+                "country": cust_country,
+                "signup_date": signup_iso,
+                "latitude": lat,
+                "longitude": lon,
+                "email": cust_email,
+            }
+        )
+
+        # ---------------- orders (FK -> customers) ----------------
+        n_orders = n_cust * 2
+        order_ids = [fakers["en_US"].uuid4() for _ in range(n_orders)]
+        order_cust = rng.choice(customer_ids, n_orders)  # subset/multiset of customers
+        amount = _amount_with_outliers(n_orders, rng, n_extreme=10)
+        order_cat = rng.choice(_CATEGORIES, n_orders)
+        ts_offsets = rng.integers(0, 730 * 24 * 3600, n_orders)  # offsets in seconds
+        ts = pd.to_datetime(np.datetime64("2022-01-01T00:00:00")) + pd.to_timedelta(
+            ts_offsets, unit="s"
+        )
+        ts_iso = [t.strftime("%Y-%m-%d %H:%M:%S") for t in ts]
+
+        orders = pd.DataFrame(
+            {
+                "order_id": order_ids,
+                "customer_id": order_cust,
+                "amount": amount,
+                "category": order_cat,
+                "ts": ts_iso,
+            }
+        )
+
+        # ---------------- reviews (FK -> customers) ----------------
+        n_reviews = n_cust
+        review_ids = [fakers["en_US"].uuid4() for _ in range(n_reviews)]
+        # Sampled from customer_ids, so every value stays contained in customers.
+        rev_cust = rng.choice(customer_ids, n_reviews)
+        review_text = _make_reviews(n_reviews, rng, fakers, null_frac=0.0)
+        rating = rng.integers(1, 6, n_reviews)
+
+        reviews = pd.DataFrame(
+            {
+                "review_id": review_ids,
+                "customer_id": rev_cust,
+                "review_text": review_text,
+                "rating": rating,
+            }
+        )
+
+        files = {
+            "customers": os.path.join(out_dir, "customers.csv"),
+            "orders": os.path.join(out_dir, "orders.csv"),
+            "reviews": os.path.join(out_dir, "reviews.csv"),
+        }
+        customers.to_csv(files["customers"], index=False)
+        orders.to_csv(files["orders"], index=False)
+        reviews.to_csv(files["reviews"], index=False)
+
+        return {
+            "status": "ok",
+            "out_dir": out_dir,
+            "files": files,
+            "n_customers": n_cust,
+            "n_orders": n_orders,
+            "n_reviews": n_reviews,
+            "expected_relations": [
+                {
+                    "from_table": "orders",
+                    "from_col": "customer_id",
+                    "to_table": "customers",
+                    "to_col": "customer_id",
+                },
+                {
+                    "from_table": "reviews",
+                    "from_col": "customer_id",
+                    "to_table": "customers",
+                    "to_col": "customer_id",
+                },
+            ],
+            "seed": seed,
+        }
+    except Exception as exc:  # noqa: BLE001 — dict-no-throw style of the eda group.
+        return {"status": "error", "error": str(exc)}
+
+
+if __name__ == "__main__":
+    import json
+    import sys
+
+    cli = dict(enumerate(sys.argv[1:]))  # CLI: [out_dir] [n_rows] [seed]
+    target = cli.get(0, "/tmp/synthetic_eda_folder")
+    count, seed_arg = int(cli.get(1, 2000)), int(cli.get(2, 42))
+    report = generate_synthetic_eda_folder(target, count, seed_arg)
+    print(json.dumps(report, indent=2))
@@ -0,0 +1,74 @@
+"""Tests para generate_synthetic_eda_folder."""
+
+import os
+import statistics
+
+import pandas as pd
+
+from datascience.generate_synthetic_eda_folder import generate_synthetic_eda_folder
+
+
+def test_genera_ok_y_archivos(tmp_path):
+    """Happy path: status ok, row counts, CSVs on disk and declared relations."""
+    target_dir = str(tmp_path / "folder")
+    result = generate_synthetic_eda_folder(target_dir, n_rows=300, seed=42)
+    assert result["status"] == "ok"
+    # orders doubles the customers count; reviews matches it.
+    observed = (result["n_customers"], result["n_orders"], result["n_reviews"])
+    assert observed == (300, 600, 300)
+    csv_paths = [result["files"][t] for t in ("customers", "orders", "reviews")]
+    assert all(os.path.exists(p) for p in csv_paths)
+    # Both child tables must declare a relation to customers.
+    declared = {(r["from_table"], r["to_table"]) for r in result["expected_relations"]}
+    assert {("orders", "customers"), ("reviews", "customers")} <= declared
+
+
+def test_determinismo_mismo_seed(tmp_path):
+    """Same seed must yield byte-identical CSVs (read via Path so no handle leaks)."""
+    dir_a, dir_b = tmp_path / "f1", tmp_path / "f2"
+    for target in (dir_a, dir_b):
+        res = generate_synthetic_eda_folder(str(target), n_rows=250, seed=11)
+        assert res["status"] == "ok", res  # fail here, not on a missing file
+    for name in ("customers.csv", "orders.csv", "reviews.csv"):
+        a, b = (dir_a / name).read_bytes(), (dir_b / name).read_bytes()
+        assert a == b, f"{name} difiere entre dos generaciones con el mismo seed"
+
+
+def test_seeds_distintos_difieren(tmp_path):
+    """Different seeds must change customers.csv (read via Path, no handle leaks)."""
+    blobs = []
+    for sub, seed in (("f1", 11), ("f2", 12)):
+        res = generate_synthetic_eda_folder(str(tmp_path / sub), n_rows=250, seed=seed)
+        assert res["status"] == "ok", res  # fail here, not on a missing file
+        blobs.append((tmp_path / sub / "customers.csv").read_bytes())
+    assert blobs[0] != blobs[1]
+
+
+def test_fk_containment(tmp_path):
+    """Child customer_id values are contained in customers; each PK is unique."""
+    target_dir = str(tmp_path / "folder")
+    result = generate_synthetic_eda_folder(target_dir, n_rows=300, seed=42)
+    frames = {
+        name: pd.read_csv(result["files"][name])
+        for name in ("customers", "orders", "reviews")
+    }
+    parent_keys = set(frames["customers"]["customer_id"])
+    pk_of = {"customers": "customer_id", "orders": "order_id", "reviews": "review_id"}
+    for child in ("orders", "reviews"):  # containment: child FK values ⊆ parent keys
+        assert set(frames[child]["customer_id"]) <= parent_keys
+    for name, pk in pk_of.items():  # each table's own id column has no repeats
+        assert frames[name][pk].is_unique
+
+
+def test_review_text_mediana_palabras(tmp_path):
+    """Median word count of the non-null review_text values is at least 20."""
+    res = generate_synthetic_eda_folder(str(tmp_path / "folder"), n_rows=300, seed=42)
+    texts = pd.read_csv(res["files"]["reviews"])["review_text"].dropna()
+    word_counts = [len(str(text).split()) for text in texts]
+    assert statistics.median(word_counts) >= 20
+
+
+def test_n_rows_invalido(tmp_path):
+    """n_rows=0 is rejected with an error dict instead of an exception."""
+    res = generate_synthetic_eda_folder(str(tmp_path / "folder"), n_rows=0, seed=42)
+    assert res["status"] == "error"
@@ -0,0 +1,82 @@
+---
+name: generate_synthetic_eda_table
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def generate_synthetic_eda_table(out_db_path: str, table: str = 'synthetic', n_rows: int = 2000, seed: int = 42) -> dict"
+description: "Genera una tabla DuckDB sintetica (Faker + numpy, determinista por seed) cuyo contenido esta disenado para ACTIVAR el maximo de capitulos del motor AutomaticEDA del grupo eda: numericas continuas con correlacion lineal/no-lineal, numericas con outliers, categoricas desbalanceadas, texto libre multi-idioma con duplicados, fecha para serie temporal, lat/lon validas, semanticos/PII (uuid/email/iban/phone) y nulos con patron MCAR/MAR. Fixture para evaluar el EDA de punta a punta. Estilo dict-no-throw: nunca lanza."
+tags: [eda, synthetic, faker, testing, fixture, datascience]
+params:
+  - name: out_db_path
+    desc: "Ruta al archivo DuckDB de salida. Se crea (o reutiliza) y la tabla se reemplaza con CREATE OR REPLACE TABLE si ya existe."
+  - name: table
+    desc: "Nombre de la tabla a crear. Se valida contra ^[A-Za-z_][A-Za-z0-9_]*$ y se cita en el DDL. Default 'synthetic'."
+  - name: n_rows
+    desc: "Numero de filas (clientes unicos). Cada fila es un cliente con id/email/iban/phone propios. Default 2000."
+  - name: seed
+    desc: "Semilla para Faker (Faker.seed) y numpy (np.random.default_rng). Mismo seed -> tabla identica byte a byte. Default 42."
+output: "dict dict-no-throw. En exito {status:'ok', db_path, table, n_rows, columns:[19 nombres de columna], seed}. En error (sin lanzar, p.ej. nombre de tabla invalido o n_rows<=0) {status:'error', error:str}. Columnas: customer_id,email,iban,phone,income,spending,age,risk_score,tenure_months,engagement_quad,amount,n_purchases,country,category,plan,review,signup_date,latitude,longitude."
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+tested: true
+tests: ["test_genera_ok_y_columnas", "test_determinismo_mismo_seed", "test_seeds_distintos_difieren", "test_latlon_en_rango", "test_plan_solo_niveles_validos", "test_income_spending_co_nulos", "test_review_mediana_palabras_y_signup_datetime", "test_phone_matchea_regex_internacional", "test_outliers_y_correlaciones", "test_tabla_invalida_devuelve_error"]
+test_file_path: "python/functions/datascience/generate_synthetic_eda_table_test.py"
+file_path: "python/functions/datascience/generate_synthetic_eda_table.py"
+---
+
+## Ejemplo
+
+```bash
+# Genera /tmp/x.duckdb con la tabla `synthetic` (2000 filas, seed 42)
+fn run generate_synthetic_eda_table /tmp/x.duckdb synthetic 2000 42
+```
+
+```python
+import sys, os
+sys.path.insert(0, os.path.join("python", "functions"))
+from datascience import generate_synthetic_eda_table
+
+res = generate_synthetic_eda_table("/tmp/x.duckdb", "synthetic", n_rows=2000, seed=42)
+# res == {"status":"ok", "db_path":"/tmp/x.duckdb", "table":"synthetic",
+#         "n_rows":2000, "columns":[...19...], "seed":42}
+# Luego perfilala con el grupo eda:
+#   fn run profile_table /tmp/x.duckdb synthetic
+```
+
+## Cuando usarla
+
+- Cuando necesites un dataset de prueba REPRODUCIBLE para evaluar el motor AutomaticEDA de punta a punta: su contenido dispara, a proposito, num_distr, cat_distr, text_distr, correlacion, missingness (MCAR/MAR), modelos (PCA/KMeans/outliers), timeseries, geospatial, calidad, agregacion y los detectores semanticos / PII (`infer_semantic_type`).
+- Cuando escribas tests de capitulos del EDA y quieras una tabla con una columna que active CADA detector sin montar datos a mano.
+- Cuando quieras un fixture determinista (mismo seed -> misma tabla) para comparar el render del EDA entre versiones.
+
+## Gotchas
+
+- **Impura**: escribe a disco (crea/reutiliza el archivo DuckDB). Reemplaza la tabla destino con `CREATE OR REPLACE`.
+- **Requiere `faker`, `duckdb`, `numpy` y `pandas`** instalados en el venv. Sin `faker` la generacion devuelve `{status:'error'}` (no lanza).
+- **`signup_date` queda como TIMESTAMP/DATE en DuckDB** (se construye con `datetime64[ns]`), NO VARCHAR — condicion para que `detect_time_column` la elija y se active el capitulo timeseries. Si fuese VARCHAR, el detector de fecha fallaria.
+- **El texto de `review` debe superar el gate de text_distr**: media de caracteres >= 50 y mediana de palabras >= 20. Por eso cada review concatena dos parrafos Faker (~50 palabras de mediana); no reducir el numero de frases o el capitulo text_distr no activa.
+- **Determinismo dependiente del orden de llamadas**: se siembra `Faker.seed(seed)` + `np.random.default_rng(seed)` al inicio; cambiar el orden de las extracciones cambia la salida aunque el seed sea el mismo.
+- **PII realista**: `email`/`iban`/`phone`/`customer_id` matchean los regex de `infer_semantic_type` (email/iban/phone_intl/uuid) al 100%; son datos sinteticos de Faker, no personas reales.
+
+## Notas
+
+Mapa columna -> detector que activa:
+
+| Columna(s) | Tipo | Detector / capitulo |
+|---|---|---|
+| income, spending | num continua | correlacion POSITIVA fuerte (Pearson > 0.8) |
+| age, risk_score | num continua | correlacion NEGATIVA |
+| tenure_months, engagement_quad | num continua | relacion NO LINEAL (cuadratica) |
+| amount, n_purchases | num + outliers | num_distr / outliers (cola pesada + extremos inyectados) |
+| country (12), category (6), plan (3 desbalanceado) | categorica | cat_distr / agregacion (entropia baja en plan) |
+| review | texto libre multi-idioma | text_distr (len_mean>=50, mediana palabras>=20) + duplicados exactos |
+| signup_date | DATE/TIMESTAMP | timeseries |
+| latitude, longitude | num [-90,90]/[-180,180] | geospatial (detect_latlon_columns) |
+| customer_id, email, iban, phone | texto | semantic_type uuid/email/iban/phone_intl (PII) |
+| income+spending (co-nulos 12%), risk_score (nulo si plan=alta), review (8%) | nulos con patron | missingness MCAR/MAR |
@@ -0,0 +1,314 @@
+"""generate_synthetic_eda_table — fixture sintetico para ejercitar el motor AutomaticEDA.
+
+Funcion impura (escribe un archivo DuckDB a disco) y determinista por ``seed``:
+construye una unica tabla cuyo CONTENIDO esta disenado para ACTIVAR el maximo
+numero de capitulos del motor AutomaticEDA del grupo `eda` (num_distr, cat_distr,
+text_distr, correlacion, missingness, modelos, timeseries, geospatial, relaciones,
+calidad, agregacion) y los detectores semanticos / PII (`infer_semantic_type`).
+
+Estilo dict-no-throw del grupo `eda`: NUNCA lanza; captura cualquier error y
+devuelve ``{"status": "error", "error": str}``.
+
+Determinismo: con el mismo ``seed`` el DataFrame y, por tanto, la tabla DuckDB
+resultante son identicos byte a byte. Se siembra Faker (``Faker.seed``) y numpy
+(``np.random.default_rng(seed)``) al inicio de cada generacion.
+"""
+
+import re
+
+# Fixed list of countries (12 -> medium cardinality for cat_distr / aggregation).
+_COUNTRIES = [
+    "ES", "FR", "DE", "IT", "PT", "NL",
+    "BE", "US", "GB", "IE", "SE", "PL",
+]
+
+# Fixed list of product categories (6 -> medium cardinality).
+_CATEGORIES = [
+    "electronics", "clothing", "home", "sports", "books", "toys",
+]
+
+# Plan levels with IMBALANCED probabilities (low entropy for cat_distr).
+_PLANS = ["baja", "media", "alta"]
+_PLAN_PROBS = [0.70, 0.25, 0.05]
+
+# Approximate (lat, lon) centroids per country, used to sample valid coordinates
+# inside [-90, 90] x [-180, 180] so that detect_latlon_columns accepts them.
+_CENTROIDS = {
+    "ES": (40.4, -3.7), "FR": (46.6, 2.2), "DE": (51.1, 10.4), "IT": (41.9, 12.5),
+    "PT": (39.4, -8.2), "NL": (52.1, 5.3), "BE": (50.5, 4.5), "US": (39.0, -98.0),
+    "GB": (54.0, -2.0), "IE": (53.4, -8.0), "SE": (60.1, 18.6), "PL": (52.0, 19.1),
+}
+
+# Locales rotated to generate multi-language text (es/en/fr).
+_TEXT_LOCALES = ["es_ES", "en_US", "fr_FR"]
+
+# Valid SQL identifier (DuckDB does not parameterize the table name in DDL).
+_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+
+
+def _make_fakers(seed):
+    """Seed Faker's shared generator and build one Faker per text locale.
+
+    ``Faker.seed(seed)`` seeds the ``random.Random`` shared by every Faker
+    instance that uses the default generator, so the order of calls fully
+    determines the output. Returns a dict keyed by "es_ES"/"en_US"/"fr_FR".
+    """
+    from faker import Faker
+
+    Faker.seed(seed)
+    spanish, english, french = [Faker(locale) for locale in _TEXT_LOCALES]
+    return dict(es_ES=spanish, en_US=english, fr_FR=french)
+
+
+# Canonical duplicate text (multi-language, > 20 words) injected into a fraction
+# of the rows so that the exact-duplicate analysis has something to detect.
+_DUP_REVIEW = (
+    "Servicio excelente y entrega muy rapida, el producto llego en perfecto "
+    "estado y coincide con la descripcion publicada en la tienda. The customer "
+    "support team answered every question quickly and the packaging was solid "
+    "and well protected during shipping. Je recommande vivement ce vendeur a "
+    "tous mes amis, la qualite est vraiment au rendez-vous cette fois."
+)
+
+
+def _make_reviews(n, rng, fakers, dup_frac=0.04, null_frac=0.08):
+    """Generate ``n`` long free-text reviews in rotating languages (es/en/fr).
+
+    Each review joins two Faker paragraphs in the locale picked by row index
+    (``i % 3``); this is intended to keep the median word count above 20 and the
+    mean character count above 50 (the text_distr gates). Exact duplicates
+    (``dup_frac``) are injected first, then nulls (``null_frac``) on top of them.
+
+    Returns a ``list`` of length ``n`` whose items are ``str`` or ``None`` (nulls).
+    """
+    # Sentence counts per paragraph are precomputed with numpy (deterministic) so
+    # that rng draws are not interleaved with the faker calls inside the loop.
+    nb1 = rng.integers(4, 8, n)
+    nb2 = rng.integers(3, 7, n)
+
+    reviews = []
+    for i in range(n):
+        fk = fakers[_TEXT_LOCALES[i % 3]]
+        p1 = fk.paragraph(nb_sentences=int(nb1[i]))
+        p2 = fk.paragraph(nb_sentences=int(nb2[i]))
+        reviews.append(f"{p1} {p2}")
+
+    # Exact duplicates: a fraction of rows (at least one) shares the same review.
+    if n > 0 and dup_frac > 0:
+        k_dup = max(1, int(n * dup_frac))
+        dup_idx = rng.choice(n, size=min(k_dup, n), replace=False)
+        for j in dup_idx:
+            reviews[int(j)] = _DUP_REVIEW
+
+    # MCAR-ish nulls: a random fraction of rows (at least one) is set to None.
+    if n > 0 and null_frac > 0:
+        k_null = max(1, int(n * null_frac))
+        null_idx = rng.choice(n, size=min(k_null, n), replace=False)
+        for j in null_idx:
+            reviews[int(j)] = None
+
+    return reviews
+
+
+def _make_phone_intl(rng):
+    """Build an international-format phone string meant to match phone_intl.
+
+    Target regex (fullmatch): ``\\+\\d[\\d\\s()-]{6,}\\d``. The result is '+',
+    a 1-2 digit country code and three 3-digit groups separated by spaces.
+    """
+    # Draw order (country code, then the three groups) is part of the
+    # seed-determinism contract; keep it unchanged.
+    bounds = ((1, 99), (100, 999), (100, 999), (100, 999))
+    country, first, second, third = (int(rng.integers(lo, hi)) for lo, hi in bounds)
+    return f"+{country} {first} {second} {third}"
+
+
+def _make_latlon(countries, rng):
+    """Return (latitudes, longitudes): per-country centroid plus normal jitter.
+
+    Values are clipped to [-90, 90] and [-180, 180]. The latitude jitter is
+    drawn before the longitude jitter; that order is part of seed determinism.
+    """
+    import numpy as np
+
+    count = len(countries)
+    noise_lat = rng.normal(0.0, 0.5, count)
+    noise_lon = rng.normal(0.0, 0.5, count)
+    # Look up every centroid after the draws; reshape keeps empty input 2-D.
+    centers = np.array([_CENTROIDS[code] for code in countries], dtype=float)
+    centers = centers.reshape(-1, 2)
+    lats = np.clip(centers[:, 0] + noise_lat, -90.0, 90.0)
+    lons = np.clip(centers[:, 1] + noise_lon, -180.0, 180.0)
+    return lats, lons
+
+
+def _amount_with_outliers(n, rng, n_extreme=6, factor=50.0):
+    """Return ``n`` lognormal(mean=4, sigma=1) draws with injected high outliers.
+
+    ``min(n_extreme, n)`` distinct positions are multiplied by ``factor``.
+    """
+    amount = rng.lognormal(mean=4.0, sigma=1.0, size=n)
+    if n > 0 and n_extreme > 0:
+        amount[rng.choice(n, size=min(n_extreme, n), replace=False)] *= factor
+    return amount
+
+
+def generate_synthetic_eda_table(
+    out_db_path, table="synthetic", n_rows=2000, seed=42
+):
+    """Write a synthetic DuckDB table designed to exercise the AutomaticEDA chapters.
+
+    Builds a DataFrame of ``n_rows`` customers whose columns are chosen to trigger
+    specific AutomaticEDA detectors (continuous numerics with linear/non-linear
+    correlations, numerics with outliers, imbalanced categoricals, multi-language
+    free text with duplicates, a date for time series, valid lat/lon, semantic/PII
+    values and nulls with an MCAR/MAR pattern), then materializes it in
+    ``out_db_path`` with ``CREATE OR REPLACE TABLE``.
+
+    Impure (writes to disk) and deterministic per ``seed``. Any Exception raised
+    inside is caught and returned as an error dict instead of propagating.
+
+    Args:
+        out_db_path: path of the output DuckDB file. Created (or reused); the
+            table is replaced if it already exists.
+        table: name of the table to create. Must be a ``str`` that fully matches
+            ``^[A-Za-z_][A-Za-z0-9_]*$`` (no trailing newline); quoted in the DDL.
+        n_rows: number of rows (one customer per row). Default 2000.
+        seed: seed for Faker and numpy. Default 42.
+
+    Returns:
+        dict in dict-no-throw style. On success::
+
+            {"status": "ok", "db_path": out_db_path, "table": table,
+             "n_rows": int(n_rows), "columns": [<column names>], "seed": seed}
+
+        On error (without raising)::
+
+            {"status": "error", "error": str}
+    """
+    try:
+        import duckdb
+        import numpy as np
+        import pandas as pd
+        # fullmatch, not match: with match the "$" anchor also accepts "name\n".
+        if not (isinstance(table, str) and _IDENT_RE.fullmatch(table)):
+            return {
+                "status": "error",
+                "error": (
+                    f"nombre de tabla invalido: {table!r} "
+                    "(debe casar con ^[A-Za-z_][A-Za-z0-9_]*$)"
+                ),
+            }
+        n = int(n_rows)
+        if n <= 0:
+            return {"status": "error", "error": f"n_rows debe ser > 0, dado {n_rows!r}"}
+
+        fakers = _make_fakers(seed)
+        rng = np.random.default_rng(seed)
+
+        # --- Continuous numerics (high distinct count, correlations) ---
+        income = np.clip(rng.normal(40000.0, 12000.0, n), 1000.0, None)
+        spending = income * 0.35 + rng.normal(0.0, 2000.0, n)  # strong POSITIVE corr
+        age = rng.integers(18, 91, n)
+        risk_score = 90.0 - age * 0.7 + rng.normal(0.0, 5.0, n)  # NEGATIVE corr with age
+        tenure_months = rng.uniform(0.0, 60.0, n)
+        engagement_quad = ((tenure_months - 30.0) ** 2) / 30.0 + rng.normal(0.0, 1.0, n)
+
+        # --- Numerics with clear outliers ---
+        amount = _amount_with_outliers(n, rng)
+        n_purchases = rng.poisson(3.0, n).astype(float)
+        if n > 0:
+            k_hi = min(max(1, int(n * 0.002)) + 2, n)  # 0.2% of n (min 1) + 2 huge values
+            hi_idx = rng.choice(n, size=k_hi, replace=False)
+            n_purchases[hi_idx] = rng.integers(200, 400, len(hi_idx)).astype(float)
+
+        # --- Categoricals ---
+        country = rng.choice(_COUNTRIES, n)
+        category = rng.choice(_CATEGORIES, n)
+        plan = rng.choice(_PLANS, n, p=_PLAN_PROBS)
+
+        # --- Multi-language free text with duplicates ---
+        review = _make_reviews(n, rng, fakers)
+
+        # --- Date / time series (range of 730 days starting 2022-01-01) ---
+        base = np.datetime64("2022-01-01")
+        offsets = rng.integers(0, 730, n)
+        signup_date = pd.to_datetime(base) + pd.to_timedelta(offsets, unit="D")
+
+        # --- Valid geo lat/lon ---
+        latitude, longitude = _make_latlon(country, rng)
+
+        # --- Semantic / PII values (meant to match infer_semantic_type) ---
+        customer_id = [fakers["en_US"].uuid4() for _ in range(n)]
+        email = [fakers["en_US"].email() for _ in range(n)]
+        iban = [fakers["en_US"].iban() for _ in range(n)]
+        phone = [_make_phone_intl(rng) for _ in range(n)]
+
+        df = pd.DataFrame(
+            {
+                "customer_id": customer_id,
+                "email": email,
+                "iban": iban,
+                "phone": phone,
+                "income": income,
+                "spending": spending,
+                "age": age,
+                "risk_score": risk_score,
+                "tenure_months": tenure_months,
+                "engagement_quad": engagement_quad,
+                "amount": amount,
+                "n_purchases": n_purchases,
+                "country": country,
+                "category": category,
+                "plan": plan,
+                "review": review,
+                "signup_date": signup_date,
+                "latitude": latitude,
+                "longitude": longitude,
+            }
+        )
+
+        # --- Nulls with a pattern ---
+        # income + spending go missing TOGETHER on the SAME rows (co-occurrence).
+        k_co = max(1, int(n * 0.12))
+        co_idx = rng.choice(n, size=min(k_co, n), replace=False)
+        df.loc[co_idx, "income"] = np.nan
+        df.loc[co_idx, "spending"] = np.nan
+        # risk_score is missing when plan == "alta" (plus ~2% at random) -> MAR.
+        risk_mask = (df["plan"] == "alta").to_numpy() | (rng.random(n) < 0.02)
+        df.loc[risk_mask, "risk_score"] = np.nan
+
+        columns = list(df.columns)
+
+        con = duckdb.connect(out_db_path)
+        try:
+            con.register("df_synth_eda", df)
+            con.execute(
+                f'CREATE OR REPLACE TABLE "{table}" AS SELECT * FROM df_synth_eda'
+            )
+            con.unregister("df_synth_eda")
+        finally:
+            con.close()
+
+        return {
+            "status": "ok",
+            "db_path": out_db_path,
+            "table": table,
+            "n_rows": n,
+            "columns": columns,
+            "seed": seed,
+        }
+    except Exception as exc:  # noqa: BLE001 — dict-no-throw style of the eda group.
+        return {"status": "error", "error": str(exc)}
+
+
+if __name__ == "__main__":
+    import json
+    import sys
+
+    # CLI: [db_path] [table] [n_rows] [seed]; defaults apply to missing ones.
+    cli = dict(enumerate(sys.argv[1:]))
+    db_file, name = cli.get(0, "/tmp/synthetic_eda.duckdb"), cli.get(1, "synthetic")
+    count, seed_arg = int(cli.get(2, 2000)), int(cli.get(3, 42))
+    report = generate_synthetic_eda_table(db_file, name, count, seed_arg)
+    print(json.dumps(report, indent=2))
@@ -0,0 +1,129 @@
+"""Tests para generate_synthetic_eda_table."""
+
+import os
+import re
+import statistics
+
+import duckdb
+
+from datascience.generate_synthetic_eda_table import generate_synthetic_eda_table
+
+_EXPECTED_COLS = [  # expected column names, in order, for the generated table
+    "customer_id", "email", "iban", "phone", "income", "spending", "age",
+    "risk_score", "tenure_months", "engagement_quad", "amount", "n_purchases",
+    "country", "category", "plan", "review", "signup_date", "latitude", "longitude",
+]
+_PHONE_RE = re.compile(r"\+\d[\d\s()-]{6,}\d")  # applied with fullmatch to each phone
+
+
+def _load(db_path, table="synthetic"):
+    """Read the whole table into a DataFrame over a read-only connection."""
+    query = f'SELECT * FROM "{table}"'
+    with duckdb.connect(db_path, read_only=True) as conn:  # closed on exit
+        frame = conn.execute(query).fetch_df()
+    return frame
+
+
+def test_genera_ok_y_columnas(tmp_path):
+    """Happy path: result metadata and the stored table match expectations."""
+    db_file = str(tmp_path / "t.duckdb")
+    result = generate_synthetic_eda_table(db_file, "synthetic", n_rows=500, seed=42)
+    assert result["status"] == "ok"
+    reported = (result["table"], result["n_rows"], result["columns"])
+    assert reported == ("synthetic", 500, _EXPECTED_COLS)
+    assert os.path.exists(db_file)
+    # The persisted table must expose the same columns and row count.
+    stored = _load(db_file)
+    assert (list(stored.columns), len(stored)) == (_EXPECTED_COLS, 500)
+
+
+def test_determinismo_mismo_seed(tmp_path):
+    """Two runs with the same seed store row-for-row identical tables."""
+    tables = []
+    for file_name in ("a.duckdb", "b.duckdb"):
+        db_file = str(tmp_path / file_name)
+        generate_synthetic_eda_table(db_file, "synthetic", n_rows=400, seed=7)
+        # Compared as strings (presumably to sidestep dtype quirks — confirm).
+        tables.append(_load(db_file).astype(str))
+    assert tables[0].equals(tables[1])
+
+
+def test_seeds_distintos_difieren(tmp_path):
+    """Changing the seed must change the stored table."""
+    tables = []
+    for file_name, seed in (("a.duckdb", 7), ("b.duckdb", 8)):
+        db_file = str(tmp_path / file_name)
+        generate_synthetic_eda_table(db_file, "synthetic", n_rows=400, seed=seed)
+        tables.append(_load(db_file).astype(str))
+    assert not tables[0].equals(tables[1])
+
+
+def test_latlon_en_rango(tmp_path):
+    """Stored coordinates stay inside the valid latitude/longitude ranges."""
+    db_file = str(tmp_path / "t.duckdb")
+    generate_synthetic_eda_table(db_file, "synthetic", n_rows=500, seed=42)
+    frame, limits = _load(db_file), {"latitude": 90, "longitude": 180}
+    assert all(frame[col].between(-lim, lim).all() for col, lim in limits.items())
+
+
+def test_plan_solo_niveles_validos(tmp_path):
+    """The plan column only ever holds the three known levels."""
+    db_file = str(tmp_path / "t.duckdb")
+    generate_synthetic_eda_table(db_file, "synthetic", n_rows=500, seed=42)
+    assert {"baja", "media", "alta"}.issuperset(_load(db_file)["plan"].unique())
+
+
+def test_income_spending_co_nulos(tmp_path):
+    """income and spending are missing on the same rows, and some rows are missing."""
+    db_file = str(tmp_path / "t.duckdb")
+    generate_synthetic_eda_table(db_file, "synthetic", n_rows=600, seed=42)
+    null_mask = _load(db_file)[["income", "spending"]].isna()
+    # The two null masks must coincide row by row.
+    assert (null_mask["income"] == null_mask["spending"]).all()
+    # ...and the pattern must actually occur at least once.
+    assert null_mask["income"].any()
+
+
+def test_review_mediana_palabras_y_signup_datetime(tmp_path):
+    """Reviews are long enough and signup_date is a DATE/TIMESTAMP, not VARCHAR."""
+    db_file = str(tmp_path / "t.duckdb")
+    generate_synthetic_eda_table(db_file, "synthetic", n_rows=500, seed=42)
+    lengths = [len(str(text).split()) for text in _load(db_file)["review"].dropna()]
+    assert statistics.median(lengths) >= 20
+    type_query = (
+        "SELECT column_type FROM (DESCRIBE synthetic) WHERE column_name='signup_date'"
+    )
+    conn = duckdb.connect(db_file, read_only=True)
+    try:
+        (column_type,) = conn.execute(type_query).fetchone()
+    finally:
+        conn.close()
+    assert column_type.upper().startswith(("DATE", "TIMESTAMP"))
+
+
+def test_phone_matchea_regex_internacional(tmp_path):
+    """Every non-null phone fully matches the regex, and at least one exists."""
+    db = str(tmp_path / "t.duckdb")
+    generate_synthetic_eda_table(db, "synthetic", n_rows=500, seed=42)
+    phones = [p for p in _load(db)["phone"].tolist() if p is not None]
+    assert phones and all(_PHONE_RE.fullmatch(str(p)) for p in phones)
+
+
+def test_outliers_y_correlaciones(tmp_path):
+    """amount has extreme highs; both designed correlations have the right sign."""
+    db_file = str(tmp_path / "t.duckdb")
+    generate_synthetic_eda_table(db_file, "synthetic", n_rows=800, seed=42)
+    frame = _load(db_file)
+    assert frame["amount"].max() > frame["amount"].median() * 20
+    # Correlations use complete pairs only (rows with a null are dropped).
+    income_pairs = frame[["income", "spending"]].dropna()
+    assert income_pairs["income"].corr(income_pairs["spending"]) > 0.8
+    risk_pairs = frame[["age", "risk_score"]].dropna()
+    assert risk_pairs["age"].corr(risk_pairs["risk_score"]) < -0.6
+
+
+def test_tabla_invalida_devuelve_error(tmp_path):
+    """An invalid table name yields an error dict that mentions 'invalido'."""
+    db_file = str(tmp_path / "t.duckdb")
+    outcome = generate_synthetic_eda_table(db_file, "bad name;", n_rows=10, seed=42)
+    assert outcome["status"] == "error" and "invalido" in outcome["error"]
@@ -1,79 +0,0 @@
---
-name: summarize_outlier_dims
-kind: function
-lang: py
-domain: datascience
-version: "1.0.0"
-purity: pure
-signature: "def summarize_outlier_dims(raw_numeric: dict, outlier_rows: list, top_k: int = 3) -> list"
-description: "Explica QUE columnas hacen rara cada fila anomala detectada por isolation_forest_outliers. Para cada {row_index, score} reconstruye la fila valida (mismo filtro de columnas numericas y mismo descarte de filas con None que el detector, asi row_index coincide) y devuelve las top_k columnas de mayor |z-score| poblacional (ddof=0). Capa de explicabilidad del paso de outliers multivariante en EDA. Pura y determinista; ante entradas vacias/invalidas o sin filas validas devuelve [] sin petar."
-tags: [eda, models, outliers, anomaly-detection, explainability, z-score, multivariate]
-params:
-  - name: raw_numeric
-    desc: "dict {nombre_columna: [valores]} alineado por fila (como ctx['raw_numeric'] del motor AutomaticEDA). Solo se usan columnas con todos los valores numericos (None permitido por fila; bool/str/NaN/Inf descartan la columna entera) — filtro IDENTICO al de isolation_forest_outliers para que row_index coincida."
-  - name: outlier_rows
-    desc: "Lista de {row_index, score} tal cual la devuelve isolation_forest_outliers. row_index cuenta SOLO las filas validas (sin None) en orden de aparicion, base 0. Entradas fuera de rango o malformadas se ignoran defensivamente."
-  - name: top_k
-    desc: "Numero de columnas (las de mayor |z-score|) a reportar por outlier. Default 3. Valores invalidos (no-int, bool, <1) caen a 3."
-output: "Lista paralela a outlier_rows (mismo orden) de dicts {row_index: int, score: float, dims: [{col: str, value: float, z: float}, ...]}. dims trae hasta top_k columnas ordenadas por |z| descendente, con z (z-score poblacional, ddof=0) redondeado a 3 decimales; si una columna tiene std==0 su z es 0. Las entradas de outlier_rows fuera de rango/malformadas se omiten. Ante raw_numeric vacio/no-dict, outlier_rows no-lista, 0 columnas numericas o 0 filas validas devuelve []."
-uses_functions: []
-uses_types: []
-returns: []
-returns_optional: false
-error_type: ""
-imports: []
-tested: true
-tests: ["test_row_index_skips_none_rows", "test_extreme_row_flagged_via_isolation", "test_out_of_range_row_index_is_ignored", "test_degrades_to_empty_on_invalid_inputs"]
-test_file_path: "python/functions/datascience/summarize_outlier_dims_test.py"
-file_path: "python/functions/datascience/summarize_outlier_dims.py"
---
-
-## Ejemplo
-
-```python
-from datascience import isolation_forest_outliers, summarize_outlier_dims
-
-# Nube densa alrededor del origen + 1 fila con un valor extremo en "c".
-raw_numeric = {
-    "a": [0.1, 0.2, -0.1, 0.0, 0.3, -0.2, 0.15, -0.05, 0.25, 0.2, -0.3, 0.1],
-    "b": [1.0, 1.1, 0.9, 1.2, 0.8, 1.0, 1.1, 0.95, 1.05, 0.9, 1.15, 1.0],
-    "c": [5.0, 5.2, 4.8, 5.1, 4.9, 5.0, 4.95, 5.05, 4.9, 500.0, 5.1, 5.0],
-}
-
-result = isolation_forest_outliers(raw_numeric, contamination=0.1)
-summary = summarize_outlier_dims(raw_numeric, result["outlier_rows"], top_k=3)
-
-for item in summary:
-    top = item["dims"][0]
-    print(item["row_index"], top["col"], top["value"], top["z"])
-# La fila del valor 500 sale con dim top "c" y |z| alto: es lo que la hace rara.
-```
-
-## Cuando usarla
-
-Justo **despues** de `isolation_forest_outliers`, cuando ya sabes QUE filas son
-anomalas y quieres explicar POR QUE: en que columnas se desvian mas respecto al
-resto. Util para rellenar la seccion de outliers de un report/notebook EDA con
-"la fila 9 es rara sobre todo por `c` (z=+3.3)" en lugar de solo un row_index
-opaco. Pasa el mismo `raw_numeric` que diste al detector y su `outlier_rows`
-intacto; el `row_index` apunta a la misma fila porque ambas funciones aplican el
-mismo filtro de columnas y el mismo descarte de filas con None.
-
-## Gotchas
-
- **Mismo `raw_numeric` que el detector**: el `row_index` solo coincide si pasas
-  el mismo dict de columnas (mismo orden, mismas listas) con el que llamaste a
-  `isolation_forest_outliers`. Si cambias las columnas o el orden, los indices
-  dejan de mapear.
- **`row_index` es relativo a las filas validas**: las filas con `None` en
-  cualquier columna usada se descartan y los indices se recalculan sobre las que
-  quedan (base 0, orden de aparicion). No mapea 1:1 con las listas de entrada si
-  hay None.
- **z-score poblacional (ddof=0)**: se usa la desviacion tipica poblacional,
-  consistente con el escalado del detector. Columnas con `std==0` (todos los
-  valores iguales) dan `z=0`, asi que nunca aparecen como "raras".
- **Devuelve `[]` en vez de petar**: entrada no-dict/no-lista, 0 columnas
-  numericas, 0 filas validas, o todas las entradas fuera de rango -> lista vacia.
-  No lanza excepciones.
- **No llama a `isolation_forest_outliers`**: solo consume su salida. Es una
-  funcion independiente (no la importa), por eso `uses_functions` esta vacio.
@@ -1,144 +0,0 @@
-"""Explica que dimensiones (columnas) hacen rara cada fila anomala.
-
-Toma la salida multivariante de `isolation_forest_outliers` (lista de
-`{row_index, score}`) y, para cada outlier, devuelve las columnas con mayor
-|z-score| respecto a la distribucion de las filas validas. Es la capa de
-"explicabilidad" del paso de outliers multivariante en la fase EDA: el
-Isolation Forest dice QUE filas son raras, esta funcion dice POR QUE (en que
-columnas se desvian mas).
-
-Pura y determinista: reconstruye EXACTAMENTE las mismas "filas validas" que usa
-`isolation_forest_outliers` (mismo filtro de columnas numericas y mismo descarte
-de filas con None), de modo que el `row_index` apunta a la misma fila en ambas
-funciones. No hace I/O ni depende de estado.
-"""
-
-import math
-
-import numpy as np
-
-
-def _is_finite_number(v) -> bool:
-    """True si v es int/float finito. bool NO cuenta; NaN/Inf tampoco."""
-    if isinstance(v, bool):
-        return False
-    if not isinstance(v, (int, float)):
-        return False
-    if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
-        return False
-    return True
-
-
-def summarize_outlier_dims(
-    raw_numeric: dict,
-    outlier_rows: list,
-    top_k: int = 3,
-) -> list:
-    """Resume las dimensiones que mas desvian a cada fila anomala.
-
-    Args:
-        raw_numeric: dict {nombre_columna: [valores]} alineado por fila (como
-            ctx['raw_numeric'] del motor AutomaticEDA). Solo se usan columnas
-            cuyos valores sean todos numericos (None permitido por fila; bool,
-            str, NaN e Inf descartan la columna entera) — filtro identico al de
-            isolation_forest_outliers.
-        outlier_rows: lista de {row_index, score} tal como la devuelve
-            isolation_forest_outliers. row_index cuenta SOLO las filas validas
-            (sin None) en orden de aparicion, empezando en 0.
-        top_k: numero de columnas (las de mayor |z-score|) a reportar por cada
-            outlier. Default 3. Valores invalidos caen a 3.
-
-    Returns:
-        Lista paralela a outlier_rows (mismo orden) de dicts
-        {row_index, score, dims}, donde dims es la lista de hasta top_k columnas
-        ordenadas por |z| descendente: [{col, value, z}, ...] con z redondeado a
-        3 decimales. Las entradas de outlier_rows fuera de rango o malformadas se
-        omiten (defensivo). Ante raw_numeric vacio/no-dict, outlier_rows
-        no-lista, 0 columnas numericas o 0 filas validas devuelve [].
-    """
-    # Validacion defensiva de los argumentos principales.
-    if not isinstance(raw_numeric, dict) or not isinstance(outlier_rows, list):
-        return []
-    if not isinstance(top_k, int) or isinstance(top_k, bool) or top_k < 1:
-        top_k = 3
-
-    # Seleccion de columnas numericas: identica a isolation_forest_outliers.
-    # Una columna entra solo si todos sus valores son numericos (None permitido
-    # por fila); cualquier bool/str/NaN/Inf descarta la columna completa.
-    numeric_cols: dict[str, list] = {}
-    for name, values in raw_numeric.items():
-        if not isinstance(values, (list, tuple)):
-            continue
-        ok = True
-        for v in values:
-            if v is None:
-                continue
-            if not _is_finite_number(v):
-                ok = False
-                break
-        if ok:
-            numeric_cols[name] = list(values)
-
-    if len(numeric_cols) < 1:
-        return []
-
-    col_names = list(numeric_cols.keys())
-    try:
-        n_rows_total = min(len(numeric_cols[c]) for c in col_names)
-    except ValueError:
-        return []
-
-    # Reconstruye las filas validas con el MISMO criterio que el detector: la
-    # fila i toma un valor por columna; si cualquier valor es None, la fila se
-    # descarta y NO incrementa el indice valido. Asi row_index de outlier_rows
-    # apunta a esta misma secuencia (base 0, orden de aparicion).
-    valid_rows: list[list[float]] = []
-    for i in range(n_rows_total):
-        row = [numeric_cols[c][i] for c in col_names]
-        if any(v is None for v in row):
-            continue
-        valid_rows.append([float(v) for v in row])
-
-    if not valid_rows:
-        return []
-
-    matrix = np.asarray(valid_rows, dtype=float)
-    n_valid = matrix.shape[0]
-    means = matrix.mean(axis=0)
-    stds = matrix.std(axis=0, ddof=0)  # poblacional (ddof=0)
-
-    out: list = []
-    for entry in outlier_rows:
-        if not isinstance(entry, dict):
-            continue
-        ri = entry.get("row_index")
-        # bool es subclase de int: lo excluimos explicitamente.
-        if not isinstance(ri, int) or isinstance(ri, bool):
-            continue
-        if ri < 0 or ri >= n_valid:
-            continue
-
-        try:
-            score = float(entry.get("score"))
-        except (TypeError, ValueError):
-            score = 0.0
-
-        row = matrix[ri]
-        dims = []
-        for j, name in enumerate(col_names):
-            std = stds[j]
-            if std == 0.0:
-                z = 0.0
-            else:
-                z = float((row[j] - means[j]) / std)
-            dims.append({"col": name, "value": float(row[j]), "z": z})
-
-        # Mayor |z| primero; sort estable, empates por orden de columna.
-        dims.sort(key=lambda d: abs(d["z"]), reverse=True)
-        dims = dims[:top_k]
-        for d in dims:
-            d["z"] = round(d["z"], 3)
-
-        out.append({"row_index": int(ri), "score": score, "dims": dims})
-
-    return out
@@ -1,93 +0,0 @@
-"""Tests para summarize_outlier_dims."""
-
-from isolation_forest_outliers import isolation_forest_outliers
-from summarize_outlier_dims import summarize_outlier_dims
-
-
-# Dataset compartido: 3 columnas, 13 filas. La fila ORIGINAL 6 tiene None en "a"
-# (se descarta), de modo que la fila ORIGINAL 10 -- con un valor extremo en "c"
-# -- queda en el indice VALIDO 9 (no 10). Esto verifica el salto de None.
-A = [0.1, 0.2, -0.1, 0.0, 0.3, -0.2, None, 0.15, -0.05, 0.25, 0.2, -0.3, 0.1]
-B = [1.0, 1.1, 0.9, 1.2, 0.8, 1.0, 1.3, 1.1, 0.95, 1.05, 0.9, 1.15, 1.0]
-C = [5.0, 5.2, 4.8, 5.1, 4.9, 5.0, 5.3, 4.95, 5.05, 4.9, 500.0, 5.1, 5.0]
-RAW = {"a": A, "b": B, "c": C}
-
-# Mapa original -> valido (saltando original 6):
-#   orig: 0 1 2 3 4 5 7 8 9 10 11 12
-#  valid: 0 1 2 3 4 5 6 7 8  9 10 11
-# => el extremo en "c" (original 10) esta en el indice valido 9.
-EXTREME_VALID_INDEX = 9
-
-
-def test_row_index_skips_none_rows():
-    # Mapeo directo (sin depender de la aleatoriedad de IsolationForest): el
-    # indice valido 9 debe corresponder a la fila con c == 500 -> el None de la
-    # fila original 6 se salto correctamente.
-    summary = summarize_outlier_dims(
-        RAW, [{"row_index": EXTREME_VALID_INDEX, "score": -0.5}], top_k=3
-    )
-    assert len(summary) == 1
-    entry = summary[0]
-    assert entry["row_index"] == EXTREME_VALID_INDEX
-    assert entry["score"] == -0.5
-    # La dimension dominante es "c", con su valor extremo y |z| alto.
-    top = entry["dims"][0]
-    assert top["col"] == "c"
-    assert top["value"] == 500.0
-    assert abs(top["z"]) > 2.0
-    # top_k respetado: como mucho 3 dims.
-    assert len(entry["dims"]) <= 3
-
-
-def test_extreme_row_flagged_via_isolation():
-    # Integracion real: detectar outliers y explicarlos.
-    result = isolation_forest_outliers(RAW, contamination=0.1)
-    assert "note" not in result
-    outlier_rows = result["outlier_rows"]
-    assert outlier_rows  # al menos un outlier
-
-    summary = summarize_outlier_dims(RAW, outlier_rows, top_k=3)
-    # Paralela a outlier_rows (todos los indices estan en rango).
-    assert len(summary) == len(outlier_rows)
-
-    by_index = {e["row_index"]: e for e in summary}
-    # El punto extremo debe estar entre los outliers detectados...
-    assert EXTREME_VALID_INDEX in by_index
-    # ...y su dimension top debe ser "c" (donde se desvia ~muchas sigmas).
-    extreme = by_index[EXTREME_VALID_INDEX]
-    assert extreme["dims"][0]["col"] == "c"
-    assert abs(extreme["dims"][0]["z"]) > 2.0
-
-
-def test_out_of_range_row_index_is_ignored():
-    # Indices fuera de rango se omiten en lugar de petar.
-    summary = summarize_outlier_dims(
-        RAW,
-        [
-            {"row_index": 999, "score": -1.0},
-            {"row_index": -1, "score": -1.0},
-            {"row_index": EXTREME_VALID_INDEX, "score": -0.5},
-        ],
-        top_k=2,
-    )
-    # Solo sobrevive el indice valido; los otros dos se descartan.
-    assert len(summary) == 1
-    assert summary[0]["row_index"] == EXTREME_VALID_INDEX
-    assert len(summary[0]["dims"]) <= 2
-
-
-def test_degrades_to_empty_on_invalid_inputs():
-    # raw_numeric vacio + outlier_rows vacio.
-    assert summarize_outlier_dims({}, [], 3) == []
-    # raw_numeric no es dict.
-    assert summarize_outlier_dims("not a dict", [{"row_index": 0}], 3) == []
-    # outlier_rows no es lista.
-    assert summarize_outlier_dims(RAW, "not a list", 3) == []
-    # Sin columnas numericas (todas con strings) -> [].
-    assert summarize_outlier_dims(
-        {"s": ["x", "y", "z"]}, [{"row_index": 0, "score": -1.0}], 3
-    ) == []
-    # Entradas malformadas dentro de outlier_rows se ignoran (no petan).
-    assert summarize_outlier_dims(
-        RAW, ["nope", 42, {"no_row_index": 1}], 3
-    ) == []
@@ -4,8 +4,8 @@ kind: pipeline
 lang: py
 domain: pipelines
 purity: impure
-version: "1.2.0"
-signature: "def render_automatic_eda(db_path: str, table: str, backend: str = \"duckdb\", sample: int = None, run_models: bool = None, run_series: bool = None, run_llm: bool = None, profile_level: str = \"standard\", out_dir: str = \"reports\", basename: str = None, ctx_extra: dict = None, emit_md: bool = True, only_chapters: list = None) -> dict"
+version: "1.1.0"
+signature: "def render_automatic_eda(db_path: str, table: str, backend: str = \"duckdb\", sample: int = None, run_models: bool = None, run_series: bool = None, run_llm: bool = None, profile_level: str = \"standard\", out_dir: str = \"reports\", basename: str = None, ctx_extra: dict = None, emit_md: bool = True) -> dict"
 description: "Informe AutomaticEDA COMPLETO one-shot de una tabla DuckDB/PostgreSQL: perfila con profile_table, construye el ctx con los datos crudos (build_eda_render_ctx: raw_numeric para modelos/geo, timeseries_raw para series, geo_points para el mapa, db_path/table para la agregacion push-down) y emite PDF (A5 movil) Y PPTX (16:9) del mismo documento por capitulos, con los 11 capitulos POBLADOS de verdad (clusters pintados sobre el PCA, evolucion temporal, mapa geografico y tablas de agregacion), no degradados. El parametro profile_level es un preset de consumo CPU/LLM (lite/standard/full) que mapea a los flags run_models/run_series/run_llm/sample; un flag explicito siempre prima sobre el preset. lite=bajo consumo (sin LLM, sin serie, modelos solo PCA+normalidad sin KMeans/IsolationForest, sample reducido); standard=comportamiento historico; full=standard+narrativa LLM. Devuelve las rutas de PDF/PPTX y el manifiesto de versiones por capitulo."
 tags: [eda, duckdb, postgres, profiling, pipeline, dataops, report, pdf, pptx]
 uses_functions:
@@ -46,10 +46,6 @@ params:
    desc: "Nombre base de los archivos sin extension. Default 'aeda_<table>_<timestamp>'."
  - name: ctx_extra
    desc: "Dict opcional con claves de presentacion/contexto extra que se mezclan en el ctx (dataset_name, description, source_origin, ...); no pisan las claves de datos calculadas por build_eda_render_ctx."
-  - name: emit_md
-    desc: "Ademas del PDF y el PPTX, emite un Markdown autocontenido del mismo documento por capitulos (texto + tablas markdown, sin binarios) para pegar a un LLM. Default True. La ruta sale en aeda_md_path."
-  - name: only_chapters
-    desc: "Lista opcional de ids de capitulo a renderizar (subconjunto de CHAPTER_ORDER) para iterar/testear un capitulo suelto sin generar el documento entero. Default None => documento COMPLETO (retrocompatible). Cuando se pasa una lista: (1) se VALIDA contra CHAPTER_ORDER, un id desconocido o lista vacia devuelve error claro listando los validos; (2) se RESUELVEN las dependencias de computo de esos capitulos (automatic_eda.chapter_deps) activando los flags que necesiten (run_models/run_series/run_llm) aunque el caller no los pidiera y construyendo SOLO las piezas de ctx que leen, de modo que el capitulo suelto SIEMPRE llega poblado (p.ej. ['outliers'] activa run_models y conserva raw_numeric -> Isolation Forest completo) sin malgastar CPU/LLM en lo que ningun capitulo pedido usa; (3) el documento y su manifest contienen SOLO esos capitulos MAS portada (primera) y glosario (ultima, cuando hay terminos clicables). Un flag explicito del caller prima sobre la resolucion de dependencias."
 output: "dict {status:'ok', pdf_path:str, pptx_path:str, manifest_path:str|None, n_pages:int, n_slides:int, pdf_note:str, pptx_note:str, profile:<TableProfile>} o {status:'error', error:str} (dict-no-throw)."
 ---

@@ -73,21 +69,6 @@ r = render_automatic_eda("/tmp/ventas.duckdb", "ventas", profile_level="full")
 # Precedencia: el flag explicito SIEMPRE prima sobre el preset. lite pero con LLM:
 r = render_automatic_eda("/tmp/ventas.duckdb", "ventas",
                         profile_level="lite", run_llm=True)  # el LLM SI se ejecuta
-
-# Capitulo SUELTO: itera/testea un capitulo sin generar el documento entero. La
-# resolucion de dependencias activa el computo que el capitulo necesita aunque no
-# se pase explicito. Pedir solo 'outliers' activa run_models y conserva
-# raw_numeric -> el bloque Isolation Forest sale COMPLETO. Documento = portada +
-# outliers + glosario.
-r = render_automatic_eda("/tmp/ventas.duckdb", "ventas", only_chapters=["outliers"])
-
-# Varios capitulos sueltos a la vez (se unen sus dependencias):
-r = render_automatic_eda("/tmp/ventas.duckdb", "ventas",
-                         only_chapters=["correlacion", "missingness"])
-
-# id desconocido -> error claro listando los validos (dict-no-throw, no lanza):
-r = render_automatic_eda("/tmp/ventas.duckdb", "ventas", only_chapters=["nope"])
-# {'status': 'error', 'error': 'only_chapters con ids desconocidos: nope. Capitulos validos: portada, overview, ...'}
 ```

 ## Cuando usarla
@@ -105,16 +86,6 @@ Para un EDA **barato/rapido** (CI, vistazo previo, maquina sin GPU o sin red) us
 temporal y el LLM. Para el **maximo** con interpretacion narrativa por capitulo,
 `profile_level="full"`. El default `"standard"` mantiene el comportamiento previo.

-Cuando estes **iterando o testeando UN capitulo concreto** (afinar el render de
-outliers, comprobar el mapa geoespacial, depurar la agregacion) usa
-`only_chapters=[...]`: genera el documento con solo esos capitulos (+ portada y
-glosario), pero **resuelve sus dependencias de computo** para que el capitulo
-suelto nunca salga degradado — pedir `['outliers']` activa run_models y conserva
-`raw_numeric` aunque no los pases, y a la vez no malgasta CPU/LLM en lo que ningun
-capitulo pedido necesita (pedir `['geospatial']` no corre modelos). Es mucho mas
-rapido que renderizar el informe entero en cada iteracion. El mapa central de
-dependencias vive en `automatic_eda/chapter_deps.py` (fuente de verdad).
-
 ## Gotchas

 - Impura: ESCRIBE el PDF, el PPTX y `automatic_eda_manifest.json` en `out_dir`.
@@ -140,29 +111,9 @@ dependencias vive en `automatic_eda/chapter_deps.py` (fuente de verdad).
 - Los datos crudos del ctx se muestrean con `sample` (LIMIT), no se trae la tabla
  entera a RAM; con tablas enormes sube `sample` si quieres mas representatividad
  (coste: mas memoria).
- **`only_chapters` y el glosario**: el glosario (ultimo capitulo) solo aparece si
-  algun capitulo del cuerpo registro terminos clicables. Un capitulo suelto que no
-  registra terminos (p.ej. `timeseries`, `geospatial`) sale como portada + ese
-  capitulo, sin glosario, porque no hay nada que enlazar — es correcto, no un fallo.
- **`only_chapters` con `profile_level="lite"`**: en capitulos sueltos el preset
-  solo gobierna `sample`; los modelos NO usan el camino "lite" (que podaria
-  `ctx['raw_numeric']` y dejaria a outliers sin su multivariante en vivo). Quien
-  manda en capitulos sueltos es la resolucion de dependencias, no el preset de
-  coste de modelos.

 ## Capability growth log

- v1.2.0 (2026-06-30) — anade el parametro `only_chapters`: renderiza un
-  SUBCONJUNTO de capitulos (para iterar/testear uno suelto) resolviendo sus
-  dependencias de computo via `automatic_eda/chapter_deps.py` (mapa central
-  CHAPTER_DEPS): activa los flags de coste que el capitulo necesita (run_models/
-  run_series/run_llm) aunque el caller no los pase y construye solo las piezas de
-  ctx que lee, de modo que el capitulo suelto SIEMPRE llega poblado (golden:
-  ['outliers'] -> Isolation Forest completo) sin malgastar en lo que no usa. La
-  seleccion viaja a build_document por la clave reservada `ctx['_only_chapters']`
-  (los renderers no cambian). Valida ids (error claro dict-no-throw). Cambio
-  aditivo y retro-compatible: `only_chapters=None` produce el documento completo
-  identico a v1.1.0.
 - v1.1.0 (2026-06-30) — anade el parametro `profile_level` (lite/standard/full),
  preset de consumo CPU/LLM que mapea a los flags run_models/run_series/run_llm/
  sample. lite limita los modelos a PCA+normalidad (cableado a run_eda_models con
@@ -99,7 +99,6 @@ def render_automatic_eda(
    basename: str = None,
    ctx_extra: dict = None,
    emit_md: bool = True,
-    only_chapters: list = None,
 ) -> dict:
    """Perfila una tabla y emite el informe AutomaticEDA completo (PDF + PPTX).

@@ -151,29 +150,6 @@ def render_automatic_eda(
            MISMO documento por capítulos (texto plano + tablas markdown, sin
            binarios), pensado para pegar a un LLM. Default True. La ruta sale en
            la clave de retorno ``aeda_md_path``. No altera las demás salidas.
-        only_chapters: lista opcional de ids de capítulo a renderizar (un
-            SUBCONJUNTO de CHAPTER_ORDER) para iterar/testear un capítulo concreto
-            sin generar el documento entero. Default None => documento COMPLETO,
-            idéntico al de hoy (retrocompatible). Cuando se pasa una lista:
-
-            - Se VALIDA contra CHAPTER_ORDER; un id desconocido devuelve un error
-              claro listando los válidos (dict-no-throw, no lanza). Lista vacía
-              ``[]`` también devuelve error (pasa al menos un capítulo o None).
-            - Se RESUELVEN las dependencias de cómputo de esos capítulos
-              (``automatic_eda.chapter_deps``): se activan los flags de coste que
-              necesiten (run_models / run_series / run_llm) AUNQUE el caller no
-              los pidiera, y se construyen SOLO las piezas de ``ctx`` que esos
-              capítulos leen. Así un capítulo suelto SIEMPRE llega poblado —
-              p.ej. ``only_chapters=['outliers']`` activa run_models y conserva
-              ``ctx['raw_numeric']`` para que el bloque IsolationForest salga
-              completo— y a la vez no se malgasta CPU/LLM en lo que ningún
-              capítulo pedido usa (pedir solo ``geospatial`` no corre modelos).
-            - El documento (PDF/PPTX/MD) y su manifest contienen SOLO esos
-              capítulos, MÁS la portada (primera) y el glosario (última), que se
-              incluyen siempre para que el documento sea válido y los términos
-              clicables tengan destino.
-            - Un flag explícito del caller (run_models/run_series/run_llm != None)
-              SIEMPRE prima sobre lo que resuelvan las dependencias.

    Returns:
        dict (nunca lanza). En éxito::
@@ -193,56 +169,11 @@ def render_automatic_eda(
        # "standard" (comportamiento histórico), sin lanzar.
        preset = _PROFILE_PRESETS.get(profile_level, _PROFILE_PRESETS["standard"])
        sample = preset["sample"] if sample is None else sample
+        run_models = preset["run_models"] if run_models is None else run_models
+        run_series = preset["run_series"] if run_series is None else run_series
+        run_llm = preset["run_llm"] if run_llm is None else run_llm
        model_opts = preset["model_opts"]

-        # 0.bis) Modo "capítulos sueltos": valida la selección y RESUELVE sus
-        # dependencias de cómputo. Es lo que garantiza que un capítulo pedido
-        # llegue completo (activa lo que necesita) sin malgastar en lo que no.
-        # Cuando only_chapters es None se conserva el camino histórico (preset).
-        if only_chapters is not None:
-            from datascience.automatic_eda import CHAPTER_ORDER
-            from datascience.automatic_eda.chapter_deps import (
-                needs_render_ctx,
-                resolve_ctx_data_keys,
-                resolve_requirements,
-                validate_chapter_ids,
-            )
-
-            if not isinstance(only_chapters, (list, tuple)):
-                return {"status": "error",
-                        "error": "only_chapters debe ser una lista de ids de "
-                                 "capítulo o None (documento completo)."}
-            only_chapters = [c for c in only_chapters]
-            if not only_chapters:
-                return {"status": "error",
-                        "error": "only_chapters=[] está vacío. Pasa al menos un "
-                                 "capítulo, o None para el documento completo. "
-                                 "Capítulos válidos: " + ", ".join(CHAPTER_ORDER)}
-            checked = validate_chapter_ids(only_chapters, CHAPTER_ORDER)
-            if checked["unknown"]:
-                return {"status": "error",
-                        "error": "only_chapters con ids desconocidos: "
-                                 + ", ".join(checked["unknown"])
-                                 + ". Capítulos válidos: "
-                                 + ", ".join(CHAPTER_ORDER)}
-            only_chapters = checked["valid"]
-
-            # Las dependencias fijan el DEFAULT de cada flag de coste (eficiencia:
-            # lo que ningún capítulo pedido necesita queda en False); un flag
-            # explícito del caller (!= None) sigue primando.
-            dep_flags = resolve_requirements(only_chapters)["profile_flags"]
-            run_models = ("run_models" in dep_flags) if run_models is None else run_models
-            run_series = ("run_series" in dep_flags) if run_series is None else run_series
-            run_llm = ("run_llm" in dep_flags) if run_llm is None else run_llm
-            # En capítulos sueltos no se usa el camino "modelos baratos" (lite),
-            # que poda ctx['raw_numeric']: un capítulo como outliers lo necesita
-            # para su multivariante en vivo. El preset solo gobierna `sample`.
-            model_opts = None
-        else:
-            run_models = preset["run_models"] if run_models is None else run_models
-            run_series = preset["run_series"] if run_series is None else run_series
-            run_llm = preset["run_llm"] if run_llm is None else run_llm
-
        # En el camino "modelos baratos" (lite) profile_table NO corre los
        # modelos: los ejecuta este pipeline con run_eda_models y la granularidad
        # del preset, evitando pagar el coste CPU de KMeans + IsolationForest.
@@ -286,25 +217,10 @@ def render_automatic_eda(
        if ctx_extra:
            base_ctx.update(ctx_extra)

-        # En modo capítulos sueltos, si NINGÚN capítulo pedido necesita datos
-        # crudos del ctx, se salta build_eda_render_ctx por completo (ahorro real
-        # de I/O): solo se conservan presentación + db_path/table. Si sí los
-        # necesita, se construye el ctx y luego se PODAN las piezas de datos que
-        # ningún capítulo pedido usa (db_path/table nunca se podan).
-        if only_chapters is not None and not needs_render_ctx(only_chapters):
-            ctx = dict(base_ctx)
-            ctx["db_path"] = db_path
-            ctx["table"] = table
-        else:
-            ctx = build_eda_render_ctx(
-                db_path, table, prof, backend=backend, sample=sample,
-                base_ctx=base_ctx,
-            )
-            if only_chapters is not None and isinstance(ctx, dict):
-                keep = resolve_ctx_data_keys(only_chapters)
-                for k in ("head_rows", "raw_numeric", "timeseries_raw", "geo_points"):
-                    if k not in keep:
-                        ctx.pop(k, None)
+        ctx = build_eda_render_ctx(
+            db_path, table, prof, backend=backend, sample=sample,
+            base_ctx=base_ctx,
+        )

        # 2.5) Camino lite — modelos baratos (PCA + normalidad, sin KMeans ni
        # IsolationForest). profile_table no corrió los modelos; aquí se corren
@@ -329,13 +245,6 @@ def render_automatic_eda(
                ctx.pop("raw_numeric", None)

        # 3) Render a ambos formatos desde el MISMO documento por capítulos.
-        # En modo capítulos sueltos, la selección viaja a build_document por una
-        # clave reservada del ctx (los renderers llaman build_document sin pasar
-        # `only`): build_document filtra el cuerpo a esos capítulos y siempre
-        # añade portada (primera) + glosario (última). build_document la consume
-        # y la quita, así que no llega a los capítulos.
-        if only_chapters is not None and isinstance(ctx, dict):
-            ctx["_only_chapters"] = list(only_chapters)
        os.makedirs(out_dir, exist_ok=True)
        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        base = basename or f"aeda_{table}_{ts}"
@@ -374,7 +283,6 @@ def render_automatic_eda(
            "pdf_note": rpdf.get("note"),
            "pptx_note": rpptx.get("note"),
            "md_note": rmd.get("note"),
-            "only_chapters": only_chapters,
            "profile": prof,
        }
    except Exception as e:  # noqa: BLE001 — dict-no-throw: degradar, nunca lanzar.
@@ -1,235 +0,0 @@
-"""Tests del modo `only_chapters` del pipeline render_automatic_eda.
-
-Cubre la tarea de "capítulos sueltos con resolución de dependencias":
-
-  - Golden (DuckDB real): pedir SOLO un capítulo genera un documento con solo
-    portada + ese capítulo + glosario, y el capítulo llega COMPLETO porque la
-    resolución de dependencias activó el cómputo que necesita aunque el caller
-    no lo pidiera (outliers → run_models + raw_numeric → IsolationForest poblado;
-    timeseries → run_series; correlacion → raw_numeric).
-  - Eficiencia: pedir un capítulo que NO necesita flags caros (geospatial) no los
-    activa, y un capítulo puramente agregado (num_distr) ni siquiera construye el
-    ctx de datos crudos.
-  - Edge: id desconocido / lista vacía / no-lista devuelven error claro sin
-    lanzar; only_chapters=None mantiene el comportamiento histórico.
-"""
-
-import json
-import os
-import random
-import sys
-from datetime import date, timedelta
-
-_HERE = os.path.dirname(os.path.abspath(__file__))
-_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", ".."))  # python/functions
-if _FUNCTIONS not in sys.path:
-    sys.path.insert(0, _FUNCTIONS)
-
-import duckdb  # noqa: E402
-
-from pipelines.render_automatic_eda import render_automatic_eda  # noqa: E402
-
-
-def _make_db_models(path):
-    """DB con fecha + 3 numéricas continuas en 3 clusters gaussianos.
-
-    Garantiza material para outliers/modelos (>=2 numéricas → IsolationForest),
-    timeseries (columna DATE) y correlacion (numéricas). Mismo shape que el
-    fixture del test del pipeline base.
-    """
-    con = duckdb.connect(path)
-    con.execute("CREATE TABLE pts (d DATE, grp VARCHAR, x1 DOUBLE, x2 DOUBLE, x3 DOUBLE)")
-    random.seed(42)
-    centers = [(0.0, 0.0, 0.0), (10.0, 10.0, 10.0), (20.0, 5.0, 15.0)]
-    d0 = date(2024, 1, 1)
-    rows = []
-    for i in range(150):
-        cx, cy, cz = centers[i % 3]
-        rows.append((
-            d0 + timedelta(days=i), f"g{i % 3}",
-            round(cx + random.gauss(0, 1.0), 4),
-            round(cy + random.gauss(0, 1.0), 4),
-            round(cz + random.gauss(0, 1.0), 4),
-        ))
-    con.executemany("INSERT INTO pts VALUES (?,?,?,?,?)", rows)
-    con.close()
-
-
-def _manifest_chapters(result):
-    with open(result["manifest_path"], encoding="utf-8") as fh:
-        return set((json.load(fh).get("chapters") or {}).keys())
-
-
-# --------------------------------------------------------------------------- #
-# GOLDEN — outliers suelto: IsolationForest poblado por resolución de deps.
-# --------------------------------------------------------------------------- #
-def test_only_outliers_isolation_forest_populated_without_explicit_run_models(tmp_path):
-    """El corazón de la tarea: pedir SOLO 'outliers' sin run_models explícito
-    activa run_models por dependencias y conserva ctx['raw_numeric'], de modo que
-    el bloque multivariante (Isolation Forest) sale con datos, no degradado."""
-    db = str(tmp_path / "pts.duckdb")
-    _make_db_models(db)
-    out = str(tmp_path / "out")
-
-    # NB: no se pasa run_models — la resolución de dependencias debe activarlo.
-    r = render_automatic_eda(db, "pts", only_chapters=["outliers"],
-                             out_dir=out, basename="only_outliers")
-    assert r["status"] == "ok", r.get("error")
-    assert r["only_chapters"] == ["outliers"]
-
-    # Documento = portada + outliers + glosario, nada más.
-    assert _manifest_chapters(r) == {"portada", "outliers", "glosario"}
-
-    # El multivariante salió POBLADO (no la nota de degradación). Se comprueba en
-    # el Markdown (mismo documento por capítulos, texto plano fiable).
-    md = open(r["aeda_md_path"], encoding="utf-8").read()
-    assert "Filas atípicas (multivariante)" in md
-    assert "Filas analizadas" in md, "el Isolation Forest no trae su tabla poblada"
-    assert "No se pudo analizar la anomalía multivariante" not in md, \
-        "el bloque multivariante salió degradado pese a resolver las deps"
-
-    # La resolución activó run_models → el perfil trae el bloque de modelos.
-    assert ((r["profile"] or {}).get("models") or {}).get("outliers") is not None
-
-
-# --------------------------------------------------------------------------- #
-# GOLDEN — timeseries suelto activa run_series.
-# --------------------------------------------------------------------------- #
-def test_only_timeseries_activates_run_series(tmp_path):
-    db = str(tmp_path / "pts.duckdb")
-    _make_db_models(db)
-    out = str(tmp_path / "out")
-
-    r = render_automatic_eda(db, "pts", only_chapters=["timeseries"],
-                             out_dir=out, basename="only_ts")
-    assert r["status"] == "ok", r.get("error")
-    assert "timeseries" in _manifest_chapters(r)
-    assert "modelos" not in _manifest_chapters(r)
-    # run_series resuelto por deps → el perfil trae el análisis de serie.
-    assert (r["profile"] or {}).get("series") is not None, \
-        "only_chapters=['timeseries'] debe activar run_series"
-
-
-# --------------------------------------------------------------------------- #
-# GOLDEN — correlacion suelto construye raw_numeric (sin activar modelos).
-# --------------------------------------------------------------------------- #
-def test_only_correlacion_builds_raw_numeric_without_models(tmp_path):
-    db = str(tmp_path / "pts.duckdb")
-    _make_db_models(db)
-    out = str(tmp_path / "out")
-
-    r = render_automatic_eda(db, "pts", only_chapters=["correlacion"],
-                             out_dir=out, basename="only_corr")
-    assert r["status"] == "ok", r.get("error")
-    assert _manifest_chapters(r) == {"portada", "correlacion", "glosario"}
-    # Eficiencia: correlacion no necesita los modelos → no se corrieron.
-    assert ((r["profile"] or {}).get("models") or {}).get("outliers") is None
-    assert (r["profile"] or {}).get("series") is None
-
-
-# --------------------------------------------------------------------------- #
-# Eficiencia y precedencia — vía stub (sin DuckDB).
-# --------------------------------------------------------------------------- #
-def _patch(monkeypatch, cap):
-    import pipelines.render_automatic_eda as mod
-
-    def fake_pt(db, t, **kw):
-        cap["run_models"] = kw.get("run_models")
-        cap["run_series"] = kw.get("run_series")
-        cap["run_llm"] = kw.get("run_llm")
-        return {"status": "ok", "profile": {"columns": []}}
-
-    def fake_ctx(db, t, prof, **kw):
-        cap["ctx_called"] = True
-        return {"db_path": db, "table": t}
-
-    cap["ctx_called"] = False
-    monkeypatch.setattr(mod, "profile_table", fake_pt)
-    monkeypatch.setattr(mod, "build_eda_render_ctx", fake_ctx)
-    monkeypatch.setattr(mod, "render_automatic_eda_pdf",
-                        lambda *a, **k: {"path": "x.pdf", "n_pages": 1,
-                                         "manifest_path": "m.json"})
-    monkeypatch.setattr(mod, "render_automatic_eda_pptx",
-                        lambda *a, **k: {"path": "x.pptx", "n_slides": 1})
-    monkeypatch.setattr(mod, "render_automatic_eda_markdown",
-                        lambda *a, **k: {"path": "x.md", "n_chars": 1})
-
-
-def test_only_geospatial_does_not_activate_cost_flags(monkeypatch):
-    """Eficiencia: pedir solo geospatial NO corre modelos/serie/LLM."""
-    cap = {}
-    _patch(monkeypatch, cap)
-    render_automatic_eda("db", "t", only_chapters=["geospatial"])
-    assert cap["run_models"] is False
-    assert cap["run_series"] is False
-    assert cap["run_llm"] is False
-
-
-def test_only_outliers_activates_run_models_via_deps(monkeypatch):
-    cap = {}
-    _patch(monkeypatch, cap)
-    render_automatic_eda("db", "t", only_chapters=["outliers"])
-    assert cap["run_models"] is True
-    assert cap["run_series"] is False
-
-
-def test_explicit_flag_overrides_dependency_resolution(monkeypatch):
-    """run_models=False explícito gana, aunque outliers lo pediría por deps."""
-    cap = {}
-    _patch(monkeypatch, cap)
-    render_automatic_eda("db", "t", only_chapters=["outliers"], run_models=False)
-    assert cap["run_models"] is False
-
-
-def test_purely_aggregated_chapter_skips_render_ctx(monkeypatch):
-    """num_distr solo lee el profile → build_eda_render_ctx no se llama."""
-    cap = {}
-    _patch(monkeypatch, cap)
-    render_automatic_eda("db", "t", only_chapters=["num_distr"])
-    assert cap["ctx_called"] is False, \
-        "num_distr no necesita datos crudos: el ctx no debe construirse"
-
-
-def test_chapter_that_needs_ctx_builds_it(monkeypatch):
-    cap = {}
-    _patch(monkeypatch, cap)
-    render_automatic_eda("db", "t", only_chapters=["outliers"])
-    assert cap["ctx_called"] is True
-
-
-# --------------------------------------------------------------------------- #
-# EDGE — errores claros sin lanzar.
-# --------------------------------------------------------------------------- #
-def test_unknown_chapter_id_returns_clear_error(tmp_path):
-    r = render_automatic_eda(str(tmp_path / "x.duckdb"), "t",
-                             only_chapters=["no_existe"])
-    assert r["status"] == "error"
-    assert "no_existe" in r["error"]
-    assert "Capítulos válidos" in r["error"]
-    # Algún id válido conocido aparece en la lista.
-    assert "outliers" in r["error"]
-
-
-def test_empty_only_list_returns_error(tmp_path):
-    r = render_automatic_eda(str(tmp_path / "x.duckdb"), "t", only_chapters=[])
-    assert r["status"] == "error"
-    assert "vac" in r["error"].lower()
-
-
-def test_only_chapters_not_a_list_returns_error(tmp_path):
-    r = render_automatic_eda(str(tmp_path / "x.duckdb"), "t",
-                             only_chapters="outliers")
-    assert r["status"] == "error"
-
-
-def test_only_none_keeps_full_document(tmp_path):
-    """Retro-compat: only_chapters=None genera el documento completo."""
-    db = str(tmp_path / "pts.duckdb")
-    _make_db_models(db)
-    out = str(tmp_path / "out")
-    r = render_automatic_eda(db, "pts", out_dir=out, basename="full")
-    assert r["status"] == "ok", r.get("error")
-    chapters = _manifest_chapters(r)
-    # Documento completo: muchos más capítulos que portada/glosario.
-    assert {"portada", "glosario", "overview", "correlacion"} <= chapters
-    assert len(chapters) > 4
@@ -9,6 +9,7 @@ dependencies = [
    "contextily>=1.7.0",
    "cryptography>=46.0.6",
    "duckdb>=1.5.2",
+    "faker>=40.27.0",
    "fpdf2>=2.8.7",
    "geopandas>=1.1.3",
    "google-api-python-client>=2.197.0",
@@ -839,6 +839,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" },
 ]

+[[package]]
+name = "faker"
+version = "40.27.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "tzdata", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1a/7b/c62c98764137c949be240ad83f763b6f96cf76055952a3e2835359acc3af/faker-40.27.0.tar.gz", hash = "sha256:f697cf07f461474ad7d511164c21f45317e69f1d531d25f3e0f872b639e346a1", size = 2018361, upload-time = "2026-06-30T18:05:17.775Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c6/b2/788aae329da3d7e4f08f8e1a82e82243c3376c0f3f49b75ae29eea40b371/faker-40.27.0-py3-none-any.whl", hash = "sha256:6099bd6d7bc79041b46c28e100815e2558952bcf384b76ce6c71c8bdca744256", size = 2057897, upload-time = "2026-06-30T18:05:15.555Z" },
+]
+
 [[package]]
 name = "fastapi"
 version = "0.136.3"
@@ -890,6 +902,7 @@ dependencies = [
    { name = "contextily" },
    { name = "cryptography" },
    { name = "duckdb" },
+    { name = "faker" },
    { name = "fpdf2" },
    { name = "geopandas" },
    { name = "google-api-python-client" },
@@ -949,6 +962,7 @@ requires-dist = [
    { name = "contextily", specifier = ">=1.7.0" },
    { name = "cryptography", specifier = ">=46.0.6" },
    { name = "duckdb", specifier = ">=1.5.2" },
+    { name = "faker", specifier = ">=40.27.0" },
    { name = "fpdf2", specifier = ">=2.8.7" },
    { name = "geopandas", specifier = ">=1.1.3" },
    { name = "gliner", marker = "extra == 'nlp'", specifier = ">=0.2.13" },