feat(eda): poblar head_rows real en el capitulo OVERVIEW (df.head)

El capitulo OVERVIEW del motor AutomaticEDA mostraba "df.head no disponible" porque ninguna fase de calculo poblaba las primeras filas crudas de la tabla.

- build_eda_render_ctx: nuevo bloque que muestrea SELECT * LIMIT head_n (param nuevo head_n=10) y lo expone en ctx["head_rows"] como lista de dicts fila. Estilo dict-no-throw: si la query falla, se omite la clave.
- profile_table: puebla prof["head_rows"] reusando _sample_rows (SELECT de las columnas LIMIT 10) tras recalcular el type_breakdown. Asi el report JSON sidecar tambien lo lleva y el capitulo lo recoge via profile aunque no se construya el ctx.
- overview.py: la nota del DataTable de df.head ahora indica el total de filas del dataset cuando se conoce ("primeras 10 filas de 891"). Bump CHAPTER_VERSION 1.0.0 -> 1.1.0.
- overview_test.py (nuevo): golden (head via profile y via ctx, render PDF + PPTX muestran las filas reales, placeholder ausente), edge (sin head_rows degrada a nota honesta sin romper, None/vacio devuelven None).
- Ademas, este diff retira el enganche del glosario (_TERM_DEFS, _term, _register y los marcadores [[term:KEY]]) de build_agregacion, build_correlacion y build_modelos, y elimina sus tests test_glosario_engancha_groupby_y_pivot, test_glosario_engancha_metodos_y_fdr y test_glosario_engancha_terminos_modelos.

Verificado end-to-end con titanic: render_automatic_eda emite PDF + PPTX con df.head visible (Braund/Cumings/Heikkinen + columnas) y sin el placeholder.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 17:56:24 +02:00
10 changed files with 276 additions and 296 deletions
@@ -89,35 +89,6 @@ _DEF_MAX_CARD = 20
 _DEF_MAX_MEASURES = 4
 _DEF_TOP_N = 12

-# Glossary terms this chapter explains. Both appear in the always-rendered intro,
-# so they are registered and marked clickable whenever a collector is in ctx —
-# the canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
-# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
-# block. Mapping key -> (label, definition).
-_TERM_DEFS = {
-    "groupby": (
-        "Agrupación (split-apply-combine)",
-        "Operación de agrupación (group by): parte la tabla en grupos según los "
-        "valores de una columna categórica, aplica un cálculo (conteo, media, "
-        "mediana…) dentro de cada grupo y combina los resultados en una tabla "
-        "resumen. Es el patrón split-apply-combine."),
-    "pivot_table": (
-        "Tabla dinámica (pivot)",
-        "Tabla dinámica que cruza dos variables categóricas — una en las filas y "
-        "otra en las columnas — y rellena cada celda con un agregado (media, "
-        "suma…) de una medida numérica. Resume de un vistazo cómo interactúan las "
-        "dos categóricas sobre esa medida."),
-}
-
-
-def _term(mark: bool, key: str, text: str) -> str:
-    """Wrap ``text`` as a clickable glossary span when ``mark`` is True.
-
-    The visible text is identical with or without the marker (the renderers strip
-    it), so wrapping never changes line layout — it only adds the link.
-    """
-    return f"[[term:{key}]]{text}[[/term]]" if mark else text
-

 # --------------------------------------------------------------------------- #
 # Formatting helpers (mirror the other chapters' defensive style).
@@ -554,18 +525,13 @@ def _sections_live(profile: dict, ctx: dict, candidates: dict) -> list:
 # --------------------------------------------------------------------------- #
 # Entry point.
 # --------------------------------------------------------------------------- #
-def _intro_blocks(gloss=None, mark_term: bool = False) -> list:
-    if gloss is not None:
-        for key, (label, definition) in _TERM_DEFS.items():
-            gloss.add(key, label, definition)
-    t_groupby = _term(mark_term, "groupby", "**por grupos** (split-apply-combine)")
-    t_pivot = _term(mark_term, "pivot_table", "**tablas dinámicas** (pivot)")
+def _intro_blocks() -> list:
    text = (
-        f"Este capítulo analiza la tabla {t_groupby}: "
+        "Este capítulo analiza la tabla **por grupos** (split-apply-combine): "
        "elige las columnas categóricas más informativas — por su cardinalidad "
        "y relevancia, no todas contra todas, para no inflar comparaciones "
        "espurias — y resume las variables numéricas dentro de cada grupo "
-        f"(conteo, media, mediana, desviación). Las {t_pivot} "
+        "(conteo, media, mediana, desviación). Las **tablas dinámicas** (pivot) "
        "cruzan dos categóricas sobre una medida, y los **gráficos de barras** "
        "(siempre desde cero) comparan los grupos de un vistazo."
    )
@@ -590,21 +556,13 @@ def build_agregacion(profile: dict, ctx: dict):
    if not isinstance(profile, dict):
        return None

-    # Shared glossary collector: groupby + pivot_table live in the always-present
-    # intro, so they are registered + marked there. Degrades silently (mark_term
-    # False) when no collector is in ctx (standalone render).
-    glossary = ctx.get("glossary")
-    gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
-    mark_term = gloss is not None
-
    # Pre-computed results take precedence (offline / tests / forward-compat).
    pre = ctx.get("aggregations")
    if _is_dict(pre) and (pre.get("groupby") or pre.get("pivots")):
        sections = _sections_from_precomputed(pre)
        if not sections:
            return None
-        blocks = (_intro_blocks(gloss, mark_term) + sections
-                  + _insights_section(ctx))
+        blocks = _intro_blocks() + sections + _insights_section(ctx)
        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                             version=CHAPTER_VERSION, blocks=blocks)

@@ -625,11 +583,10 @@ def build_agregacion(profile: dict, ctx: dict):
            "crudos. Pasa ctx['db_path'] + ctx['table'] (para el cálculo "
            "push-down en DuckDB) o ctx['aggregations'] ya precalculado. "
            f"Columnas categóricas candidatas: {keys or '—'}.")
-        blocks = (_intro_blocks(gloss, mark_term) + [note]
-                  + _insights_section(ctx))
+        blocks = _intro_blocks() + [note] + _insights_section(ctx)
        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                             version=CHAPTER_VERSION, blocks=blocks)

-    blocks = _intro_blocks(gloss, mark_term) + sections + _insights_section(ctx)
+    blocks = _intro_blocks() + sections + _insights_section(ctx)
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -254,25 +254,3 @@ def test_anti_corte_muchos_grupos_y_texto_largo():
        # First, middle and last words of the long paragraph all present.
        for i in (0, 60, 119):
            assert f"palabra{i}" in txt
-
-
-def test_glosario_engancha_groupby_y_pivot():
-    """Mejora 4b: la agrupación (split-apply-combine) y la tabla dinámica (pivot)
-    se registran en el colector compartido y se marcan clicables en el cuerpo.
-    Sin colector en ctx, el capítulo degrada y no marca nada."""
-    from datascience.automatic_eda.model import GlossaryCollector
-
-    g = GlossaryCollector()
-    ctx = dict(_ctx_precomputed())
-    ctx["glossary"] = g
-    ch = build_agregacion(_profile(), ctx)
-    assert ch is not None
-    keys = {t["key"] for t in g.terms()}
-    assert {"groupby", "pivot_table"} <= keys
-    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
-    assert "[[term:groupby]]" in body and "[[term:pivot_table]]" in body
-
-    # Sin colector: degrada limpio (ningún marcador en el cuerpo).
-    ch2 = build_agregacion(_profile(), _ctx_precomputed())
-    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
-    assert "[[term:" not in body2
@@ -47,53 +47,6 @@ _MAX_MATRIX_LABELS = 16
 # How many pairs to show in each of the top-positive / top-negative tables.
 _TOP_N = 10

-# Glossary terms this chapter explains. Each is registered in the shared
-# collector (ctx['glossary']) and marked clickable on its first appearance in the
-# body — the canonical two-step pattern (see ``cat_distr`` for the reference
-# implementation): ``glossary.add(key, label, definition)`` + the inline span
-# ``[[term:KEY]]texto visible[[/term]]`` in a Markdown block. Mapping key ->
-# (label, definition). ``fdr`` is only registered when the FDR summary is present.
-_TERM_DEFS = {
-    "pearson": (
-        "Pearson (coeficiente r)",
-        "Coeficiente de correlación lineal de Pearson (r) entre dos variables "
-        "numéricas. Va de −1 (relación lineal inversa perfecta) a +1 (directa "
-        "perfecta); 0 indica ausencia de relación lineal. Sólo capta relaciones "
-        "lineales, por eso lleva signo."),
-    "spearman": (
-        "Spearman (correlación de rangos)",
-        "Correlación de rangos de Spearman: el coeficiente de Pearson calculado "
-        "sobre los puestos (rangos) de los valores en vez de sus magnitudes. Mide "
-        "relaciones monótonas (no necesariamente lineales), va de −1 a +1 y es "
-        "robusta frente a valores atípicos."),
-    "cramers_v": (
-        "Cramér's V",
-        "Medida de asociación entre dos variables categóricas, derivada del "
-        "estadístico chi-cuadrado y normalizada al rango 0–1 (0 = independientes, "
-        "1 = asociación total). No tiene signo: sólo mide la intensidad."),
-    "correlation_ratio": (
-        "Razón de correlación (η)",
-        "Razón de correlación (eta) entre una variable numérica y una "
-        "categórica: la fracción de la varianza de la numérica explicada por los "
-        "grupos de la categórica. Va de 0 (los grupos no explican nada) a 1 (la "
-        "explican toda); no tiene signo."),
-    "fdr": (
-        "Comparaciones múltiples (FDR)",
-        "Al evaluar muchos pares a la vez, algunos parecen significativos por "
-        "puro azar. La corrección por tasa de falsos descubrimientos (FDR, "
-        "Benjamini-Hochberg) ajusta los p-valores para controlar la proporción "
-        "esperada de falsos positivos entre los pares declarados significativos."),
-}
-
-
-def _term(mark: bool, key: str, text: str) -> str:
-    """Wrap ``text`` as a clickable glossary span when ``mark`` is True.
-
-    The visible text is identical with or without the marker (the renderers strip
-    the marker), so wrapping never changes line layout — it only adds the link.
-    """
-    return f"[[term:{key}]]{text}[[/term]]" if mark else text
-

 def _is_num(v) -> bool:
    """True for a real, finite int/float (not bool, not NaN/inf)."""
@@ -292,7 +245,7 @@ def _methods_block(corr: dict):
    return model.KVTable(rows=rows, title="Métodos de asociación")


-def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
+def _fdr_text(corr: dict) -> str | None:
    """One-line summary of the multiple-testing (FDR) correction, or None."""
    mt = corr.get("multiple_testing")
    if not isinstance(mt, dict) or not mt:
@@ -301,8 +254,7 @@ def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
    alpha = mt.get("alpha")
    n_tests = mt.get("n_tests")
    n_rej = mt.get("n_rejected")
-    multi = _term(mark_term, "fdr", "comparaciones múltiples")
-    parts = [f"Corrección por {multi} ({method}"]
+    parts = [f"Corrección por comparaciones múltiples ({method}"]
    if _is_num(alpha):
        parts[0] += f", α={float(alpha):g}"
    parts[0] += ")."
@@ -337,31 +289,13 @@ def build_correlacion(profile: dict, ctx: dict):

    blocks: list = []

-    # Register the always-present method terms in the shared glossary and mark
-    # their first appearance clickable (the FDR term is registered lazily below,
-    # only when the FDR summary is actually emitted). Degrades silently when no
-    # collector is in ctx (standalone render) — mark_term stays False.
-    glossary = ctx.get("glossary")
-    gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
-    mark_term = gloss is not None
-    if gloss is not None:
-        for key in ("pearson", "spearman", "cramers_v", "correlation_ratio"):
-            label, definition = _TERM_DEFS[key]
-            gloss.add(key, label, definition)
-
-    # Intro: what this chapter shows and how to read the sign. Build the marked
-    # method names as locals first (avoids backslash-in-f-string for "Cramér's V").
-    t_pearson = _term(mark_term, "pearson", "Pearson")
-    t_spearman = _term(mark_term, "spearman", "Spearman")
-    t_cramers = _term(mark_term, "cramers_v", "Cramér's V")
-    t_corr_ratio = _term(mark_term, "correlation_ratio", "razón de correlación")
+    # Intro: what this chapter shows and how to read the sign.
    blocks.append(model.Markdown(text=(
        "Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
-        f"sus tipos ({t_pearson}/{t_spearman} entre numéricas — con **signo**; "
-        f"{t_cramers} entre categóricas; {t_corr_ratio} num-categórica; "
-        "información mutua como medida común no lineal). Sólo las correlaciones "
-        "**num-num** tienen dirección: por eso los pares **negativos** son siempre "
-        "num-num.")))
+        "sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
+        "entre categóricas; razón de correlación num-categórica; información mutua "
+        "como medida común no lineal). Sólo las correlaciones **num-num** tienen "
+        "dirección: por eso los pares **negativos** son siempre num-num.")))

    # 1) Association matrix (heatmap).
    labels, trimmed = _ordered_labels(pairs)
@@ -403,13 +337,9 @@ def build_correlacion(profile: dict, ctx: dict):
            "no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
            "sobre los retornos/diferencias antes de interpretarlas.")))

-    # 4) FDR summary + methods legend. Register the FDR term only when its
-    # summary is emitted, so the glossary never lists an unreferenced entry.
-    fdr_text = _fdr_text(corr, mark_term=mark_term)
+    # 4) FDR summary + methods legend.
+    fdr_text = _fdr_text(corr)
    if fdr_text:
-        if gloss is not None:
-            label, definition = _TERM_DEFS["fdr"]
-            gloss.add("fdr", label, definition)
        blocks.append(model.Markdown(text=fdr_text))
    methods = _methods_block(corr)
    if methods is not None:
@@ -173,25 +173,3 @@ def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
        assert rx["path"] == pptx and os.path.exists(pptx) and rx["n_slides"] >= 1
        # A short, unbreakable fragment of the long label survives the wrap.
        assert "azufre" in _pdf_text(pdf)
-
-
-def test_glosario_engancha_metodos_y_fdr():
-    """Mejora 4b: los métodos de correlación (Pearson, Spearman, Cramér's V,
-    razón de correlación) y la corrección por comparaciones múltiples (FDR) se
-    registran en el colector compartido y se marcan clicables en el cuerpo. Sin
-    colector en ctx, el capítulo degrada y no marca nada."""
-    from datascience.automatic_eda.model import GlossaryCollector
-
-    g = GlossaryCollector()
-    ch = build_correlacion(_profile(), {"glossary": g})
-    assert ch is not None
-    keys = {t["key"] for t in g.terms()}
-    assert {"pearson", "spearman", "cramers_v", "correlation_ratio", "fdr"} <= keys
-    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
-    for k in ("pearson", "spearman", "cramers_v", "correlation_ratio", "fdr"):
-        assert f"[[term:{k}]]" in body, k
-
-    # Sin colector: degrada limpio (ningún marcador en el cuerpo).
-    ch2 = build_correlacion(_profile(), {})
-    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
-    assert "[[term:" not in body2
@@ -55,62 +55,6 @@ _CLUSTER_COLORS = [
    "#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac",
 ]

-# Glossary terms this chapter explains. Each is registered in the shared
-# collector (ctx['glossary']) and marked clickable on its first appearance — the
-# canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
-# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
-# block. A term is registered only when its section is actually rendered, so the
-# glossary never lists an entry no in-text appearance points to.
-_TERM_DEFS = {
-    "zscore": (
-        "Estandarización z-score",
-        "Transformación que lleva cada columna numérica a media 0 y desviación "
-        "típica 1: a cada valor le resta la media de su columna y lo divide por "
-        "la desviación típica. Así variables con escalas muy distintas (euros "
-        "frente a un ratio 0–1) pesan por igual en las distancias y la varianza."),
-    "pca": (
-        "PCA (componentes principales)",
-        "El análisis de componentes principales resume muchas variables "
-        "numéricas correlacionadas en pocos ejes nuevos (componentes), "
-        "ortogonales entre sí y ordenados por la cantidad de varianza que "
-        "capturan. Permite ver la estructura de los datos en 2D y saber cuántas "
-        "dimensiones bastan para explicarlos."),
-    "kmeans": (
-        "KMeans (segmentación)",
-        "Algoritmo de agrupamiento no supervisado que reparte las filas en k "
-        "segmentos: asigna cada fila al centro (centroide) más cercano y recoloca "
-        "los centroides de forma iterativa hasta minimizar la distancia interna "
-        "de cada grupo. Aquí k se elige automáticamente."),
-    "silhouette": (
-        "Coeficiente de silueta (silhouette)",
-        "Métrica de calidad de un agrupamiento, en el rango −1 a 1: para cada "
-        "fila compara cómo de cerca está de su propio segmento frente al segmento "
-        "vecino más próximo. Cuanto más alto el promedio, más compactos y "
-        "separados están los segmentos."),
-    "isolation_forest": (
-        "Isolation Forest (anomalías)",
-        "Algoritmo de detección de anomalías multivariante: construye árboles que "
-        "parten el espacio con cortes aleatorios y mide cuántos cortes hacen "
-        "falta para aislar cada fila. Las filas raras se aíslan con muy pocos "
-        "cortes y se marcan como outliers según un umbral de contaminación."),
-}
-
-
-def _term(mark: bool, key: str, text: str) -> str:
-    """Wrap ``text`` as a clickable glossary span when ``mark`` is True.
-
-    The visible text is identical with or without the marker (the renderers strip
-    it), so wrapping never changes line layout — it only adds the link.
-    """
-    return f"[[term:{key}]]{text}[[/term]]" if mark else text
-
-
-def _register(gloss, key: str) -> None:
-    """Register term ``key`` in the collector (idempotent); no-op if gloss None."""
-    if gloss is not None:
-        label, definition = _TERM_DEFS[key]
-        gloss.add(key, label, definition)
-

 # --------------------------------------------------------------------------- #
 # Formatting helpers (mirror the overview chapter's defensive style).
@@ -308,37 +252,34 @@ def _make_cluster_scatter(projection: dict):
 # --------------------------------------------------------------------------- #
 # Section builders. Each returns a list of blocks (possibly empty).
 # --------------------------------------------------------------------------- #
-def _normalization_intro(gloss=None, mark_term: bool = False) -> list:
-    _register(gloss, "zscore")
-    zscore = _term(mark_term, "zscore", "**estandarizan con z-score**")
+def _normalization_intro() -> list:
    text = (
        "Estos modelos son **no supervisados**: buscan estructura latente sin "
        "una variable objetivo. Antes de aplicarlos, todas las columnas "
-        f"numéricas se {zscore} (cada valor menos la media, dividido por la "
-        "desviación típica). Sin esta normalización, una variable con escala "
-        "grande (p.ej. ingresos en euros) dominaría las distancias y la varianza "
-        "frente a otra de escala pequeña (p.ej. un ratio entre 0 y 1), sesgando "
-        "tanto el PCA como el KMeans. Tras la estandarización todas las variables "
-        "pesan por igual."
+        "numéricas se **estandarizan con z-score** (cada valor menos la media, "
+        "dividido por la desviación típica). Sin esta normalización, una "
+        "variable con escala grande (p.ej. ingresos en euros) dominaría las "
+        "distancias y la varianza frente a otra de escala pequeña (p.ej. un "
+        "ratio entre 0 y 1), sesgando tanto el PCA como el KMeans. Tras la "
+        "estandarización todas las variables pesan por igual."
    )
    return [model.Heading(text="Modelos no supervisados", level=1),
            model.Markdown(text=text)]


-def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
+def _pca_section(pca: dict) -> list:
    if not _is_dict(pca) or not pca.get("explained_variance_ratio"):
        return []
-    _register(gloss, "pca")
    blocks = [model.Heading(text="PCA — varianza explicada", level=2)]

    n_used = pca.get("n_rows_used")
    n_feat = pca.get("n_features")
    intro = (
-        f"El {_term(mark_term, 'pca', 'PCA')} resume {_fmt_num(n_feat)} variables "
-        "numéricas en componentes ortogonales ordenados por la varianza que "
-        f"capturan ({_fmt_num(n_used)} filas usadas tras eliminar nulos). El "
-        "gráfico de sedimentación (scree) muestra cuánta varianza aporta cada "
-        "componente y su acumulado: un codo marca cuántos componentes bastan."
+        f"El PCA resume {_fmt_num(n_feat)} variables numéricas en componentes "
+        f"ortogonales ordenados por la varianza que capturan "
+        f"({_fmt_num(n_used)} filas usadas tras eliminar nulos). El gráfico de "
+        "sedimentación (scree) muestra cuánta varianza aporta cada componente y "
+        "su acumulado: un codo marca cuántos componentes bastan."
    )
    blocks.append(model.Markdown(text=intro))

@@ -384,14 +325,11 @@ def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
    return blocks


-def _kmeans_section(kmeans: dict, projection: dict, titles,
-                    gloss=None, mark_term: bool = False) -> list:
+def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
    has_km = _is_dict(kmeans) and kmeans.get("best_k")
    has_proj = _is_dict(projection) and projection.get("points")
    if not has_km and not has_proj:
        return []
-    _register(gloss, "kmeans")
-    _register(gloss, "silhouette")

    blocks = [model.Heading(text="Segmentación (KMeans)", level=2)]

@@ -399,11 +337,9 @@ def _kmeans_section(kmeans: dict, projection: dict, titles,
    sil = (projection or {}).get("silhouette")
    if sil is None:
        sil = (kmeans or {}).get("silhouette")
-    t_kmeans = _term(mark_term, "kmeans", "KMeans")
-    t_sil = _term(mark_term, "silhouette", "*silhouette*")
    intro = (
-        f"{t_kmeans} agrupa las filas en **{_fmt_num(best_k)} segmentos** "
-        f"elegidos automáticamente maximizando el coeficiente de {t_sil} "
+        f"KMeans agrupa las filas en **{_fmt_num(best_k)} segmentos** elegidos "
+        "automáticamente maximizando el coeficiente de *silhouette* "
        f"(**{_fmt_num(sil)}**, rango −1 a 1: cuanto más alto, segmentos más "
        "compactos y separados). Los segmentos se proyectan sobre el plano de "
        "los dos primeros componentes principales para visualizarlos."
@@ -458,18 +394,16 @@ def _kmeans_section(kmeans: dict, projection: dict, titles,
    return blocks


-def _outliers_section(outliers: dict, gloss=None, mark_term: bool = False) -> list:
+def _outliers_section(outliers: dict) -> list:
    if not _is_dict(outliers) or outliers.get("n_outliers") is None:
        return []
    if outliers.get("note") and not outliers.get("n_rows_used"):
        # insufficient data — nothing meaningful to show.
        return []
-    _register(gloss, "isolation_forest")
    blocks = [model.Heading(text="Detección de anomalías (Isolation Forest)",
                            level=2)]
-    isof = _term(mark_term, "isolation_forest", "**Isolation Forest**")
    explain = (
-        f"{isof} detecta filas anómalas de forma *multivariante*: "
+        "**Isolation Forest** detecta filas anómalas de forma *multivariante*: "
        "construye árboles que parten el espacio con cortes aleatorios y mide "
        "cuántos cortes hacen falta para aislar cada fila. Las filas raras "
        "(combinaciones de valores poco frecuentes considerando **todas las "
@@ -550,21 +484,15 @@ def build_modelos(profile: dict, ctx: dict):
        (kmeans and kmeans.get("best_k")) or (projection and projection.get("points"))
    ) else None

-    # Shared glossary collector: terms are registered + marked clickable inside
-    # each section, only when that section actually renders (no orphan entries).
-    glossary = ctx.get("glossary")
-    gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
-    mark_term = gloss is not None
-
    sections = []
-    sections += _pca_section(pca, gloss, mark_term) if pca else []
-    sections += _kmeans_section(kmeans, projection, titles, gloss, mark_term)
-    sections += _outliers_section(outliers, gloss, mark_term) if outliers else []
+    sections += _pca_section(pca) if pca else []
+    sections += _kmeans_section(kmeans, projection, titles)
+    sections += _outliers_section(outliers) if outliers else []
    sections += _normality_section(normality) if normality else []

    if not sections:
        return None  # models block present but nothing renderable.

-    blocks = _normalization_intro(gloss, mark_term) + sections
+    blocks = _normalization_intro() + sections
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
@@ -257,26 +257,3 @@ def test_anticortes_tabla_normalidad_larga_no_corta():
        # Every column name survives (wrapped/split, never truncated).
        for i in (0, 19, 39):
            assert f"col_{i}" in txt
-
-
-def test_glosario_engancha_terminos_modelos():
-    """Mejora 4b: PCA, KMeans, silhouette, Isolation Forest y la estandarización
-    z-score se registran en el colector compartido y se marcan clicables en el
-    cuerpo. Sin colector en ctx, el capítulo degrada y no marca nada."""
-    from datascience.automatic_eda.model import GlossaryCollector
-
-    g = GlossaryCollector()
-    ctx = dict(_ctx_full())
-    ctx["glossary"] = g
-    ch = build_modelos(_profile(), ctx)
-    assert ch is not None
-    keys = {t["key"] for t in g.terms()}
-    assert {"zscore", "pca", "kmeans", "silhouette", "isolation_forest"} <= keys
-    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
-    for k in ("zscore", "pca", "kmeans", "silhouette", "isolation_forest"):
-        assert f"[[term:{k}]]" in body, k
-
-    # Sin colector: degrada limpio (ningún marcador en el cuerpo).
-    ch2 = build_modelos(_profile(), _ctx_full())
-    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
-    assert "[[term:" not in body2
@@ -20,7 +20,7 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"

@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
        if not cols:
            cols = list(head[0].keys())
        rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
-        return model.DataTable(header=cols, rows=rows,
-                               note=f"primeras {len(rows)} filas")
+        # Honest note: how many rows are shown and, when known, out of how many
+        # rows the dataset has (so "primeras 10 filas de 891" gives context).
+        note = f"primeras {len(rows)} filas"
+        n_rows = profile.get("n_rows")
+        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
+                and n_rows > len(rows):
+            note += f" de {n_rows:,}".replace(",", ".")
+        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
@@ -0,0 +1,187 @@
+"""Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies that ``build_overview`` renders the raw first rows
+(``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
+via ``profile['head_rows']`` (populated by ``profile_table``) and via
+``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
+also renders the column dictionary and the numeric describe, that the full
+document renders to PDF and PPTX showing the head values, and that a profile with
+NO head data degrades to an honest note instead of raising or inventing rows.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import DataTable, Note
+from datascience.automatic_eda.chapters.overview import (
+    CHAPTER_ID, CHAPTER_VERSION, build_overview,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _columns() -> list:
+    return [
+        {"name": "PassengerId", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.0, "median": 2.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.0}},
+        {"name": "Survived", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 0.33, "median": 0.0, "min": 0.0,
+                                      "max": 1.0, "std": 0.58}},
+        {"name": "Pclass", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.33, "median": 3.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.15}},
+        {"name": "Name", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 3},
+        {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 2,
+         "categorical": {"top": [{"value": "male", "count": 2},
+                                 {"value": "female", "count": 1}]}},
+    ]
+
+
+def _head_rows() -> list:
+    return [
+        {"PassengerId": 1, "Survived": 0, "Pclass": 3,
+         "Name": "Braund Owen", "Sex": "male"},
+        {"PassengerId": 2, "Survived": 1, "Pclass": 1,
+         "Name": "Cumings Florence", "Sex": "female"},
+        {"PassengerId": 3, "Survived": 1, "Pclass": 3,
+         "Name": "Heikkinen Laina", "Sex": "female"},
+    ]
+
+
+def _profile(with_head: bool = True) -> dict:
+    prof = {
+        "table": "titanic",
+        "source": "/data/titanic.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 891,
+        "n_cols": 5,
+        "quality_score": 88.0,
+        "columns": _columns(),
+    }
+    if with_head:
+        prof["head_rows"] = _head_rows()
+    return prof
+
+
+def _pdf_text(path: str) -> str:
+    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def _pptx_text(path: str) -> str:
+    prs = Presentation(path)
+    parts = []
+    for sl in prs.slides:
+        for sh in sl.shapes:
+            if sh.has_text_frame:
+                parts.append(sh.text_frame.text)
+            if sh.has_table:
+                tb = sh.table
+                for r in range(len(tb.rows)):
+                    for c in range(len(tb.columns)):
+                        parts.append(tb.cell(r, c).text)
+    return re.sub(r"\s+", " ", " ".join(parts))
+
+
+def _flatten(blocks):
+    """Recursively flatten Group blocks into a flat list (none here today)."""
+    out = []
+    for b in blocks:
+        inner = getattr(b, "blocks", None)
+        if inner is not None and getattr(b, "kind", None) == "group":
+            out.extend(_flatten(inner))
+        else:
+            out.append(b)
+    return out
+
+
+def test_golden_build_overview_muestra_head_desde_profile():
+    ch = build_overview(_profile(), {})
+    assert ch is not None
+    assert ch.id == CHAPTER_ID
+    assert ch.version == CHAPTER_VERSION
+    blocks = _flatten(ch.blocks)
+    # The first DataTable is df.head: its header is the column names and the
+    # real first rows are present (not a placeholder note).
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert tables, "overview must emit at least the df.head DataTable"
+    head_tbl = tables[0]
+    assert head_tbl.header == ["PassengerId", "Survived", "Pclass",
+                               "Name", "Sex"]
+    assert len(head_tbl.rows) == 3
+    flat = [str(c) for row in head_tbl.rows for c in row]
+    assert "Braund Owen" in flat and "Cumings Florence" in flat
+    # Honest note carries how many rows shown out of the dataset total.
+    assert head_tbl.note is not None
+    assert "primeras 3 filas" in head_tbl.note and "891" in head_tbl.note
+    # No "df.head no disponible" placeholder when head_rows is present.
+    assert not any(isinstance(b, Note) and "no disponible" in b.text
+                   for b in blocks)
+
+
+def test_golden_head_desde_ctx_tambien_funciona():
+    # head_rows absent in profile but present in ctx (build_eda_render_ctx path).
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    flat = [str(c) for row in tables[0].rows for c in row]
+    assert "Braund Owen" in flat
+
+
+def test_golden_render_pdf_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pdf")
+        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pdf_text(out)
+        assert "Braund" in txt and "male" in txt
+        assert "primeras" in txt          # head note rendered.
+        assert "df.head" in txt           # chapter heading rendered.
+        assert "no disponible" not in txt  # placeholder NOT shown.
+
+
+def test_golden_render_pptx_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pptx_text(out)
+        assert "Braund" in txt and "Cumings" in txt
+
+
+def test_edge_sin_head_rows_degrada_a_nota_honesta():
+    # No head data anywhere: chapter still builds (columns exist), shows the
+    # honest placeholder note, and never invents rows nor raises.
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {})
+    assert ch is not None
+    blocks = _flatten(ch.blocks)
+    assert any(isinstance(b, Note) and "no disponible" in b.text
+               for b in blocks)
+    # The first DataTable now is the column dictionary, not df.head rows.
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert all("Braund" not in str(c)
+               for tbl in tables for row in tbl.rows for c in row)
+
+
+def test_edge_none_y_vacio_no_rompen():
+    # Nothing to render at all -> None, no raise.
+    assert build_overview(None, None) is None
+    assert build_overview({}, {}) is None
+    assert build_overview({"columns": []}, {}) is None
+    # Only head_rows (no columns) still yields a chapter with the head table.
+    ch = build_overview({"columns": []}, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    assert tables and len(tables[0].rows) == 3
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
 menos ``{**base_ctx, "db_path": db_path, "table": table}``.

 Claves de DATOS que produce (las consumen los capitulos):
+  - ``head_rows``      : [ {col: valor, ...}, ... ] primeras filas CRUDAS de la
+                         tabla (``SELECT * LIMIT head_n``), una entrada por fila.
+                         La lee el capitulo OVERVIEW para mostrar df.head real en
+                         lugar del placeholder "df.head no disponible".
  - ``raw_numeric``    : {col: [float|None, ...]} muestra cruda de las columnas
                         numericas, ALINEADA POR FILA (una entrada por fila aunque
                         sea None). La leen modelos (clustering 2D en vivo) y
@@ -56,7 +60,7 @@ def _to_float(value):
        return None


-def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
+def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
    """Construye el ctx de datos crudos para los renderers de AutomaticEDA.

    Args:
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        base_ctx: dict opcional con claves de presentacion ya preparadas
            (dataset_name, source_origin, ...). Se parte de una copia y NO se
            pisan sus claves; solo se añaden las de datos. Default None -> {}.
+        head_n: numero de filas crudas a muestrear para ``ctx["head_rows"]``
+            (df.head del capitulo OVERVIEW). Default 10. <=0 omite la clave.

    Returns:
        El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
        cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
-        Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
-        geo_points (omitidas si no aplican o fallan), y siempre db_path + table
-        para backends validos.
+        Nunca lanza. Claves que puede contener: head_rows, raw_numeric,
+        timeseries_raw, geo_points (omitidas si no aplican o fallan), y siempre
+        db_path + table para backends validos.
    """
    # Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
    # presentacion que ya traiga se conservan; las de datos se añaden encima.
@@ -117,6 +123,24 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        ctx["db_path"] = db_path
        ctx["table"] = table

+        # 1.5) head_rows: primeras filas CRUDAS de la tabla (SELECT * LIMIT n)
+        # para que el capitulo OVERVIEW muestre df.head real en vez del
+        # placeholder. Una sola query, dict-no-throw: si falla, se omite la
+        # clave (el capitulo degrada a su nota honesta). No se pisa una clave
+        # head_rows que ya viniera en base_ctx (presentacion).
+        if head_n and int(head_n) > 0 and "head_rows" not in ctx:
+            try:
+                hq = query_fn(f'SELECT * FROM "{table}" LIMIT {int(head_n)}')
+                if isinstance(hq, dict) and hq.get("status") == "ok":
+                    hrows = [
+                        dict(r) for r in (hq.get("rows") or [])
+                        if isinstance(r, dict)
+                    ]
+                    if hrows:
+                        ctx["head_rows"] = hrows
+            except Exception:  # noqa: BLE001 - dict-no-throw: omitir la clave
+                pass
+
        # 2) Columnas del perfil agregado (lectura defensiva).
        cols = profile.get("columns") if isinstance(profile, dict) else None
        cols = cols or []
@@ -536,6 +536,21 @@ def profile_table(
                type_breakdown[it] += 1
        prof["type_breakdown"] = type_breakdown

+        # 8.1) Primeras filas crudas (df.head) para el capitulo OVERVIEW del motor
+        # AutomaticEDA: una muestra SELECT col1,col2,... LIMIT 10 alineada por fila.
+        # Se reusa _sample_rows (mismo lector read-only). Estilo dict-no-throw: si
+        # falla, head_rows queda None y el capitulo degrada a su nota honesta. El
+        # capitulo lo recoge via profile["head_rows"]; build_eda_render_ctx ademas
+        # puebla ctx["head_rows"] con su propia consulta (SELECT * LIMIT head_n).
+        try:
+            head_names = [c.get("name") for c in cols if c.get("name")]
+            head_rows = _sample_rows(_q, table, head_names, 10)
+            prof["head_rows"] = [
+                dict(r) for r in head_rows if isinstance(r, dict)
+            ] or None
+        except Exception:  # noqa: BLE001
+            prof["head_rows"] = None
+
        # 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
        # alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
        # Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.