feat(eda): poblar head_rows real en el capitulo OVERVIEW (df.head)

El capitulo OVERVIEW del motor AutomaticEDA mostraba "df.head no disponible"
porque ninguna fase de calculo poblaba las primeras filas crudas de la tabla.

- build_eda_render_ctx: nuevo bloque que muestrea SELECT * LIMIT head_n
  (param nuevo head_n=10) y lo expone en ctx["head_rows"] como lista de dicts
  fila. Estilo dict-no-throw: si la query falla, se omite la clave.
- profile_table: puebla prof["head_rows"] reusando _sample_rows (SELECT de las
  columnas LIMIT 10) tras recalcular el type_breakdown. Asi el report JSON
  sidecar tambien lo lleva y el capitulo lo recoge via profile aunque no se
  construya el ctx.
- overview.py: la nota del DataTable de df.head ahora indica el total de filas
  del dataset cuando se conoce ("primeras 10 filas de 891"). Bump
  CHAPTER_VERSION 1.0.0 -> 1.1.0.
- overview_test.py (nuevo): golden (head via profile y via ctx, render PDF +
  PPTX muestran las filas reales, placeholder ausente), edge (sin head_rows
  degrada a nota honesta sin romper, None/vacio devuelven None).

Verificado end-to-end con titanic: render_automatic_eda emite PDF + PPTX con
df.head visible (Braund/Cumings/Heikkinen + columnas) y sin el placeholder.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 17:56:24 +02:00
6 changed files with 255 additions and 324 deletions
@@ -20,7 +20,7 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "overview"
 CHAPTER_TITLE = "Overview"

@@ -90,8 +90,14 @@ def _head_block(profile: dict, ctx: dict):
        if not cols:
            cols = list(head[0].keys())
        rows = [[model._safe_str(r.get(c)) for c in cols] for r in head[:10]]
-        return model.DataTable(header=cols, rows=rows,
-                               note=f"primeras {len(rows)} filas")
+        # Honest note: how many rows are shown and, when known, out of how many
+        # rows the dataset has (so "primeras 10 filas de 891" gives context).
+        note = f"primeras {len(rows)} filas"
+        n_rows = profile.get("n_rows")
+        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
+                and n_rows > len(rows):
+            note += f" de {n_rows:,}".replace(",", ".")
+        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
@@ -0,0 +1,187 @@
+"""Tests for the OVERVIEW chapter — DoD: golden + edges + degradation.
+
+Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
+and deterministic. Verifies that ``build_overview`` renders the raw first rows
+(``df.head``) as a DataTable when ``head_rows`` is present — both when it arrives
+via ``profile['head_rows']`` (populated by ``profile_table``) and via
+``ctx['head_rows']`` (populated by ``build_eda_render_ctx``) — that the chapter
+also renders the column dictionary and the numeric describe, that the full
+document renders to PDF and PPTX showing the head values, and that a profile with
+NO head data degrades to an honest note instead of raising or inventing rows.
+"""
+
+import os
+import re
+import tempfile
+
+from pypdf import PdfReader
+from pptx import Presentation
+
+from datascience.automatic_eda.model import DataTable, Note
+from datascience.automatic_eda.chapters.overview import (
+    CHAPTER_ID, CHAPTER_VERSION, build_overview,
+)
+from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
+from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
+
+
+def _columns() -> list:
+    return [
+        {"name": "PassengerId", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.0, "median": 2.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.0}},
+        {"name": "Survived", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 0.33, "median": 0.0, "min": 0.0,
+                                      "max": 1.0, "std": 0.58}},
+        {"name": "Pclass", "inferred_type": "numeric", "null_pct": 0.0,
+         "null_count": 0, "numeric": {"mean": 2.33, "median": 3.0, "min": 1.0,
+                                      "max": 3.0, "std": 1.15}},
+        {"name": "Name", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 3},
+        {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
+         "null_count": 0, "distinct_count": 2,
+         "categorical": {"top": [{"value": "male", "count": 2},
+                                 {"value": "female", "count": 1}]}},
+    ]
+
+
+def _head_rows() -> list:
+    return [
+        {"PassengerId": 1, "Survived": 0, "Pclass": 3,
+         "Name": "Braund Owen", "Sex": "male"},
+        {"PassengerId": 2, "Survived": 1, "Pclass": 1,
+         "Name": "Cumings Florence", "Sex": "female"},
+        {"PassengerId": 3, "Survived": 1, "Pclass": 3,
+         "Name": "Heikkinen Laina", "Sex": "female"},
+    ]
+
+
+def _profile(with_head: bool = True) -> dict:
+    prof = {
+        "table": "titanic",
+        "source": "/data/titanic.csv",
+        "profiled_at": "2026-06-30T10:00:00+00:00",
+        "n_rows": 891,
+        "n_cols": 5,
+        "quality_score": 88.0,
+        "columns": _columns(),
+    }
+    if with_head:
+        prof["head_rows"] = _head_rows()
+    return prof
+
+
+def _pdf_text(path: str) -> str:
+    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
+    return re.sub(r"\s+", " ", txt)
+
+
+def _pptx_text(path: str) -> str:
+    prs = Presentation(path)
+    parts = []
+    for sl in prs.slides:
+        for sh in sl.shapes:
+            if sh.has_text_frame:
+                parts.append(sh.text_frame.text)
+            if sh.has_table:
+                tb = sh.table
+                for r in range(len(tb.rows)):
+                    for c in range(len(tb.columns)):
+                        parts.append(tb.cell(r, c).text)
+    return re.sub(r"\s+", " ", " ".join(parts))
+
+
+def _flatten(blocks):
+    """Recursively flatten Group blocks into a flat list (none here today)."""
+    out = []
+    for b in blocks:
+        inner = getattr(b, "blocks", None)
+        if inner is not None and getattr(b, "kind", None) == "group":
+            out.extend(_flatten(inner))
+        else:
+            out.append(b)
+    return out
+
+
+def test_golden_build_overview_muestra_head_desde_profile():
+    ch = build_overview(_profile(), {})
+    assert ch is not None
+    assert ch.id == CHAPTER_ID
+    assert ch.version == CHAPTER_VERSION
+    blocks = _flatten(ch.blocks)
+    # The first DataTable is df.head: its header is the column names and the
+    # real first rows are present (not a placeholder note).
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert tables, "overview must emit at least the df.head DataTable"
+    head_tbl = tables[0]
+    assert head_tbl.header == ["PassengerId", "Survived", "Pclass",
+                               "Name", "Sex"]
+    assert len(head_tbl.rows) == 3
+    flat = [str(c) for row in head_tbl.rows for c in row]
+    assert "Braund Owen" in flat and "Cumings Florence" in flat
+    # Honest note carries how many rows shown out of the dataset total.
+    assert head_tbl.note is not None
+    assert "primeras 3 filas" in head_tbl.note and "891" in head_tbl.note
+    # No "df.head no disponible" placeholder when head_rows is present.
+    assert not any(isinstance(b, Note) and "no disponible" in b.text
+                   for b in blocks)
+
+
+def test_golden_head_desde_ctx_tambien_funciona():
+    # head_rows absent in profile but present in ctx (build_eda_render_ctx path).
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    flat = [str(c) for row in tables[0].rows for c in row]
+    assert "Braund Owen" in flat
+
+
+def test_golden_render_pdf_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pdf")
+        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pdf_text(out)
+        assert "Braund" in txt and "male" in txt
+        assert "primeras" in txt          # head note rendered.
+        assert "df.head" in txt           # chapter heading rendered.
+        assert "no disponible" not in txt  # placeholder NOT shown.
+
+
+def test_golden_render_pptx_muestra_head():
+    with tempfile.TemporaryDirectory() as d:
+        out = os.path.join(d, "eda.pptx")
+        res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
+        assert res["path"] == out and os.path.exists(out)
+        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
+        txt = _pptx_text(out)
+        assert "Braund" in txt and "Cumings" in txt
+
+
+def test_edge_sin_head_rows_degrada_a_nota_honesta():
+    # No head data anywhere: chapter still builds (columns exist), shows the
+    # honest placeholder note, and never invents rows nor raises.
+    prof = _profile(with_head=False)
+    ch = build_overview(prof, {})
+    assert ch is not None
+    blocks = _flatten(ch.blocks)
+    assert any(isinstance(b, Note) and "no disponible" in b.text
+               for b in blocks)
+    # The first DataTable is now the column dictionary, not df.head rows.
+    tables = [b for b in blocks if isinstance(b, DataTable)]
+    assert all("Braund" not in str(c)
+               for tbl in tables for row in tbl.rows for c in row)
+
+
+def test_edge_none_y_vacio_no_rompen():
+    # Nothing to render at all -> None, no raise.
+    assert build_overview(None, None) is None
+    assert build_overview({}, {}) is None
+    assert build_overview({"columns": []}, {}) is None
+    # Only head_rows (no columns) still yields a chapter with the head table.
+    ch = build_overview({"columns": []}, {"head_rows": _head_rows()})
+    assert ch is not None
+    tables = [b for b in _flatten(ch.blocks) if isinstance(b, DataTable)]
+    assert tables and len(tables[0].rows) == 3
@@ -2,17 +2,8 @@

 Builds the document cover from a TableProfile plus an optional ``ctx`` of
 presentation metadata. Reads everything defensively (``.get``) and degrades
-honestly.
-
-The dataset size (N rows x M columns) is always shown big, as a heading right
-under the dataset name (kept together in a ``Group``), not buried in the
-metadata table. The Description and Granularity are resolved through a cascade
-so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
-(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
-``row_meaning``; otherwise a short summary is derived from the profile itself
-(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
-key-candidate columns or the table shape. Nothing is invented: the derived
-fallbacks state that they come from the profile.
+honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
+placeholder rather than invented, leaving a hook for the LLM layer to fill it.

 Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
    build_<id>(profile: dict, ctx: dict) -> Chapter | None
@@ -26,15 +17,10 @@ from datetime import datetime, timezone

 from .. import model

-CHAPTER_VERSION = "1.2.0"
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "portada"
 CHAPTER_TITLE = "Portada"

-# Key under which eda_llm_insights stores its interpretive block in the profile.
-# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
-# row represents) from it when the LLM layer ran (``run_llm``).
-_LLM_KEY = "llm"
-
 # Default human description of what the table quality score measures. Chapters
 # can override it via ctx["quality_criteria"].
 _DEFAULT_QUALITY_CRITERIA = (
@@ -156,88 +142,6 @@ def _fmt_date_eu(value) -> str:
        return s


-def _llm_block(profile: dict, ctx: dict) -> dict:
-    """Return the interpretive LLM block (``eda_llm_insights`` output), or {}.
-
-    It is stored under ``profile['llm']`` by ``profile_table(run_llm=True)`` and
-    may also be forwarded in ``ctx['llm']``. Read defensively: anything that is
-    not a dict degrades to an empty dict so the cover never raises.
-    """
-    block = profile.get(_LLM_KEY)
-    if not isinstance(block, dict):
-        block = ctx.get(_LLM_KEY)
-    return block if isinstance(block, dict) else {}
-
-
-def _count_column_types(profile: dict, ctx: dict):
-    """Best-effort (n_numeric, n_categorical) for the dataset.
-
-    Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
-    over the whole body); falls back to counting the profile columns directly so
-    the cover still has the numbers when no summary was passed.
-    """
-    summary = ctx.get("document_summary")
-    if isinstance(summary, dict):
-        n_num = summary.get("n_numeric")
-        n_cat = summary.get("n_categorical")
-        if n_num is not None or n_cat is not None:
-            return n_num, n_cat
-    cols = profile.get("columns") or []
-    n_num = sum(1 for c in cols if isinstance(c, dict)
-                and c.get("inferred_type") == "numeric")
-    n_cat = sum(1 for c in cols if isinstance(c, dict)
-                and isinstance(c.get("categorical"), dict)
-                and c.get("categorical", {}).get("top")
-                and c.get("inferred_type") != "numeric")
-    return n_num, n_cat
-
-
-def _derive_description(profile: dict, ctx: dict) -> str:
-    """A short, honest description of the dataset from the profile.
-
-    Used only when no explicit ``ctx['description']`` and no LLM ``summary`` are
-    available. Summarizes shape, column-type mix and quality score; never empty,
-    never invents business meaning (it states the description was derived)."""
-    n_rows = profile.get("n_rows")
-    n_cols = profile.get("n_cols")
-    n_num, n_cat = _count_column_types(profile, ctx)
-    head = f"Conjunto de datos con {_fmt_int(n_rows)} filas y {_fmt_int(n_cols)} columnas"
-    type_bits = []
-    if n_num:
-        type_bits.append(f"{_fmt_int(n_num)} numéricas")
-    if n_cat:
-        type_bits.append(f"{_fmt_int(n_cat)} categóricas")
-    if type_bits:
-        head += " (" + ", ".join(type_bits) + ")"
-    parts = [head + "."]
-    score = profile.get("quality_score")
-    if score is not None:
-        parts.append(f"Calidad media estimada: {score}/100.")
-    parts.append(
-        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
-        "para una descripción de negocio más rica.")
-    return " ".join(parts)
-
-
-def _derive_granularity(profile: dict, dataset_name: str) -> str:
-    """A ``Cada fila es…`` granularity sentence from the profile.
-
-    Prefers the key-candidate columns (a row is identified by them); when no key
-    is detected, falls back to the table shape so the line is always meaningful
-    and starts with ``Cada fila es`` as the user requested."""
-    keys = profile.get("key_candidates") or []
-    if keys:
-        shown = ", ".join(str(k) for k in keys[:3])
-        more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
-        return (f"Cada fila es un registro identificado por {shown}{more}, "
-                "candidata(s) a clave por ser únicas y sin nulos.")
-    n_rows = profile.get("n_rows")
-    tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
-    return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
-            "columna identificadora única, así que la granularidad se infiere "
-            "de la forma de la tabla." + tail)
-
-
 def build_portada(profile: dict, ctx: dict):
    """Build the cover Chapter, or None if there is truly nothing to show."""
    profile = profile or {}
@@ -262,38 +166,30 @@ def build_portada(profile: dict, ctx: dict):
    quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
    quality_value = "—" if score is None else f"{score} / 100"

-    llm = _llm_block(profile, ctx)
-
-    # Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
-    # candidates; finally a shape-based fallback. Always a real "Cada fila es…".
+    # Granularity: ctx wins; else derive from key candidates; else be honest.
    granularity = ctx.get("granularity")
    if not granularity:
-        granularity = (llm.get("row_meaning") or "").strip() or None
-    if not granularity:
-        granularity = _derive_granularity(profile, str(dataset_name))
+        keys = profile.get("key_candidates") or []
+        if keys:
+            granularity = ("Cada fila parece identificada por "
+                           + ", ".join(str(k) for k in keys[:3]) + ".")
+        else:
+            granularity = ("Cada fila es… (granularidad no determinada — "
+                           "pendiente de la capa de cálculo/LLM).")

-    # Description: explicit ctx wins; then the LLM "summary"; finally a short
-    # profile-derived summary. Never the old empty placeholder.
    description = ctx.get("description")
    if not description:
-        description = (llm.get("summary") or "").strip() or None
-    if not description:
-        description = _derive_description(profile, ctx)
-
-    # Title + dataset size shown together and BIG (Heading) at the top, kept on
-    # the same page (Group). The size is no longer buried in the metadata table.
-    cover = [
-        model.Heading(text=str(dataset_name), level=1),
-        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
-        model.Heading(text=shape, level=2),
-    ]
+        description = ("Descripción no provista — pendiente de la capa LLM "
+                       "(`run_llm`) o de `ctx['description']`.")

    blocks = [
-        model.Group(blocks=cover),
+        model.Heading(text=str(dataset_name), level=1),
+        model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
        model.KVTable(rows=[
            ("Fuente", source_origin),
            ("Almacenamiento", storage),
            ("Generado", when),
+            ("Tamaño", shape),
            ("Calidad", quality_value),
            ("Criterios de calidad", quality_criteria),
        ]),
@@ -1,197 +0,0 @@
-"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
-
-Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
-and deterministic. Verifies the Fase 4b improvements:
-
-1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
-   heading kept together with the dataset name in a ``Group`` — and is no longer
-   a row of the metadata table.
-2. Description and Granularity are resolved through a real cascade and are never
-   the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
-   block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
-   short summary is derived from the profile and a "Cada fila es…" sentence from
-   the key-candidate columns or the table shape.
-3. The chapter degrades without raising on empty/None input.
-4. It renders inside the full document to both PDF and PPTX showing that content.
-"""
-
-import os
-import re
-import tempfile
-
-from pypdf import PdfReader
-from pptx import Presentation
-
-from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
-from datascience.automatic_eda.chapters.portada import (
-    CHAPTER_ID, CHAPTER_VERSION, build_portada,
-)
-from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
-from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
-
-
-def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
-    prof = {
-        "table": "titanic",
-        "source": "/data/titanic.csv",
-        "profiled_at": "2026-06-30T10:00:00+00:00",
-        "n_rows": 891,
-        "n_cols": 12,
-        "quality_score": 78.0,
-        "columns": [
-            {"name": "PassengerId", "inferred_type": "numeric",
-             "null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
-                                          "max": 891.0, "std": 257.0}},
-            {"name": "Survived", "inferred_type": "numeric",
-             "null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
-                                          "max": 1.0, "std": 0.49}},
-            {"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
-             "categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
-                                     {"value": "female", "count": 314,
-                                      "pct": 0.35}],
-                             "mode": "male", "n_distinct": 2, "entropy": 0.93}},
-        ],
-    }
-    if with_keys:
-        prof["key_candidates"] = ["PassengerId"]
-    if with_llm:
-        prof["llm"] = {
-            "summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
-            "row_meaning": "Cada fila es un pasajero del Titanic.",
-            "dictionary": [], "pii": [], "cleaning": [], "analyses": [],
-        }
-    return prof
-
-
-def _pdf_text(path: str) -> str:
-    txt = "".join((pg.extract_text() or "") for pg in PdfReader(path).pages)
-    return re.sub(r"\s+", " ", txt)
-
-
-def _pptx_text(path: str) -> str:
-    prs = Presentation(path)
-    parts = []
-    for sl in prs.slides:
-        for sh in sl.shapes:
-            if sh.has_text_frame:
-                parts.append(sh.text_frame.text)
-            if sh.has_table:
-                tb = sh.table
-                for r in range(len(tb.rows)):
-                    for c in range(len(tb.columns)):
-                        parts.append(tb.cell(r, c).text)
-    return re.sub(r"\s+", " ", " ".join(parts))
-
-
-def _markdown_after(blocks, heading_text):
-    """Return the Markdown block that follows a Heading whose text matches."""
-    for i, b in enumerate(blocks):
-        if isinstance(b, Heading) and heading_text.lower() in b.text.lower():
-            for nb in blocks[i + 1:]:
-                if isinstance(nb, Markdown):
-                    return nb
-    return None
-
-
-def test_golden_tamano_grande_y_textos_llm():
-    ch = build_portada(_profile(), {})
-    assert ch is not None
-    assert ch.id == CHAPTER_ID
-    assert ch.version == CHAPTER_VERSION
-
-    # 1) Title + size kept together in a Group; size is a BIG level-2 heading.
-    group = next(b for b in ch.blocks if isinstance(b, Group))
-    inner = group.blocks
-    assert isinstance(inner[0], Heading) and inner[0].level == 1
-    assert inner[0].text == "titanic"
-    size_h = next(b for b in inner if isinstance(b, Heading) and b.level == 2)
-    assert "891" in size_h.text and "12" in size_h.text
-    assert "filas" in size_h.text and "columnas" in size_h.text
-
-    # 2) Size is no longer a row of the metadata table.
-    kv = next(b for b in ch.blocks if isinstance(b, KVTable))
-    labels = [r[0] for r in kv.rows]
-    assert "Tamaño" not in labels
-    assert "Fuente" in labels and "Calidad" in labels
-
-    # 3) Description and Granularity come from the LLM block.
-    desc = _markdown_after(ch.blocks, "Descripción")
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    assert desc is not None and "Titanic" in desc.text
-    assert gran is not None and gran.text.startswith("Cada fila es")
-    assert "pasajero" in gran.text.lower()
-
-
-def test_fallback_sin_llm_usa_keys_y_perfil():
-    # No LLM block: description derived from the profile, granularity from keys.
-    ch = build_portada(_profile(with_llm=False, with_keys=True), {})
-    desc = _markdown_after(ch.blocks, "Descripción")
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    # Description is the derived summary, never the old "pendiente" placeholder.
-    assert "pendiente" not in desc.text.lower()
-    assert "891" in desc.text and "columnas" in desc.text
-    assert "numéricas" in desc.text or "categóricas" in desc.text
-    # Granularity mentions the key candidate and starts with "Cada fila es".
-    assert gran.text.startswith("Cada fila es")
-    assert "PassengerId" in gran.text
-    assert "…" not in gran.text  # the old ellipsis placeholder is gone.
-
-
-def test_fallback_sin_llm_sin_keys_usa_forma():
-    ch = build_portada(_profile(with_llm=False, with_keys=False), {})
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    assert gran.text.startswith("Cada fila es")
-    assert "titanic" in gran.text.lower()
-    assert "pendiente" not in gran.text.lower()
-
-
-def test_ctx_explicito_gana_sobre_llm():
-    ctx = {"description": "Descripción manual.",
-           "granularity": "Cada fila es una unidad manual."}
-    ch = build_portada(_profile(), ctx)
-    desc = _markdown_after(ch.blocks, "Descripción")
-    gran = _markdown_after(ch.blocks, "Granularidad")
-    assert desc.text == "Descripción manual."
-    assert gran.text == "Cada fila es una unidad manual."
-
-
-def test_edge_perfil_vacio_no_lanza():
-    # Empty / None never raise; the cover still shows a size and real texts.
-    for prof, ctx in (({}, {}), (None, None)):
-        ch = build_portada(prof, ctx)
-        assert ch is not None
-        group = next(b for b in ch.blocks if isinstance(b, Group))
-        size_h = next(b for b in group.blocks
-                      if isinstance(b, Heading) and b.level == 2)
-        assert "filas" in size_h.text and "columnas" in size_h.text
-        desc = _markdown_after(ch.blocks, "Descripción")
-        gran = _markdown_after(ch.blocks, "Granularidad")
-        assert desc.text and "pendiente" not in desc.text.lower()
-        assert gran.text.startswith("Cada fila es")
-
-
-def test_golden_render_pdf_muestra_portada():
-    prof = _profile()
-    with tempfile.TemporaryDirectory() as d:
-        out = os.path.join(d, "eda.pdf")
-        res = render_automatic_eda_pdf(prof, out, {"title": "EDA"})
-        assert res["path"] == out and os.path.exists(out)
-        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
-        txt = _pdf_text(out)
-        assert "titanic" in txt.lower()
-        assert "891" in txt and "filas" in txt and "columnas" in txt
-        assert "Titanic" in txt          # LLM summary in the Description.
-        assert "Cada fila es" in txt     # granularity sentence.
-
-
-def test_golden_render_pptx_muestra_portada():
-    prof = _profile()
-    with tempfile.TemporaryDirectory() as d:
-        out = os.path.join(d, "eda.pptx")
-        res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
-        assert res["path"] == out and os.path.exists(out)
-        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
-        txt = _pptx_text(out)
-        assert "titanic" in txt.lower()
-        assert "891" in txt and "columnas" in txt
-        assert "Cada fila es" in txt
@@ -20,6 +20,10 @@ vacia y el resto del ctx se construye igual. Ante un fallo global devuelve al
 menos ``{**base_ctx, "db_path": db_path, "table": table}``.

 Claves de DATOS que produce (las consumen los capitulos):
+  - ``head_rows``      : [ {col: valor, ...}, ... ] primeras filas CRUDAS de la
+                         tabla (``SELECT * LIMIT head_n``), una entrada por fila.
+                         La lee el capitulo OVERVIEW para mostrar df.head real en
+                         lugar del placeholder "df.head no disponible".
  - ``raw_numeric``    : {col: [float|None, ...]} muestra cruda de las columnas
                         numericas, ALINEADA POR FILA (una entrada por fila aunque
                         sea None). La leen modelos (clustering 2D en vivo) y
@@ -56,7 +60,7 @@ def _to_float(value):
        return None


-def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None):
+def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000, base_ctx=None, head_n=10):
    """Construye el ctx de datos crudos para los renderers de AutomaticEDA.

    Args:
@@ -77,13 +81,15 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        base_ctx: dict opcional con claves de presentacion ya preparadas
            (dataset_name, source_origin, ...). Se parte de una copia y NO se
            pisan sus claves; solo se añaden las de datos. Default None -> {}.
+        head_n: numero de filas crudas a muestrear para ``ctx["head_rows"]``
+            (df.head del capitulo OVERVIEW). Default 10. <=0 omite la clave.

    Returns:
        El dict ``ctx`` directamente (NO un wrapper {status,...}): se pasa tal
        cual como ``meta={"ctx": <ese dict>}`` a render_automatic_eda_pdf/pptx.
-        Nunca lanza. Claves que puede contener: raw_numeric, timeseries_raw,
-        geo_points (omitidas si no aplican o fallan), y siempre db_path + table
-        para backends validos.
+        Nunca lanza. Claves que puede contener: head_rows, raw_numeric,
+        timeseries_raw, geo_points (omitidas si no aplican o fallan), y siempre
+        db_path + table para backends validos.
    """
    # Copia de base_ctx: nunca mutamos el dict del caller. Las claves de
    # presentacion que ya traiga se conservan; las de datos se añaden encima.
@@ -117,6 +123,24 @@ def build_eda_render_ctx(db_path, table, profile, backend="duckdb", sample=5000,
        ctx["db_path"] = db_path
        ctx["table"] = table

+        # 1.5) head_rows: primeras filas CRUDAS de la tabla (SELECT * LIMIT n)
+        # para que el capitulo OVERVIEW muestre df.head real en vez del
+        # placeholder. Una sola query, dict-no-throw: si falla, se omite la
+        # clave (el capitulo degrada a su nota honesta). No se pisa una clave
+        # head_rows que ya viniera en base_ctx (presentacion).
+        if head_n and int(head_n) > 0 and "head_rows" not in ctx:
+            try:
+                hq = query_fn(f'SELECT * FROM "{table}" LIMIT {int(head_n)}')
+                if isinstance(hq, dict) and hq.get("status") == "ok":
+                    hrows = [
+                        dict(r) for r in (hq.get("rows") or [])
+                        if isinstance(r, dict)
+                    ]
+                    if hrows:
+                        ctx["head_rows"] = hrows
+            except Exception:  # noqa: BLE001 - dict-no-throw: omitir la clave
+                pass
+
        # 2) Columnas del perfil agregado (lectura defensiva).
        cols = profile.get("columns") if isinstance(profile, dict) else None
        cols = cols or []
@@ -536,6 +536,21 @@ def profile_table(
                type_breakdown[it] += 1
        prof["type_breakdown"] = type_breakdown

+        # 8.1) Primeras filas crudas (df.head) para el capitulo OVERVIEW del motor
+        # AutomaticEDA: una muestra SELECT col1,col2,... LIMIT 10 alineada por fila.
+        # Se reusa _sample_rows (mismo lector read-only). Estilo dict-no-throw: si
+        # falla, head_rows queda None y el capitulo degrada a su nota honesta. El
+        # capitulo lo recoge via profile["head_rows"]; al construir el ctx de render,
+        # build_eda_render_ctx puebla ctx["head_rows"] con su propia SELECT * LIMIT.
+        try:
+            head_names = [c.get("name") for c in cols if c.get("name")]
+            head_rows = _sample_rows(_q, table, head_names, 10)
+            prof["head_rows"] = [
+                dict(r) for r in head_rows if isinstance(r, dict)
+            ] or None
+        except Exception:  # noqa: BLE001
+            prof["head_rows"] = None
+
        # 8.5) Matriz de correlacion/asociacion sobre una muestra de filas
        # alineadas. Elige la metrica por par de tipos (Pearson/Spearman,
        # Cramer's V/Theil's U, correlation ratio, MI) via association_matrix.