merge: 4b portada — tamaño grande junto al nombre + descripción y granularidad funcionando (verificado met)
This commit is contained in:
@@ -2,8 +2,17 @@
|
|||||||
|
|
||||||
Builds the document cover from a TableProfile plus an optional ``ctx`` of
|
Builds the document cover from a TableProfile plus an optional ``ctx`` of
|
||||||
presentation metadata. Reads everything defensively (``.get``) and degrades
|
presentation metadata. Reads everything defensively (``.get``) and degrades
|
||||||
honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
|
honestly.
|
||||||
placeholder rather than invented, leaving a hook for the LLM layer to fill it.
|
|
||||||
|
The dataset size (N rows x M columns) is always shown big, as a heading right
|
||||||
|
under the dataset name (kept together in a ``Group``), not buried in the
|
||||||
|
metadata table. The Description and Granularity are resolved through a cascade
|
||||||
|
so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
|
||||||
|
(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
|
||||||
|
``row_meaning``; otherwise a short summary is derived from the profile itself
|
||||||
|
(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
|
||||||
|
key-candidate columns or the table shape. Nothing is invented: the derived
|
||||||
|
fallbacks state that they come from the profile.
|
||||||
|
|
||||||
Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
|
Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
|
||||||
build_<id>(profile: dict, ctx: dict) -> Chapter | None
|
build_<id>(profile: dict, ctx: dict) -> Chapter | None
|
||||||
@@ -17,10 +26,15 @@ from datetime import datetime, timezone
|
|||||||
|
|
||||||
from .. import model
|
from .. import model
|
||||||
|
|
||||||
CHAPTER_VERSION = "1.1.0"
|
CHAPTER_VERSION = "1.2.0"
|
||||||
CHAPTER_ID = "portada"
|
CHAPTER_ID = "portada"
|
||||||
CHAPTER_TITLE = "Portada"
|
CHAPTER_TITLE = "Portada"
|
||||||
|
|
||||||
|
# Key under which eda_llm_insights stores its interpretive block in the profile.
|
||||||
|
# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
|
||||||
|
# row represents) from it when the LLM layer ran (``run_llm``).
|
||||||
|
_LLM_KEY = "llm"
|
||||||
|
|
||||||
# Default human description of what the table quality score measures. Chapters
|
# Default human description of what the table quality score measures. Chapters
|
||||||
# can override it via ctx["quality_criteria"].
|
# can override it via ctx["quality_criteria"].
|
||||||
_DEFAULT_QUALITY_CRITERIA = (
|
_DEFAULT_QUALITY_CRITERIA = (
|
||||||
@@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str:
|
|||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _llm_block(profile: dict, ctx: dict) -> dict:
    """Locate the interpretive LLM block, returning ``{}`` when there is none.

    The block is looked up under the ``_LLM_KEY`` key, first in ``profile`` and
    then in ``ctx``. The first value that is a dict is returned; any other
    value (missing, ``None``, a string, ...) is skipped so callers can always
    use ``.get`` on the result.
    """
    for source in (profile, ctx):
        candidate = source.get(_LLM_KEY)
        if isinstance(candidate, dict):
            return candidate
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _count_column_types(profile: dict, ctx: dict):
|
||||||
|
"""Best-effort (n_numeric, n_categorical) for the dataset.
|
||||||
|
|
||||||
|
Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
|
||||||
|
over the whole body); falls back to counting the profile columns directly so
|
||||||
|
the cover still has the numbers when no summary was passed.
|
||||||
|
"""
|
||||||
|
summary = ctx.get("document_summary")
|
||||||
|
if isinstance(summary, dict):
|
||||||
|
n_num = summary.get("n_numeric")
|
||||||
|
n_cat = summary.get("n_categorical")
|
||||||
|
if n_num is not None or n_cat is not None:
|
||||||
|
return n_num, n_cat
|
||||||
|
cols = profile.get("columns") or []
|
||||||
|
n_num = sum(1 for c in cols if isinstance(c, dict)
|
||||||
|
and c.get("inferred_type") == "numeric")
|
||||||
|
n_cat = sum(1 for c in cols if isinstance(c, dict)
|
||||||
|
and isinstance(c.get("categorical"), dict)
|
||||||
|
and c.get("categorical", {}).get("top")
|
||||||
|
and c.get("inferred_type") != "numeric")
|
||||||
|
return n_num, n_cat
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_description(profile: dict, ctx: dict) -> str:
    """Compose a short description of the dataset from the profile alone.

    The text states the row and column counts, adds the numeric/categorical
    split when either count is non-zero, mentions the quality score when the
    profile has one, and always ends with a sentence saying the summary was
    derived from the profile. The result is therefore never empty.
    """
    row_count = profile.get("n_rows")
    col_count = profile.get("n_cols")
    numeric_count, categorical_count = _count_column_types(profile, ctx)

    opening = (f"Conjunto de datos con {_fmt_int(row_count)} filas y "
               f"{_fmt_int(col_count)} columnas")
    # Only types that are actually present are listed in the parenthesis.
    type_mix = [
        f"{_fmt_int(count)} {label}"
        for count, label in ((numeric_count, "numéricas"),
                             (categorical_count, "categóricas"))
        if count
    ]
    if type_mix:
        opening = f"{opening} ({', '.join(type_mix)})"

    sentences = [f"{opening}."]
    quality = profile.get("quality_score")
    if quality is not None:
        sentences.append(f"Calidad media estimada: {quality}/100.")
    sentences.append(
        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
        "para una descripción de negocio más rica.")
    return " ".join(sentences)
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_granularity(profile: dict, dataset_name: str) -> str:
|
||||||
|
"""A ``Cada fila es…`` granularity sentence from the profile.
|
||||||
|
|
||||||
|
Prefers the key-candidate columns (a row is identified by them); when no key
|
||||||
|
is detected, falls back to the table shape so the line is always meaningful
|
||||||
|
and starts with ``Cada fila es`` as the user requested."""
|
||||||
|
keys = profile.get("key_candidates") or []
|
||||||
|
if keys:
|
||||||
|
shown = ", ".join(str(k) for k in keys[:3])
|
||||||
|
more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
|
||||||
|
return (f"Cada fila es un registro identificado por {shown}{more}, "
|
||||||
|
"candidata(s) a clave por ser únicas y sin nulos.")
|
||||||
|
n_rows = profile.get("n_rows")
|
||||||
|
tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
|
||||||
|
return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
|
||||||
|
"columna identificadora única, así que la granularidad se infiere "
|
||||||
|
"de la forma de la tabla." + tail)
|
||||||
|
|
||||||
|
|
||||||
def build_portada(profile: dict, ctx: dict):
|
def build_portada(profile: dict, ctx: dict):
|
||||||
"""Build the cover Chapter, or None if there is truly nothing to show."""
|
"""Build the cover Chapter, or None if there is truly nothing to show."""
|
||||||
profile = profile or {}
|
profile = profile or {}
|
||||||
@@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict):
|
|||||||
quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
|
quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
|
||||||
quality_value = "—" if score is None else f"{score} / 100"
|
quality_value = "—" if score is None else f"{score} / 100"
|
||||||
|
|
||||||
# Granularity: ctx wins; else derive from key candidates; else be honest.
|
llm = _llm_block(profile, ctx)
|
||||||
|
|
||||||
|
# Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
|
||||||
|
# candidates; finally a shape-based fallback. Always a real "Cada fila es…".
|
||||||
granularity = ctx.get("granularity")
|
granularity = ctx.get("granularity")
|
||||||
if not granularity:
|
if not granularity:
|
||||||
keys = profile.get("key_candidates") or []
|
granularity = (llm.get("row_meaning") or "").strip() or None
|
||||||
if keys:
|
if not granularity:
|
||||||
granularity = ("Cada fila parece identificada por "
|
granularity = _derive_granularity(profile, str(dataset_name))
|
||||||
+ ", ".join(str(k) for k in keys[:3]) + ".")
|
|
||||||
else:
|
|
||||||
granularity = ("Cada fila es… (granularidad no determinada — "
|
|
||||||
"pendiente de la capa de cálculo/LLM).")
|
|
||||||
|
|
||||||
|
# Description: explicit ctx wins; then the LLM "summary"; finally a short
|
||||||
|
# profile-derived summary. Never the old empty placeholder.
|
||||||
description = ctx.get("description")
|
description = ctx.get("description")
|
||||||
if not description:
|
if not description:
|
||||||
description = ("Descripción no provista — pendiente de la capa LLM "
|
description = (llm.get("summary") or "").strip() or None
|
||||||
"(`run_llm`) o de `ctx['description']`.")
|
if not description:
|
||||||
|
description = _derive_description(profile, ctx)
|
||||||
|
|
||||||
blocks = [
|
# Title + dataset size shown together and BIG (Heading) at the top, kept on
|
||||||
|
# the same page (Group). The size is no longer buried in the metadata table.
|
||||||
|
cover = [
|
||||||
model.Heading(text=str(dataset_name), level=1),
|
model.Heading(text=str(dataset_name), level=1),
|
||||||
model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
|
model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
|
||||||
|
model.Heading(text=shape, level=2),
|
||||||
|
]
|
||||||
|
|
||||||
|
blocks = [
|
||||||
|
model.Group(blocks=cover),
|
||||||
model.KVTable(rows=[
|
model.KVTable(rows=[
|
||||||
("Fuente", source_origin),
|
("Fuente", source_origin),
|
||||||
("Almacenamiento", storage),
|
("Almacenamiento", storage),
|
||||||
("Generado", when),
|
("Generado", when),
|
||||||
("Tamaño", shape),
|
|
||||||
("Calidad", quality_value),
|
("Calidad", quality_value),
|
||||||
("Criterios de calidad", quality_criteria),
|
("Criterios de calidad", quality_criteria),
|
||||||
]),
|
]),
|
||||||
|
|||||||
@@ -0,0 +1,197 @@
|
|||||||
|
"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
|
||||||
|
|
||||||
|
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||||
|
and deterministic. Verifies the Fase 4b improvements:
|
||||||
|
|
||||||
|
1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
|
||||||
|
heading kept together with the dataset name in a ``Group`` — and is no longer
|
||||||
|
a row of the metadata table.
|
||||||
|
2. Description and Granularity are resolved through a real cascade and are never
|
||||||
|
the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
|
||||||
|
block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
|
||||||
|
short summary is derived from the profile and a "Cada fila es…" sentence from
|
||||||
|
the key-candidate columns or the table shape.
|
||||||
|
3. The chapter degrades without raising on empty/None input.
|
||||||
|
4. It renders inside the full document to both PDF and PPTX showing that content.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
|
||||||
|
from datascience.automatic_eda.chapters.portada import (
|
||||||
|
CHAPTER_ID, CHAPTER_VERSION, build_portada,
|
||||||
|
)
|
||||||
|
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||||
|
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||||
|
|
||||||
|
|
||||||
|
def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
|
||||||
|
prof = {
|
||||||
|
"table": "titanic",
|
||||||
|
"source": "/data/titanic.csv",
|
||||||
|
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||||
|
"n_rows": 891,
|
||||||
|
"n_cols": 12,
|
||||||
|
"quality_score": 78.0,
|
||||||
|
"columns": [
|
||||||
|
{"name": "PassengerId", "inferred_type": "numeric",
|
||||||
|
"null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
|
||||||
|
"max": 891.0, "std": 257.0}},
|
||||||
|
{"name": "Survived", "inferred_type": "numeric",
|
||||||
|
"null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
|
||||||
|
"max": 1.0, "std": 0.49}},
|
||||||
|
{"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
|
||||||
|
"categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
|
||||||
|
{"value": "female", "count": 314,
|
||||||
|
"pct": 0.35}],
|
||||||
|
"mode": "male", "n_distinct": 2, "entropy": 0.93}},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
if with_keys:
|
||||||
|
prof["key_candidates"] = ["PassengerId"]
|
||||||
|
if with_llm:
|
||||||
|
prof["llm"] = {
|
||||||
|
"summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
|
||||||
|
"row_meaning": "Cada fila es un pasajero del Titanic.",
|
||||||
|
"dictionary": [], "pii": [], "cleaning": [], "analyses": [],
|
||||||
|
}
|
||||||
|
return prof
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path`` as a single string.

    Pages that yield no text contribute an empty string, and every run of
    whitespace is collapsed to a single space so assertions are not affected
    by line wrapping.
    """
    reader = PdfReader(path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return re.sub(r"\s+", " ", "".join(page_texts))
|
||||||
|
|
||||||
|
|
||||||
|
def _pptx_text(path: str) -> str:
    """Return the text found in the presentation at ``path`` as one string.

    Collects the text frame of every shape that has one and the text of every
    table cell (row by row, left to right), joins the pieces with spaces and
    collapses every run of whitespace to a single space.
    """
    collected = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                collected.append(shape.text_frame.text)
            if shape.has_table:
                collected.extend(
                    cell.text
                    for row in shape.table.rows
                    for cell in row.cells
                )
    return re.sub(r"\s+", " ", " ".join(collected))
|
||||||
|
|
||||||
|
|
||||||
|
def _markdown_after(blocks, heading_text):
    """Find the first Markdown block that comes after a matching Heading.

    A Heading matches when ``heading_text`` occurs in its text, ignoring case.
    Only the given ``blocks`` sequence is scanned. If a matching Heading has no
    Markdown after it, the search continues with the next matching Heading.
    Returns ``None`` when nothing is found.
    """
    for position, block in enumerate(blocks, start=1):
        if not isinstance(block, Heading):
            continue
        if heading_text.lower() not in block.text.lower():
            continue
        following = next(
            (later for later in blocks[position:] if isinstance(later, Markdown)),
            None,
        )
        if following is not None:
            return following
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_tamano_grande_y_textos_llm():
    """Golden path: big size heading next to the title, texts from the LLM block."""
    chapter = build_portada(_profile(), {})
    assert chapter is not None
    assert chapter.id == CHAPTER_ID
    assert chapter.version == CHAPTER_VERSION

    # 1) The title and the size live in the same Group; the size is a level-2
    #    heading.
    cover_group = next(blk for blk in chapter.blocks if isinstance(blk, Group))
    cover_blocks = cover_group.blocks
    title = cover_blocks[0]
    assert isinstance(title, Heading) and title.level == 1
    assert title.text == "titanic"
    size_heading = next(
        blk for blk in cover_blocks
        if isinstance(blk, Heading) and blk.level == 2
    )
    assert all(token in size_heading.text for token in ("891", "12"))
    assert all(word in size_heading.text for word in ("filas", "columnas"))

    # 2) The metadata table no longer carries the size row.
    metadata = next(blk for blk in chapter.blocks if isinstance(blk, KVTable))
    row_labels = [row[0] for row in metadata.rows]
    assert "Tamaño" not in row_labels
    assert "Fuente" in row_labels and "Calidad" in row_labels

    # 3) Description and granularity are taken from the LLM block.
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description is not None and "Titanic" in description.text
    assert granularity is not None and granularity.text.startswith("Cada fila es")
    assert "pasajero" in granularity.text.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_fallback_sin_llm_usa_keys_y_perfil():
    """Without an LLM block the description comes from the profile and the
    granularity from the key candidates."""
    chapter = build_portada(_profile(with_llm=False, with_keys=True), {})
    description_text = _markdown_after(chapter.blocks, "Descripción").text
    granularity_text = _markdown_after(chapter.blocks, "Granularidad").text

    # The description is the derived summary, not the old "pendiente"
    # placeholder.
    assert "pendiente" not in description_text.lower()
    assert "891" in description_text and "columnas" in description_text
    assert any(word in description_text for word in ("numéricas", "categóricas"))

    # The granularity names the key candidate and opens with "Cada fila es".
    assert granularity_text.startswith("Cada fila es")
    assert "PassengerId" in granularity_text
    # The ellipsis of the old placeholder must be gone.
    assert "…" not in granularity_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_fallback_sin_llm_sin_keys_usa_forma():
    """Without LLM block or key candidates the granularity still names the
    dataset and is not the old placeholder."""
    chapter = build_portada(_profile(with_llm=False, with_keys=False), {})
    granularity_text = _markdown_after(chapter.blocks, "Granularidad").text
    assert granularity_text.startswith("Cada fila es")
    lowered = granularity_text.lower()
    assert "titanic" in lowered
    assert "pendiente" not in lowered
|
||||||
|
|
||||||
|
|
||||||
|
def test_ctx_explicito_gana_sobre_llm():
    """Explicit ``ctx`` texts take precedence over the LLM block."""
    manual = {
        "description": "Descripción manual.",
        "granularity": "Cada fila es una unidad manual.",
    }
    chapter = build_portada(_profile(), manual)
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description.text == "Descripción manual."
    assert granularity.text == "Cada fila es una unidad manual."
|
||||||
|
|
||||||
|
|
||||||
|
def test_edge_perfil_vacio_no_lanza():
    """Empty or ``None`` inputs do not raise and still give a size and texts."""
    degenerate_inputs = (({}, {}), (None, None))
    for empty_profile, empty_ctx in degenerate_inputs:
        chapter = build_portada(empty_profile, empty_ctx)
        assert chapter is not None

        cover_group = next(blk for blk in chapter.blocks if isinstance(blk, Group))
        size_heading = next(
            blk for blk in cover_group.blocks
            if isinstance(blk, Heading) and blk.level == 2
        )
        assert "filas" in size_heading.text and "columnas" in size_heading.text

        description = _markdown_after(chapter.blocks, "Descripción")
        granularity = _markdown_after(chapter.blocks, "Granularidad")
        assert description.text and "pendiente" not in description.text.lower()
        assert granularity.text.startswith("Cada fila es")
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_render_pdf_muestra_portada():
    """The rendered PDF contains the cover: name, size, description, granularity."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        result = render_automatic_eda_pdf(profile, pdf_path, {"title": "EDA"})
        assert result["path"] == pdf_path and os.path.exists(pdf_path)
        rendered_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in rendered_ids

        text = _pdf_text(pdf_path)
        assert "titanic" in text.lower()
        assert "891" in text and "filas" in text and "columnas" in text
        # The LLM summary is shown as the description.
        assert "Titanic" in text
        # The granularity sentence is present.
        assert "Cada fila es" in text
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_render_pptx_muestra_portada():
    """The rendered PPTX contains the cover: name, size and granularity."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pptx_path = os.path.join(workdir, "eda.pptx")
        result = render_automatic_eda_pptx(profile, pptx_path, {"title": "EDA"})
        assert result["path"] == pptx_path and os.path.exists(pptx_path)
        rendered_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in rendered_ids

        text = _pptx_text(pptx_path)
        assert "titanic" in text.lower()
        assert "891" in text and "columnas" in text
        assert "Cada fila es" in text
|
||||||
Reference in New Issue
Block a user