feat(datascience): auto-commit con 7 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-03 00:48:43 +02:00
parent 5a4f82cf76
commit 8a78a70ef6
7 changed files with 817 additions and 8 deletions
@@ -17,7 +17,7 @@ from __future__ import annotations
from .. import model
CHAPTER_VERSION = "1.1.0"
CHAPTER_VERSION = "1.1.1"
CHAPTER_ID = "glosario"
CHAPTER_TITLE = "Glosario"
@@ -89,14 +89,19 @@ def build_glosario(profile: dict, ctx: dict):
"Cada término va resaltado en el texto y, al pulsarlo, salta a su "
"definición en esta sección.")),
]
# One clickable destination per term, alphabetically by visible label. A term
# registered without a definition is completed from the canonical baseline.
for term in glossary.terms(by="label"):
# One clickable destination per term, alphabetically by *visible* label. The
# baseline resolution must happen BEFORE sorting: a term registered bare (no
# label) carries its key as label in the collector, so ordering by the
# collector's label would place it by its key instead of by the human label
# supplied by the baseline catalog. Resolve first, then sort by the final label.
resolved = []
for term in glossary.terms(by="order"):
label, definition = _resolve_term(term)
resolved.append((label, definition, model._safe_str(term.get("key"))))
resolved.sort(key=lambda e: model._safe_str(e[0]).lower())
for label, definition, key in resolved:
blocks.append(model.GlossaryEntry(
key=model._safe_str(term.get("key")),
label=label,
definition=definition))
key=key, label=label, definition=definition))
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
version=CHAPTER_VERSION, blocks=blocks)
@@ -0,0 +1,181 @@
"""Tests for the GLOSARIO chapter — DoD: golden + edges + degradation + no-cut render.
The glossary is the last chapter of every AutomaticEDA document. It does not read
the profile: it turns the terms that the other chapters registered on the shared
``GlossaryCollector`` (``ctx['glossary']``) into one clickable ``GlossaryEntry``
destination each, alphabetically by visible label.
Covered here:
- **Golden**: a collector with three terms (one carrying its own definition, two
registered bare and completed from the canonical baseline catalog) builds a
``Chapter`` with three ``GlossaryEntry`` blocks, alphabetically ordered, and
renders to PDF and PPTX with nothing cut.
- **Baseline resolution** (``_resolve_term``): a bare term whose key is in the
baseline gets its label *and* definition filled in; a term that already carries
its own definition is never overwritten.
- **Edges**: ``None`` / ``{}`` ctx, an empty collector and a non-collector value in
``ctx['glossary']`` all return ``None`` (the chapter simply disappears) and never
raise, even with a ``None`` profile.
- **Click target**: every emitted entry carries the registered ``key`` so each
in-text ``[[term:key]]`` appearance resolves to a real jump.
"""
import os
import tempfile
from pptx import Presentation
from pypdf import PdfReader
from datascience.automatic_eda.chapters.glosario import (
_BASELINE_TERMS,
_resolve_term,
build_glosario,
)
from datascience.automatic_eda.model import (
Chapter,
GlossaryCollector,
GlossaryEntry,
)
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
# --------------------------------------------------------------------------- #
# Helpers.
# --------------------------------------------------------------------------- #
def _entries(chapter: Chapter) -> list:
    """Collect the ``GlossaryEntry`` blocks of ``chapter``, keeping block order.

    Any block of ``chapter.blocks`` that is not a ``GlossaryEntry`` instance is
    left out of the returned list.
    """
    found = []
    for block in chapter.blocks:
        if isinstance(block, GlossaryEntry):
            found.append(block)
    return found
def _render_both(chapter: Chapter, tag: str):
    """Render ``chapter`` to PDF and PPTX and summarise both outputs.

    Both files are written into a scratch directory that is removed before this
    helper returns, so repeated test runs do not pile up temporary directories.

    Args:
        chapter: The chapter to render; it is passed to both renderers as a
            one-element list.
        tag: Short label used in the scratch-directory prefix and in the
            document title.

    Returns:
        A ``(pdf_text, n_slides)`` tuple: the text extracted from every PDF page
        concatenated together, and the number of slides in the PPTX.

    Raises:
        AssertionError: If either output file is missing or empty.
    """
    meta = {"title": f"EDA — {tag}"}
    # The context manager deletes the directory (and both files) on exit, even
    # when a renderer or one of the assertions below raises.
    with tempfile.TemporaryDirectory(prefix=f"glosario_{tag}_") as tmp:
        pdf_path = os.path.join(tmp, "out.pdf")
        pptx_path = os.path.join(tmp, "out.pptx")
        render_automatic_eda_pdf([chapter], pdf_path, meta)
        render_automatic_eda_pptx([chapter], pptx_path, meta)
        assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
        assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0
        # Everything needed from the files is read while they still exist.
        # A page with no extractable text yields a falsy value; treat it as "".
        text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages)
        n_slides = len(Presentation(pptx_path).slides)
    return text, n_slides
def _collector_three_terms() -> GlossaryCollector:
    """Build the shared three-term fixture collector.

    Registration order is deliberately not the alphabetical order of the
    visible labels:

    1. ``entropia`` is added with an explicit label and definition, which the
       baseline must not overwrite.
    2. ``pagina_categorica`` is added bare (key only).
    3. ``histograma_boxplot`` is added bare (key only).

    The two bare terms are expected to be completed from the baseline catalog
    when the chapter is built.
    """
    collector = GlossaryCollector()
    collector.add(
        "entropia",
        "Entropía",
        "Medida de la incertidumbre o dispersión de una variable categórica.",
    )
    # Bare registrations: only the key is supplied.
    for bare_key in ("pagina_categorica", "histograma_boxplot"):
        collector.add(bare_key)
    return collector
# --------------------------------------------------------------------------- #
# Golden.
# --------------------------------------------------------------------------- #
def test_golden_terms_render_clickable_entries():
    """Golden path: three registered terms become three ordered, rendered entries."""
    collector = _collector_three_terms()
    built = build_glosario({"table": "x"}, {"glossary": collector})

    # Chapter identity.
    assert isinstance(built, Chapter)
    assert built.id == "glosario"
    assert built.title == "Glosario"
    assert built.version == "1.1.1"

    # One GlossaryEntry per registered term.
    glossary_entries = _entries(built)
    assert len(glossary_entries) == 3
    assert all(isinstance(entry, GlossaryEntry) for entry in glossary_entries)

    # Entries come out sorted case-insensitively by their visible label:
    # "Cómo leer…" first, "Entropía" last.
    visible = [entry.label for entry in glossary_entries]
    assert visible == sorted(visible, key=str.lower)
    first_label, last_label = visible[0], visible[-1]
    assert first_label == "Cómo leer el histograma y el boxplot"
    assert last_label == "Entropía"

    # The two bare terms carry baseline definitions; the term registered with
    # its own definition keeps it.
    entry_for = {entry.key: entry for entry in glossary_entries}
    assert "boxplot de Tukey" in entry_for["histograma_boxplot"].definition
    assert "identificador" in entry_for["pagina_categorica"].definition
    assert entry_for["entropia"].definition.startswith("Medida de la incertidumbre")

    # Both renderers produce output: the "Entropía" label shows up in the
    # extracted PDF text and the PPTX has at least one slide.
    pdf_text, n_slides = _render_both(built, "golden")
    assert "Entropía" in pdf_text
    assert n_slides >= 1
# --------------------------------------------------------------------------- #
# Baseline resolution (_resolve_term).
# --------------------------------------------------------------------------- #
def test_resolve_term_completes_label_and_definition_from_baseline():
    """A bare term (label == key, empty definition) is filled in from the baseline."""
    baseline_key = "histograma_boxplot"
    bare_term = {"key": baseline_key, "label": baseline_key, "definition": ""}

    resolved_label, resolved_definition = _resolve_term(bare_term)

    # Both fields come from the canonical catalog entry for this key.
    assert resolved_label == _BASELINE_TERMS[baseline_key]["label"]
    assert "boxplot de Tukey" in resolved_definition
def test_resolve_term_keeps_own_definition_over_baseline():
    """A term with its own label and definition wins over the baseline entry."""
    custom_definition = "Definición propia que no debe pisarse."
    # "pagina_categorica" is one of the keys the fixture collector expects the
    # baseline to complete, so this checks that the caller's values are kept.
    own_term = {
        "key": "pagina_categorica",
        "label": "Mi etiqueta",
        "definition": custom_definition,
    }

    resolved_label, resolved_definition = _resolve_term(own_term)

    assert resolved_label == "Mi etiqueta"
    assert resolved_definition == custom_definition
def test_resolve_term_unknown_key_returns_as_is():
    """A term under the key ``sin_baseline`` comes back with its own label and text."""
    free_term = {
        "key": "sin_baseline",
        "label": "Término libre",
        "definition": "Texto.",
    }

    resolved = _resolve_term(free_term)
    resolved_label, resolved_definition = resolved

    assert resolved_label == "Término libre"
    assert resolved_definition == "Texto."
# --------------------------------------------------------------------------- #
# Edges / degradation — the chapter disappears instead of raising.
# --------------------------------------------------------------------------- #
def test_none_when_no_glossary():
    """An empty ctx and a ``None`` ctx both make the chapter disappear."""
    for ctx in ({}, None):
        built = build_glosario({"table": "x"}, ctx)
        assert built is None
def test_none_when_empty_collector():
    """A collector with no registered terms yields no chapter."""
    empty_collector = GlossaryCollector()
    built = build_glosario({"table": "x"}, {"glossary": empty_collector})
    assert built is None
def test_none_when_glossary_is_not_a_collector():
    """A list or a plain dict under ``ctx['glossary']`` is not taken for a collector."""
    stray_values = (
        ["not", "a", "collector"],
        {"entropia": "x"},
    )
    for stray in stray_values:
        built = build_glosario({"table": "x"}, {"glossary": stray})
        assert built is None
def test_none_profile_does_not_raise():
    """A ``None`` profile neither blocks a valid build nor crashes an empty one."""
    collector = GlossaryCollector()
    collector.add("entropia", "Entropía", "def")

    # With a populated collector the chapter is still built.
    built = build_glosario(None, {"glossary": collector})
    assert isinstance(built, Chapter)

    # With no ctx at all the result is None rather than an exception.
    nothing = build_glosario(None, None)
    assert nothing is None
# --------------------------------------------------------------------------- #
# Click target — each entry carries its registration key.
# --------------------------------------------------------------------------- #
def test_entries_carry_registered_key_as_click_target():
    """The set of emitted entry keys equals the set of registered keys."""
    built = build_glosario({}, {"glossary": _collector_three_terms()})
    expected_keys = {"entropia", "pagina_categorica", "histograma_boxplot"}
    emitted_keys = {entry.key for entry in _entries(built)}
    assert emitted_keys == expected_keys