Files
fn_registry/python/functions/datascience/automatic_eda/chapters/glosario.py
T
2026-07-03 00:48:43 +02:00

108 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Glossary chapter (GLOSARIO) — always the last chapter, clickable terms.

Renders one entry per glossary term that the other chapters registered during
the document build through ``ctx['glossary'].add(key, label, definition)`` (see
``GlossaryCollector`` in ``model.py``). Each entry is a clickable destination:
every in-text appearance a chapter marked with ``[[term:key]]text[[/term]]``
becomes a real jump to its entry here — PDF link annotations (PyMuPDF) and PPTX
native slide jumps, both wired by the renderers.

``build_glosario`` returns ``None`` when no term was registered (there is
nothing to show), so the chapter simply disappears from documents that did not
mark any term.

Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""
from __future__ import annotations

import unicodedata

from .. import model
# Version of this chapter, in the "x.y.z" form required by the chapter contract.
CHAPTER_VERSION = "1.1.1"
# Identifier passed as the ``id`` of the Chapter built by ``build_glosario``.
CHAPTER_ID = "glosario"
# Title passed as the ``title`` of the Chapter built by ``build_glosario``.
CHAPTER_TITLE = "Glosario"
# Canonical definitions for cross-cutting terms — the "how to read it" entries
# that do not belong to a single chapter. A chapter only needs to *register* the
# term (``ctx['glossary'].add(key, label)``) and mark its in-text appearance with
# ``[[term:key]]…[[/term]]``; this chapter supplies the full definition when the
# collector carries the term without one. Keeping the prose in a single place
# avoids repeating a long paragraph inline in every chapter that names the term
# (the explanation that used to open the NUM DISTR and CAT DISTR chapters now
# lives here).
#
# Shape: ``{term_key: {"label": visible title, "definition": full prose}}``.
# The strings are user-facing Spanish copy that ends up in the rendered report.
_BASELINE_TERMS = {
    "histograma_boxplot": {
        "label": "Cómo leer el histograma y el boxplot",
        "definition": (
            "Para cada columna numérica se muestra su histograma con tres líneas "
            "de referencia: la media (línea roja discontinua), la mediana (línea "
            "verde continua) y la banda ±1σ (zona sombreada que cubre una "
            "desviación estándar a cada lado de la media). Debajo, alineado al "
            "mismo eje horizontal, un boxplot de Tukey: la caja abarca del primer "
            # The quartile range needs its separator (en dash): the two
            # percentiles were previously fused into a single token.
            "al tercer cuartil (P25–P75), la línea interior es la mediana y los "
            "bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
            "valores más allá de las vallas (posibles atípicos). Comparar la media "
            "con la mediana revela la asimetría: si la media supera a la mediana la "
            "cola larga cae hacia los valores altos (asimetría a la derecha), y al "
            "revés hacia los bajos."),
    },
    "pagina_categorica": {
        "label": "Cómo se organiza cada página categórica",
        "definition": (
            "Cada columna categórica ocupa su propia página: muestra sus métricas "
            "de cardinalidad —incluida la entropía—, una nota que señala "
            "cardinalidad problemática (columnas que se comportan como "
            "identificador, con casi todos los valores distintos, o dominadas por "
            "una sola categoría), la tabla de las categorías más frecuentes (top-k, "
            "con su conteo y porcentaje) y un gráfico de barras de las categorías "
            "más comunes (top-k más una barra «Otros» que agrupa la cola). El total "
            "de filas del dataset se usa como referencia para interpretar los "
            "conteos."),
    },
}
def _resolve_term(term: dict) -> tuple:
    """Resolve the visible ``(label, definition)`` pair of a collected term.

    The values stored on *term* are used as they are unless the term's key has
    an entry in ``_BASELINE_TERMS``. In that case a blank definition is taken
    from the baseline, and a label that is blank or merely repeats the key is
    replaced by the baseline label (when the baseline provides a non-empty one).

    Args:
        term: mapping read through ``.get`` for ``"key"``, ``"label"`` and
            ``"definition"``.

    Returns:
        A ``(label, definition)`` tuple, each passed through ``model._safe_str``.
    """
    term_key = model._safe_str(term.get("key"))
    shown_label = model._safe_str(term.get("label"))
    text = model._safe_str(term.get("definition"))

    fallback = _BASELINE_TERMS.get(term_key)
    if not fallback:
        # Not a baseline term: nothing to complete.
        return shown_label, text

    if not text.strip():
        text = model._safe_str(fallback.get("definition"))

    # A label equal to the key is treated as "no label given".
    needs_label = not shown_label.strip() or shown_label == term_key
    if needs_label:
        shown_label = model._safe_str(fallback.get("label")) or shown_label

    return shown_label, text
def _label_sort_key(label: str) -> str:
    """Return a collation key that orders *label* the way a Spanish reader expects.

    Plain code-point order puts every accented initial (``Á``, ``Í``, ``Ñ``…)
    after ``z``. The key is case-folded and stripped of combining marks, so
    ``Índice`` collates with ``indice``, while ``ñ`` is kept as its own letter
    right after ``n``.
    """
    folded = unicodedata.normalize("NFC", label).casefold()
    # Swap "ñ" for "n" + the highest code point: it then sorts after every other
    # "n…" sequence and before "o", and its tilde survives the stripping below.
    folded = folded.replace("ñ", "n\U0010ffff")
    decomposed = unicodedata.normalize("NFKD", folded)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def build_glosario(profile: dict, ctx: dict):
    """Build the glossary Chapter from the shared collector, or None if empty.

    Args:
        profile: unused by this chapter; accepted because every chapter builder
            shares the ``build_<id>(profile, ctx)`` signature.
        ctx: build context; the collector is read from ``ctx["glossary"]``.
            ``None`` is tolerated and treated as an empty context.

    Returns:
        A ``model.Chapter`` holding a heading, an introductory paragraph and
        one ``model.GlossaryEntry`` per collected term, ordered alphabetically
        by visible label (case- and accent-insensitive). ``None`` when
        ``ctx["glossary"]`` is not a ``model.GlossaryCollector`` or the
        collector is falsy (presumably: no term was registered).
    """
    ctx = ctx or {}
    glossary = ctx.get("glossary")
    if not isinstance(glossary, model.GlossaryCollector) or not glossary:
        return None

    blocks = [
        model.Heading(text="Glosario de términos", level=1),
        model.Markdown(text=(
            "Definición de los términos técnicos que aparecen en el informe. "
            "Cada término va resaltado en el texto y, al pulsarlo, salta a su "
            "definición en esta sección.")),
    ]

    # One clickable destination per term, alphabetically by *visible* label. The
    # baseline resolution must happen BEFORE sorting: a term registered bare (no
    # label) carries its key as label in the collector, so ordering by the
    # collector's label would place it by its key instead of by the human label
    # supplied by the baseline catalog. Resolve first, then sort by the final
    # label.
    resolved = []
    for term in glossary.terms(by="order"):
        label, definition = _resolve_term(term)
        resolved.append((label, definition, model._safe_str(term.get("key"))))

    # list.sort is stable, so labels that collate equally keep the order in
    # which the collector returned them.
    resolved.sort(key=lambda entry: _label_sort_key(model._safe_str(entry[0])))

    blocks.extend(
        model.GlossaryEntry(key=key, label=label, definition=definition)
        for label, definition, key in resolved
    )
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)