# fn_registry/python/functions/datascience/automatic_eda/chapters/glosario.py

"""Glossary chapter (GLOSARIO) — always the last chapter, clickable terms.

Renders one entry per glossary term that the other chapters registered during
the document build through ``ctx['glossary'].add(key, label, definition)`` (see
``GlossaryCollector`` in ``model.py``). Each entry is a clickable destination:
every in-text appearance a chapter marked with ``[[term:key]]texto[[/term]]``
becomes a real jump to its entry here — PDF link annotations (PyMuPDF) and PPTX
native slide jumps, both wired by the renderers.

Returns ``None`` when no term was registered (there is nothing to show), so the
chapter simply disappears from documents that did not mark any term.

Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""

from __future__ import annotations

import unicodedata

from .. import model

# Version of this chapter, in the "x.y.z" form the chapter contract asks for
# (see the module docstring); forwarded as ``version`` to ``model.Chapter``.
CHAPTER_VERSION = "1.1.1"
# Identifier forwarded as ``id`` to the ``model.Chapter`` built by this module.
CHAPTER_ID = "glosario"
# User-facing chapter title, forwarded as ``title`` to ``model.Chapter``.
CHAPTER_TITLE = "Glosario"

# Canonical catalog of cross-cutting glossary terms — the "how to read it"
# entries that do not belong to a single chapter.
#
# Shape: ``{term_key: {"label": <visible label>, "definition": <full text>}}``.
#
# A chapter only needs to *register* the term (``ctx['glossary'].add(key, label)``)
# and mark its in-text appearance with ``[[term:key]]…[[/term]]``. When the
# collector carries such a term without a definition, ``_resolve_term`` fills
# the definition in from this catalog; it also fills in the label when the
# collected label is blank or merely repeats the key.
#
# Keeping the prose in one place avoids repeating a long paragraph inline in
# every chapter that names the term (the explanations that used to live in the
# NUM DISTR and CAT DISTR intros now live here).
_BASELINE_TERMS = {
    # Reading guide for the per-column histogram and the boxplot drawn under it.
    "histograma_boxplot": {
        "label": "Cómo leer el histograma y el boxplot",
        "definition": (
            "Para cada columna numérica se muestra su histograma con tres líneas "
            "de referencia: la media (línea roja discontinua), la mediana (línea "
            "verde continua) y la banda ±1σ (zona sombreada que cubre una "
            "desviación estándar a cada lado de la media). Debajo, alineado al "
            "mismo eje horizontal, un boxplot de Tukey: la caja abarca del primer "
            "al tercer cuartil (P25–P75), la línea interior es la mediana y los "
            "bigotes llegan hasta 1,5·IQR; los puntos rojos señalan que hay "
            "valores más allá de las vallas (posibles atípicos). Comparar la media "
            "con la mediana revela la asimetría: si la media supera a la mediana la "
            "cola larga cae hacia los valores altos (asimetría a la derecha), y al "
            "revés hacia los bajos."),
    },
    # Layout guide for the one-page-per-column categorical section.
    "pagina_categorica": {
        "label": "Cómo se organiza cada página categórica",
        "definition": (
            "Cada columna categórica ocupa su propia página: muestra sus métricas "
            "de cardinalidad —incluida la entropía—, una nota que señala "
            "cardinalidad problemática (columnas que se comportan como "
            "identificador, con casi todos los valores distintos, o dominadas por "
            "una sola categoría), la tabla de las categorías más frecuentes (top-k, "
            "con su conteo y porcentaje) y un gráfico de barras de las categorías "
            "más comunes (top-k más una barra «Otros» que agrupa la cola). El total "
            "de filas del dataset se usa como referencia para interpretar los "
            "conteos."),
    },
}


def _resolve_term(term: dict) -> tuple:
    """Resolve the visible ``(label, definition)`` pair of a collected term.

    The ``key``, ``label`` and ``definition`` fields of ``term`` are read and
    passed through ``model._safe_str``. When the key has an entry in
    ``_BASELINE_TERMS``:

    * a blank (empty or whitespace-only) definition is replaced by the
      catalog definition;
    * a label that is blank or identical to the key is replaced by the
      catalog label, unless the catalog label is itself empty, in which case
      the collected label is kept.

    Terms without a catalog entry are returned exactly as collected.
    """
    term_key = model._safe_str(term.get("key"))
    shown_label = model._safe_str(term.get("label"))
    body = model._safe_str(term.get("definition"))

    canonical = _BASELINE_TERMS.get(term_key)
    if not canonical:
        # Not a cross-cutting term: nothing to complete.
        return shown_label, body

    if not body.strip():
        body = model._safe_str(canonical.get("definition"))

    # A label equal to the key is treated as a placeholder, not a real label.
    label_is_placeholder = not shown_label.strip() or shown_label == term_key
    if label_is_placeholder:
        shown_label = model._safe_str(canonical.get("label")) or shown_label

    return shown_label, body


def _label_sort_key(label) -> tuple:
    """Return an accent- and case-insensitive sort key for a glossary label.

    Sorting by plain ``str.lower()`` compares code points, which places any
    label starting with an accented letter ("Índice", "Ángulo", …) after
    every unaccented one, i.e. after "z". The key produced here decomposes
    the label (NFKD), drops the combining marks and case-folds the result, so
    accented letters sort next to their base letter. The case-folded original
    label is the second component, giving labels that differ only by accents
    a deterministic relative order.

    This is an approximation of dictionary order, not locale-aware collation
    (for instance "ñ" is ordered together with "n").
    """
    text = model._safe_str(label)
    decomposed = unicodedata.normalize("NFKD", text)
    base_letters = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch))
    return base_letters.casefold(), text.casefold()


def build_glosario(profile: dict, ctx: dict):
    """Build the glossary Chapter from the shared collector, or None if empty.

    Args:
        profile: Dataset profile; part of the chapter-builder contract but not
            read by this chapter.
        ctx: Build context. Its ``"glossary"`` entry is expected to be the
            ``model.GlossaryCollector`` the other chapters registered their
            terms into. ``None`` is treated as an empty context.

    Returns:
        A ``model.Chapter`` holding a heading, an introductory paragraph and
        one ``model.GlossaryEntry`` per collected term, ordered alphabetically
        by visible label (ignoring case and accents); or ``None`` when
        ``ctx`` has no ``GlossaryCollector`` or the collector is falsy.
    """
    ctx = ctx or {}
    glossary = ctx.get("glossary")
    if not isinstance(glossary, model.GlossaryCollector) or not glossary:
        return None

    blocks = [
        model.Heading(text="Glosario de términos", level=1),
        model.Markdown(text=(
            "Definición de los términos técnicos que aparecen en el informe. "
            "Cada término va resaltado en el texto y, al pulsarlo, salta a su "
            "definición en esta sección.")),
    ]
    # One clickable destination per term, alphabetically by *visible* label. The
    # baseline resolution must happen BEFORE sorting: a term registered bare (no
    # label) carries its key as label in the collector, so ordering by the
    # collector's label would place it by its key instead of by the human label
    # supplied by the baseline catalog. Resolve first, then sort by the final label.
    resolved = []
    for term in glossary.terms(by="order"):
        label, definition = _resolve_term(term)
        resolved.append((label, definition, model._safe_str(term.get("key"))))
    # The sort is stable, so labels with identical keys keep registration order.
    resolved.sort(key=lambda entry: _label_sort_key(entry[0]))
    blocks.extend(
        model.GlossaryEntry(key=key, label=label, definition=definition)
        for label, definition, key in resolved)

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)