fn_registry/python/functions/datascience/automatic_eda/model.py

"""AutomaticEDA document model — format-independent blocks and chapters.

This is the intermediate layer between *content* (what an EDA chapter wants to
say) and *output format* (PDF for mobile reading, PPTX for sharing). A document
is an ordered list of :class:`Chapter`. A chapter is ``{id, title, version,
blocks}``. A block is one of a small, closed set of presentation primitives
(heading, markdown, key/value table, data table, figure, image, caption, note,
keep-together group, glossary entry).

Neither renderer knows anything about the EDA profile: they only know how to lay
out blocks so that **nothing is ever cut** — long text wraps to whole lines,
long tables split by rows repeating the header, figures and images are scaled to
fit entirely. Each chapter declares its own ``version`` so every page/slide can
be stamped ``<Chapter> · v<version>`` and tracked in a manifest for continuous,
per-chapter improvement.

Reading is defensive throughout (the ``eda`` group "dict-no-throw" style): the
normalizers accept dataclass blocks *or* plain dicts, coerce anything unknown
into a readable :class:`Note` instead of raising, and the renderers degrade a
malformed block to text rather than crashing the whole document.
"""

from __future__ import annotations

import json
import os
from dataclasses import dataclass, field
from typing import Any, Callable, Optional

# Global engine version. Bump when the document model or a renderer changes in a
# way that affects output. Individual chapters carry their own CHAPTER_VERSION.
# Used as the default ``engine_version`` recorded by ``merge_manifest``.
ENGINE_VERSION = "1.0.0"
# Engine name written to the manifest's ``engine`` field by ``merge_manifest``.
ENGINE_NAME = "AutomaticEDA"


# --------------------------------------------------------------------------- #
# Block primitives. Each carries a stable ``kind`` string so renderers can
# dispatch by kind (works for dataclass instances and for plain dicts alike).
# --------------------------------------------------------------------------- #
@dataclass
class Heading:
    """Section title block.

    Attributes:
        text: The heading text.
        level: Heading depth, from 1 (largest) down to 3 (smallest).
        kind: Fixed dispatch tag ``"heading"``; not a constructor argument.
    """

    text: str = ""
    level: int = 1
    kind: str = field(init=False, default="heading")


@dataclass
class Markdown:
    """Free-text block written in a light markdown dialect.

    The supported subset is: ``#``/``##``/``###`` headings, ``-``/``*`` bullet
    lists, ``| a | b |`` tables (a run of consecutive pipe lines becomes a data
    table), blank lines as paragraph breaks, and ``**bold**`` inline markers
    (the markers are stripped, the text is kept). Anything outside the subset
    is rendered verbatim rather than dropped, and text is wrapped to whole
    lines so it is never cut mid-line.

    Attributes:
        text: The markdown source.
        kind: Fixed dispatch tag ``"markdown"``; not a constructor argument.
    """

    text: str = ""
    kind: str = field(init=False, default="markdown")


@dataclass
class KVTable:
    """Two-column key/value table block.

    Attributes:
        rows: List of ``(label, value)`` pairs, one per table row.
        title: Optional title for the table.
        kind: Fixed dispatch tag ``"kv_table"``; not a constructor argument.
    """

    rows: list = field(default_factory=list)
    title: Optional[str] = None
    kind: str = field(init=False, default="kv_table")


@dataclass
class DataTable:
    """Tabular block made of a header row plus data rows.

    When the table does not fit in the space left on the page/slide it is split
    by rows, **repeating the header** on every continuation. Long cell text
    wraps inside its column (the row grows taller), so no cell content is lost.

    Attributes:
        header: Column titles.
        rows: Data rows, each a list of cell values.
        title: Optional title for the table.
        note: Optional note attached to the table.
        kind: Fixed dispatch tag ``"data_table"``; not a constructor argument.
    """

    header: list = field(default_factory=list)
    rows: list = field(default_factory=list)  # list[list[Any]]
    title: Optional[str] = None
    note: Optional[str] = None
    kind: str = field(init=False, default="data_table")


@dataclass
class Figure:
    """Matplotlib figure block, scaled to fit entirely (never cropped).

    Supply either ``fig`` (an already-built ``matplotlib.figure.Figure``) or
    ``make`` (a zero-argument callable returning one). ``make`` is the lazy
    form: the figure is only built when the renderer needs it.

    Attributes:
        fig: A ready figure object, or None.
        make: Zero-argument factory returning a figure, or None.
        caption: Optional caption text.
        height_in: Optional hint for the target height on the page; renderers
            clamp it to the available space while preserving aspect ratio.
        kind: Fixed dispatch tag ``"figure"``; not a constructor argument.
    """

    fig: Any = None
    make: Optional[Callable[[], Any]] = None
    caption: Optional[str] = None
    height_in: Optional[float] = None
    kind: str = field(init=False, default="figure")


@dataclass
class Image:
    """Raster image block (PNG/JPG) referenced by path, scaled to fit entirely.

    Attributes:
        path: Path of the image file.
        caption: Optional caption text.
        height_in: Optional target-height hint.
        kind: Fixed dispatch tag ``"image"``; not a constructor argument.
    """

    path: str = ""
    caption: Optional[str] = None
    height_in: Optional[float] = None
    kind: str = field(init=False, default="image")


@dataclass
class Caption:
    """Small auxiliary text block rendered under a figure or table.

    Attributes:
        text: The caption text.
        kind: Fixed dispatch tag ``"caption"``; not a constructor argument.
    """

    text: str = ""
    kind: str = field(init=False, default="caption")


@dataclass
class Note:
    """Small auxiliary note block (italic).

    It doubles as the fallback block that unknown content is coerced into.

    Attributes:
        text: The note text.
        kind: Fixed dispatch tag ``"note"``; not a constructor argument.
    """

    text: str = ""
    kind: str = field(init=False, default="note")


@dataclass
class Group:
    """Keep-together container: its blocks render on the SAME page/slide.

    Renderers measure the whole group before drawing. If it does not fit in the
    remaining space, the group moves *whole* to the next page (PDF) or slide
    (PPTX), so a heading is never stranded away from the figure and text it
    introduces. A group taller than a full page starts on a fresh page and
    flows from there (honest degradation, never cut). Typical use is binding the
    ``Heading`` + ``Markdown`` + ``Figure`` of one idea together (see the
    DISTR NUM / AGREGACION chapters).

    Setting ``page_break_before`` additionally forces the group to *start* on a
    fresh page/slide (unless the current one is already empty), which lets a
    chapter give each unit its own page — e.g. one categorical column per page
    (see CAT DISTR). The flag is purely additive: the default False keeps the
    plain keep-together behaviour for every existing chapter.

    Attributes:
        blocks: Child blocks kept together.
        title: Optional title for the group.
        page_break_before: Force the group onto a fresh page/slide when True.
        kind: Fixed dispatch tag ``"group"``; not a constructor argument.
    """

    blocks: list = field(default_factory=list)
    title: Optional[str] = None
    page_break_before: bool = False
    kind: str = field(init=False, default="group")


@dataclass
class GlossaryEntry:
    """A single glossary term: a clickable destination at the document's end.

    It is rendered as the term ``label`` (heading) followed by its
    ``definition`` (markdown). Renderers register the entry's page/slide
    position as the link target, so each in-text appearance of the same ``key``
    becomes a real clickable jump (PDF link annotation via PyMuPDF; PPTX
    internal slide jump).

    Attributes:
        key: Identifier shared with the in-text appearances of the term.
        label: Visible name of the term.
        definition: Explanation of the term, as markdown.
        kind: Fixed dispatch tag ``"glossary_entry"``; not a constructor
            argument.
    """

    key: str = ""
    label: str = ""
    definition: str = ""
    kind: str = field(init=False, default="glossary_entry")


@dataclass
class Chapter:
    """One chapter of the document: identified, titled, versioned blocks.

    Attributes:
        id: Chapter identifier.
        title: Visible chapter title.
        version: Generation version of this chapter (defaults to ``"1.0.0"``).
        blocks: The chapter's blocks, in rendering order.
    """

    id: str = ""
    title: str = ""
    version: str = "1.0.0"
    blocks: list = field(default_factory=list)


# --------------------------------------------------------------------------- #
# Defensive normalizers — accept dataclasses OR plain dicts, never raise.
# --------------------------------------------------------------------------- #
# Registry mapping each block's ``kind`` tag to its dataclass. ``as_block``
# looks a dict's "kind" value up here to decide which block to build; a kind
# that is not listed makes the dict fall back to a Note.
_BLOCK_BY_KIND = {
    "heading": Heading,
    "markdown": Markdown,
    "kv_table": KVTable,
    "data_table": DataTable,
    "figure": Figure,
    "image": Image,
    "caption": Caption,
    "note": Note,
    "group": Group,
    "glossary_entry": GlossaryEntry,
}


def as_block(obj: Any):
    """Coerce a value into a block dataclass. Unknown values become a Note.

    Args:
        obj: A block dataclass instance, a plain dict carrying a ``"kind"``
            key plus that block's fields, or any other value.

    Returns:
        * the same instance when ``obj`` is already a block (a ``Group`` has
          its ``blocks`` normalized in place first);
        * a freshly built block when ``obj`` is a dict whose ``"kind"`` is a
          registered kind (extra keys are ignored);
        * a ``Note`` holding ``str(obj)`` for everything else, including dicts
          with an unknown/missing kind and dicts whose fields fail to coerce.
    """
    if isinstance(obj, (Heading, Markdown, KVTable, DataTable, Figure, Image,
                        Caption, Note, Group, GlossaryEntry)):
        if isinstance(obj, Group):
            # Children may still be raw dicts; normalize them in place.
            obj.blocks = as_blocks(obj.blocks)
        return obj
    if isinstance(obj, dict):
        # The whole dict branch is guarded, not just the constructors: the
        # registry lookup itself raises TypeError when ``kind`` is unhashable
        # (e.g. a list), and a malformed block must never raise.
        try:
            cls = _BLOCK_BY_KIND.get(obj.get("kind"))
            if cls is None:
                return Note(text=_safe_str(obj))
            # Build only with fields the dataclass accepts (ignore extras).
            if cls is Heading:
                # ``or 1`` maps a None/0/"" level to the default before int().
                return Heading(text=_safe_str(obj.get("text")),
                               level=int(obj.get("level", 1) or 1))
            if cls is Markdown:
                return Markdown(text=_safe_str(obj.get("text")))
            if cls is KVTable:
                return KVTable(rows=list(obj.get("rows") or []),
                               title=obj.get("title"))
            if cls is DataTable:
                return DataTable(header=list(obj.get("header") or []),
                                 rows=list(obj.get("rows") or []),
                                 title=obj.get("title"), note=obj.get("note"))
            if cls is Figure:
                return Figure(fig=obj.get("fig"), make=obj.get("make"),
                              caption=obj.get("caption"),
                              height_in=obj.get("height_in"))
            if cls is Image:
                return Image(path=_safe_str(obj.get("path")),
                             caption=obj.get("caption"),
                             height_in=obj.get("height_in"))
            if cls is Caption:
                return Caption(text=_safe_str(obj.get("text")))
            if cls is Note:
                return Note(text=_safe_str(obj.get("text")))
            if cls is Group:
                return Group(blocks=as_blocks(obj.get("blocks")),
                             title=obj.get("title"),
                             page_break_before=bool(
                                 obj.get("page_break_before", False)))
            if cls is GlossaryEntry:
                return GlossaryEntry(key=_safe_str(obj.get("key")),
                                     label=_safe_str(obj.get("label")),
                                     definition=_safe_str(obj.get("definition")))
        except Exception:  # noqa: BLE001 — never raise on a malformed block.
            return Note(text=_safe_str(obj))
    return Note(text=_safe_str(obj))


def as_blocks(seq: Any) -> list:
    """Turn an arbitrary value into a list of block dataclasses.

    ``None`` gives an empty list, a list/tuple is coerced element by element,
    and any other single value is treated as a one-element sequence.
    """
    if seq is None:
        return []
    items = seq if isinstance(seq, (list, tuple)) else (seq,)
    return [as_block(item) for item in items]


def as_chapter(obj: Any) -> Optional[Chapter]:
    """Coerce a Chapter or a dict into a Chapter; anything else gives None.

    A ``Chapter`` instance is returned as-is after its ``blocks`` are
    normalized in place. A dict is read through ``id``/``title``/``version``/
    ``blocks``: the title falls back to the id and the version to ``"1.0.0"``
    when empty.
    """
    if isinstance(obj, Chapter):
        obj.blocks = as_blocks(obj.blocks)
        return obj
    if not isinstance(obj, dict):
        # Covers None as well as every other unsupported type.
        return None
    ident = _safe_str(obj.get("id"))
    title = _safe_str(obj.get("title"))
    if not title:
        title = _safe_str(obj.get("id"))
    version = _safe_str(obj.get("version"))
    if not version:
        version = "1.0.0"
    blocks = as_blocks(obj.get("blocks"))
    return Chapter(id=ident, title=title, version=version, blocks=blocks)


def as_chapters(seq: Any) -> list:
    """Build a list of Chapters from a sequence, skipping uncoercible items.

    ``None`` and unsupported container types give an empty list; a lone
    ``Chapter`` is wrapped in a one-element list.
    """
    if seq is None:
        return []
    if isinstance(seq, Chapter):
        return [as_chapter(seq)]
    if isinstance(seq, (list, tuple)):
        coerced = (as_chapter(item) for item in seq)
        return [chapter for chapter in coerced if chapter is not None]
    return []


def _safe_str(v: Any) -> str:
    """str() that never raises and maps None to ''."""
    if v is None:
        return ""
    try:
        return str(v)
    except Exception:  # noqa: BLE001
        return ""


# --------------------------------------------------------------------------- #
# Glossary collector — chapters register the terms they use; the glosario
# chapter renders them at the end and the renderers wire the clickable links.
# --------------------------------------------------------------------------- #
class GlossaryCollector:
    """Collects the glossary terms that chapters register during a build.

    One instance is created by :func:`build_document` and handed to every
    chapter through ``ctx['glossary']``. A chapter declares a term it explains
    with ``add(key, label, definition)`` (e.g. ``"entropia"`` → "Entropía") and
    marks each in-text appearance with the inline span
    ``[[term:key]]texto visible[[/term]]`` (see ``text_layout.parse_inline_rich``).
    The ``glosario`` chapter reads ``terms()`` to emit one
    :class:`GlossaryEntry` per term, and the renderers turn every marked
    appearance into a click that jumps to that entry. The first registration of
    a key wins (idempotent); registering never raises.
    """

    def __init__(self):
        # ``_order`` remembers first-registration order; ``_terms`` maps the
        # normalized key to its ``{"key", "label", "definition"}`` record.
        self._order: list = []
        self._terms: dict = {}

    def add(self, key: Any, label: Any = None, definition: Any = "") -> str:
        """Register a term and return its normalized key ('' if invalid).

        The key is stringified and stripped; an empty result is rejected. A
        key that is already registered is left untouched. An empty label
        falls back to the key itself.
        """
        try:
            norm = _safe_str(key).strip()
            if norm and norm not in self._terms:
                visible = _safe_str(label).strip()
                self._terms[norm] = {
                    "key": norm,
                    "label": visible if visible else norm,
                    "definition": _safe_str(definition),
                }
                self._order.append(norm)
            return norm
        except Exception:  # noqa: BLE001 — collecting a term never breaks a build.
            return ""

    def has(self, key: Any) -> bool:
        """Tell whether the (normalized) key has been registered."""
        needle = _safe_str(key).strip()
        return needle in self._terms

    def get(self, key: Any) -> Optional[dict]:
        """Return the term record for the (normalized) key, or None."""
        needle = _safe_str(key).strip()
        return self._terms.get(needle)

    def terms(self, by: str = "label") -> list:
        """Return the registered terms as a list of dicts.

        ``by='order'`` keeps first-appearance order; any other value (the
        default is ``'label'``) sorts alphabetically, case-insensitively, by
        visible label.
        """
        if by == "order":
            return [self._terms[name] for name in self._order]

        def _label_key(term):
            return _safe_str(term.get("label")).lower()

        return sorted(self._terms.values(), key=_label_key)

    def __len__(self) -> int:
        """Number of registered terms."""
        return len(self._terms)

    def __bool__(self) -> bool:
        """True once at least one term has been registered."""
        return len(self._terms) > 0


# --------------------------------------------------------------------------- #
# Manifest — per-chapter versions and page/slide counts for tracking.
# --------------------------------------------------------------------------- #
def merge_manifest(manifest_path: str, renderer: str, chapters_meta: list,
                   generated_at: str,
                   engine_version: str = ENGINE_VERSION) -> dict:
    """Read-modify-write the AutomaticEDA manifest, merging one renderer's run.

    The manifest lives next to the outputs as ``automatic_eda_manifest.json``
    and records, per chapter, its version plus the page count (PDF) and slide
    count (PPTX). Calling either renderer creates or updates it. Never raises:
    on any error returns the in-memory manifest without writing.

    Chapter ids are stored as strings. JSON object keys are always strings, so
    keeping a non-string id as-is would miss the entry reloaded from disk on
    the next run and write a duplicate key.

    Args:
        manifest_path: path to the JSON manifest to create or update.
        renderer: "pdf" or "pptx" — selects which count key is written.
        chapters_meta: list of ``{"id", "version", "n_pages"|"n_slides"}``.
            Entries that are not dicts or have a falsy id are skipped.
        generated_at: ISO-ish timestamp string for this run.
        engine_version: AutomaticEDA engine version.

    Returns:
        The merged manifest dict (also written to disk on success).
    """
    data: dict = {}
    try:
        if manifest_path and os.path.exists(manifest_path):
            with open(manifest_path, "r", encoding="utf-8") as fh:
                loaded = json.load(fh)
            if isinstance(loaded, dict):
                data = loaded
    except Exception:  # noqa: BLE001 — a corrupt manifest is overwritten.
        data = {}

    data["engine"] = ENGINE_NAME
    data["engine_version"] = engine_version
    data["generated_at"] = generated_at
    chapters = data.get("chapters")
    if not isinstance(chapters, dict):
        chapters = {}
    # Any renderer other than "pptx" is counted in pages.
    count_key = "n_slides" if renderer == "pptx" else "n_pages"

    try:
        metas = list(chapters_meta or [])
    except Exception:  # noqa: BLE001 — a non-iterable meta merges nothing.
        metas = []

    for cm in metas:
        if not isinstance(cm, dict):
            continue
        raw_id = cm.get("id")
        if not raw_id:
            continue
        # Normalize to str: keeps the key hashable and equal to the key that
        # json.load gives back on the next run.
        cid = _safe_str(raw_id)
        if not cid:
            continue
        entry = chapters.get(cid)
        if not isinstance(entry, dict):
            entry = {}
        entry["version"] = cm.get("version") or entry.get("version") or "1.0.0"
        # Prefer this renderer's own count key, then accept the other one.
        entry[count_key] = cm.get(count_key, cm.get("n_pages", cm.get("n_slides")))
        chapters[cid] = entry
    data["chapters"] = chapters

    try:
        # Serialize BEFORE opening the file: open(..., "w") truncates, so a
        # serialization failure afterwards would destroy the existing manifest.
        payload = json.dumps(data, ensure_ascii=False, indent=2, default=str)
        parent = os.path.dirname(os.path.abspath(manifest_path))
        os.makedirs(parent, exist_ok=True)
        with open(manifest_path, "w", encoding="utf-8") as fh:
            fh.write(payload)
    except Exception:  # noqa: BLE001 — never raise from the manifest writer.
        # Deliberate best-effort: the caller still gets the merged dict.
        pass
    return data