# fn_registry/python/functions/datascience/automatic_eda/chapters/overview.py

"""Overview chapter — df.head, column dictionary and describe (reference).

Second reference chapter for AutomaticEDA. Renders (across as many pages/slides
as needed, the renderers paginate):

1. ``df.head`` — the first rows of the table. The current ``TableProfile`` does
   NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
   ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
   an honest placeholder documenting the missing key instead of inventing data.
2. Column dictionary — name / type / nulls / non-null examples plus, when the
   LLM layer ran, the business **description** and **unit** of each column so the
   reader knows at a glance what every column is and in which unit. Examples come
   from ``columns[i]['examples']`` when present; otherwise they are derived from
   real non-null profile values (categorical top values, numeric min/median/max)
   so the cell is never empty nor fabricated.
3. ``df.describe`` — mean / median / min / max / std for every numeric column,
   plus its **unit** (same LLM source) so the stats read in context.

The description/unit come from the ``llm`` block that ``eda_llm_insights`` (group
``eda``) already stored in the profile (``profile['llm']['dictionary']``, a list
of ``{"column","description","business_meaning","unit"}`` entries) — this chapter
only **consumes** it, matching by column name; it never calls the LLM nor
recomputes anything. When the block is absent (``run_llm`` did not run) those
cells degrade to ``"—"`` and the tables still render.

Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""

from __future__ import annotations

from .. import model

# Chapter identity. CHAPTER_VERSION follows the "x.y.z" form required by the
# chapter contract stated in the module docstring.
CHAPTER_VERSION = "1.2.0"
CHAPTER_ID = "overview"
CHAPTER_TITLE = "Overview"

# Keys this chapter looks up in the profile / ctx. All three are optional:
# a missing HEAD_KEY yields a placeholder note, a missing EXAMPLES_KEY makes
# the examples cell fall back to values derived from the column profile, and
# a missing LLM_KEY block makes the description/unit cells read "—".
HEAD_KEY = "head_rows"          # list[dict] — df.head(n)
EXAMPLES_KEY = "examples"       # per column: list of non-null sample values
LLM_KEY = "llm"                 # interpretive block from eda_llm_insights


def _llm_dict_index(profile: dict, ctx: dict) -> dict:
    """Index the LLM dictionary entries by column name.

    The LLM block is taken from ``profile[LLM_KEY]``; ``ctx[LLM_KEY]`` is only
    consulted when the profile value is not a dict. Inside that block the
    ``"dictionary"`` value must be a list or tuple; every dict entry whose
    ``"column"`` is not ``None`` is stored under the string form of that
    column name (a later entry for the same name replaces an earlier one).

    Returns:
        ``{column_name: entry}``; empty when there is no usable LLM block, so
        callers can render "—" for description/unit.
    """
    block = profile.get(LLM_KEY)
    if not isinstance(block, dict):
        block = ctx.get(LLM_KEY)
        if not isinstance(block, dict):
            return {}

    listing = block.get("dictionary")
    if not isinstance(listing, (list, tuple)):
        return {}

    # Pair each well-formed entry with its column name, then drop unnamed ones.
    named = ((item.get("column"), item)
             for item in listing if isinstance(item, dict))
    return {model._safe_str(column): item
            for column, item in named if column is not None}


def _llm_desc(entry) -> str:
    """Business description of a column from its LLM entry, or "—"."""
    if not isinstance(entry, dict):
        return "—"
    raw = entry.get("description") or entry.get("business_meaning")
    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
    return text or "—"


def _llm_unit(entry) -> str:
    """Unit of a column from its LLM entry, or "—"."""
    if not isinstance(entry, dict):
        return "—"
    raw = entry.get("unit")
    text = " ".join(model._safe_str(raw).split()) if raw is not None else ""
    return text or "—"


def _fmt_num(value, decimals: int = 3) -> str:
    if value is None:
        return "—"
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, int):
        return f"{value:,}".replace(",", ".")
    if isinstance(value, float):
        if value != value:  # NaN
            return "NaN"
        if value in (float("inf"), float("-inf")):
            return str(value)
        text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
        return text if text else "0"
    return str(value)


def _fmt_pct(value, decimals: int = 1) -> str:
    if value is None:
        return "—"
    try:
        return f"{float(value) * 100:.{decimals}f}%"
    except (TypeError, ValueError):
        return str(value)


def _examples_for(col: dict) -> str:
    """Return a short, comma-separated sample of values for one column.

    Sources are tried in order and the first one that yields text wins:

    1. ``col[EXAMPLES_KEY]`` — a non-empty list/tuple; its first four items.
    2. ``col["categorical"]["top"]`` — the ``"value"`` of the dict entries
       among the first four, skipping values that stringify to empty text.
    3. ``col["numeric"]`` — the ``min`` / ``median`` / ``max`` that are not
       ``None``, formatted with ``_fmt_num``.

    "—" is returned when none of them produces anything.
    """
    samples = col.get(EXAMPLES_KEY)
    if isinstance(samples, (list, tuple)) and samples:
        return ", ".join(model._safe_str(item) for item in samples[:4])

    top_values = (col.get("categorical") or {}).get("top") or []
    if top_values:
        labels = []
        for item in top_values[:4]:
            if not isinstance(item, dict):
                continue
            label = model._safe_str(item.get("value"))
            if label:
                labels.append(label)
        if labels:
            return ", ".join(labels)

    stats = col.get("numeric") or {}
    picked = (stats.get(key) for key in ("min", "median", "max"))
    parts = [_fmt_num(stat) for stat in picked if stat is not None]
    return ", ".join(parts) if parts else "—"


def _head_block(profile: dict, ctx: dict):
    """Return a DataTable for df.head, or a Note documenting the missing key.

    The head rows are read from ``ctx[HEAD_KEY]`` and, when that is missing or
    empty, from ``profile[HEAD_KEY]``. They are rendered only when the value
    is a non-empty list whose first item is a dict; at most the first ten
    rows are shown. Malformed (non-dict) column entries and rows are skipped
    instead of raising, matching how the other blocks of this chapter treat
    ``profile["columns"]``.

    Args:
        profile: The table profile (``columns``, ``n_rows``, optional head).
        ctx: Render context, which may carry the head rows.

    Returns:
        A ``model.DataTable`` with the head rows, or a ``model.Note``
        explaining that no head rows were provided.
    """
    head = ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)
    if isinstance(head, list) and head and isinstance(head[0], dict):
        # Column order comes from the profile; when the profile lists no
        # usable column names, fall back to the keys of the first row.
        cols = [c.get("name") for c in (profile.get("columns") or [])
                if isinstance(c, dict) and c.get("name")]
        if not cols:
            cols = list(head[0].keys())
        # Only the first row is validated above, so guard the remaining ones.
        rows = [[model._safe_str(r.get(c)) for c in cols]
                for r in head[:10] if isinstance(r, dict)]
        # Honest note: how many rows are shown and, when known, out of how many
        # rows the dataset has (so "primeras 10 filas de 891" gives context).
        note = f"primeras {len(rows)} filas"
        n_rows = profile.get("n_rows")
        # bool is an int subclass; exclude it so True/False is never a count.
        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
                and n_rows > len(rows):
            note += f" de {n_rows:,}".replace(",", ".")
        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
        "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")


def _columns_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
    # Descripción / Unidad come from the LLM dictionary (matched by column name);
    # they read "—" when run_llm did not run, so the table always renders.
    header = ["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)",
              "Descripción", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict):
            continue
        name = c.get("name") or "(col)"
        ctype = c.get("inferred_type") or c.get("physical_type") or "—"
        sem = c.get("semantic_type")
        if sem:
            ctype = f"{ctype} ({sem})"
        null_pct = c.get("null_pct")
        null_count = c.get("null_count")
        if null_pct is not None:
            nulls = _fmt_pct(null_pct)
            if null_count is not None:
                nulls += f" ({null_count})"
        elif null_count is not None:
            nulls = str(null_count)
        else:
            nulls = "—"
        entry = llm_index.get(model._safe_str(name))
        rows.append([name, ctype, nulls, _examples_for(c),
                     _llm_desc(entry), _llm_unit(entry)])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows, title="Columnas")


def _describe_block(profile: dict, llm_index: dict):
    cols = profile.get("columns") or []
    # "Unidad" (LLM source) lets the reader know in which unit each stat is.
    header = ["Columna", "mean", "median", "min", "max", "std", "Unidad"]
    rows = []
    for c in cols:
        if not isinstance(c, dict) or c.get("inferred_type") != "numeric":
            continue
        num = c.get("numeric") or {}
        if not num:
            continue
        name = c.get("name") or "(col)"
        entry = llm_index.get(model._safe_str(name))
        rows.append([
            name,
            _fmt_num(num.get("mean")),
            _fmt_num(num.get("median")),
            _fmt_num(num.get("min")),
            _fmt_num(num.get("max")),
            _fmt_num(num.get("std")),
            _llm_unit(entry),
        ])
    if not rows:
        return None
    return model.DataTable(header=header, rows=rows, title="Estadística (describe)")


def build_overview(profile: dict, ctx: dict):
    """Build the Overview chapter.

    ``None`` is accepted for either argument and treated as an empty dict.

    Returns:
        ``None`` when the profile lists no columns AND no head rows are
        available in ``ctx`` or ``profile``. Otherwise a ``model.Chapter``
        that always starts with the head section (table or placeholder note),
        followed by the column dictionary and the numeric summary whenever
        those tables have at least one row.
    """
    profile = profile or {}
    ctx = ctx or {}

    if not profile.get("columns"):
        has_head = bool(ctx.get(HEAD_KEY) or profile.get(HEAD_KEY))
        if not has_head:
            return None

    llm_index = _llm_dict_index(profile, ctx)

    content = [
        model.Heading(text="Primeras filas (df.head)", level=2),
        _head_block(profile, ctx),
    ]
    # Optional sections: each one is added, with its heading, only when its
    # builder produced a table.
    optional_sections = (
        ("Diccionario de columnas", _columns_block),
        ("Resumen estadístico numérico", _describe_block),
    )
    for heading_text, builder in optional_sections:
        table = builder(profile, llm_index)
        if table is not None:
            content.append(model.Heading(text=heading_text, level=2))
            content.append(table)

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=content)