# fn_registry/python/functions/datascience/automatic_eda/chapters/overview.py

"""Overview chapter — df.head, column dictionary and describe (reference).

Second reference chapter for AutomaticEDA. Renders (across as many pages/slides
as needed, the renderers paginate):

1. ``df.head`` — the first rows of the table. The current ``TableProfile`` does
   NOT carry the raw head, so this is read from ``ctx['head_rows']`` /
   ``profile['head_rows']`` (a list of row dicts). When absent the chapter shows
   an honest placeholder documenting the missing key instead of inventing data.
2. Column dictionary — name / type / nulls / non-null examples. Examples come
   from ``columns[i]['examples']`` when present; otherwise they are derived from
   real non-null profile values (categorical top values, numeric min/median/max)
   so the cell is never empty nor fabricated.
3. ``df.describe`` — mean / median / min / max / std for every numeric column.

Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""

from __future__ import annotations

from .. import model

# Chapter metadata: build_overview passes these to model.Chapter as its
# version, id and title.
CHAPTER_VERSION = "1.1.0"
CHAPTER_ID = "overview"
CHAPTER_TITLE = "Overview"

# Optional keys the calculation phase must supply for a full head table and
# explicit examples. When they are absent the chapter shows a placeholder note
# (head) or derives examples from other profile values (examples).
HEAD_KEY = "head_rows"          # list[dict] of df.head(n) rows; looked up in ctx first, then in profile
EXAMPLES_KEY = "examples"       # per column: list of non-null sample values


def _fmt_num(value, decimals: int = 3) -> str:
    """Format a scalar for display in a table cell.

    Args:
        value: The value to render. ``None`` becomes an em dash, booleans are
            rendered with ``str``, integers get ``.`` as thousands separator,
            floats are rendered in fixed-point notation and anything else
            falls back to ``str(value)``.
        decimals: Maximum number of decimal places for floats; trailing
            zeros in the fractional part are dropped.

    Returns:
        The formatted text. NaN is rendered as ``"NaN"`` and infinities as
        ``"inf"`` / ``"-inf"``.
    """
    if value is None:
        return "—"
    # bool is a subclass of int, so it must be handled before the int branch.
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, int):
        return f"{value:,}".replace(",", ".")
    if isinstance(value, float):
        if value != value:  # NaN is the only float that differs from itself
            return "NaN"
        if value in (float("inf"), float("-inf")):
            return str(value)
        text = f"{value:.{decimals}f}"
        # Only strip zeros that belong to a fractional part. Without this
        # guard, decimals=0 turns "100" into "1".
        if "." in text:
            text = text.rstrip("0").rstrip(".")
        # Values that round to zero (e.g. -0.0001 or -0.0) must not be shown
        # as "-0".
        if text in ("", "-", "-0"):
            return "0"
        return text
    return str(value)


def _fmt_pct(value, decimals: int = 1) -> str:
    """Render a fraction as a percentage string.

    ``value`` is converted with ``float`` and multiplied by 100, so ``0.5``
    becomes ``"50.0%"``. ``None`` yields an em dash; a value that cannot be
    converted or formatted is returned through ``str`` unchanged.
    """
    if value is None:
        return "—"
    try:
        scaled = float(value) * 100
        rendered = format(scaled, f".{decimals}f")
    except (TypeError, ValueError):
        # Not numeric (or unusable precision): show the raw value instead.
        return str(value)
    else:
        return rendered + "%"


def _examples_for(col: dict) -> str:
    """Return up to a handful of example values for a column as one string.

    Sources are tried in order and the first that yields something wins:

    1. the column's explicit ``EXAMPLES_KEY`` list/tuple (first four items);
    2. the ``value`` of the first four ``categorical['top']`` entries;
    3. the numeric ``min`` / ``median`` / ``max`` that are not ``None``.

    An em dash is returned when none of them produces a value.
    """
    provided = col.get(EXAMPLES_KEY)
    if isinstance(provided, (list, tuple)) and provided:
        return ", ".join(map(model._safe_str, provided[:4]))

    # Fall back to the most frequent categorical values, skipping entries
    # that are not dicts or whose rendered value is empty.
    top_entries = (col.get("categorical") or {}).get("top") or []
    labels = []
    for entry in top_entries[:4]:
        if not isinstance(entry, dict):
            continue
        label = model._safe_str(entry.get("value"))
        if label:
            labels.append(label)
    if labels:
        return ", ".join(labels)

    # Last resort: summary statistics that are actually present.
    stats = col.get("numeric") or {}
    present = (stats.get(key) for key in ("min", "median", "max"))
    pieces = [_fmt_num(stat) for stat in present if stat is not None]
    if pieces:
        return ", ".join(pieces)

    return "—"


def _head_block(profile: dict, ctx: dict):
    """Return a DataTable for df.head, or a Note documenting the missing key.

    The head rows are read from ``ctx[HEAD_KEY]`` and, when that is missing
    or empty, from ``profile[HEAD_KEY]``. They are accepted only when they
    are a non-empty list whose first item is a dict; at most the first ten
    rows are rendered.

    Args:
        profile: Table profile; ``columns`` gives the column order and
            ``n_rows`` (when an int) the dataset size quoted in the note.
        ctx: Build context, checked first for the head rows.

    Returns:
        A ``model.DataTable`` with the head rows, or a ``model.Note``
        explaining which key is missing.
    """
    head = ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)
    if isinstance(head, list) and head and isinstance(head[0], dict):
        # Column order comes from the profile; the keys of the first row are
        # used only when the profile names no columns. Non-dict column
        # entries are skipped, as the other blocks of this chapter do.
        cols = [c.get("name") for c in (profile.get("columns") or [])
                if isinstance(c, dict) and c.get("name")]
        if not cols:
            cols = list(head[0].keys())
        # Only the first row was validated above, so skip any later item that
        # is not a dict instead of failing on ``.get``.
        rows = [[model._safe_str(r.get(c)) for c in cols]
                for r in head[:10] if isinstance(r, dict)]
        # Honest note: how many rows are shown and, when known, out of how many
        # rows the dataset has (so "primeras 10 filas de 891" gives context).
        note = f"primeras {len(rows)} filas"
        n_rows = profile.get("n_rows")
        # bool is an int subclass; exclude it so True/False is never a count.
        if isinstance(n_rows, int) and not isinstance(n_rows, bool) \
                and n_rows > len(rows):
            note += f" de {n_rows:,}".replace(",", ".")
        return model.DataTable(header=cols, rows=rows, note=note)
    return model.Note(
        "df.head no disponible: el TableProfile no incluye 'head_rows'. La fase "
        "de cálculo debe añadir profile['head_rows'] (lista de dicts fila) o "
        "pasarlo en ctx['head_rows'] para mostrar las primeras filas.")

def _columns_block(profile: dict):
    """Build the column-dictionary table, or return None without columns.

    Each dict entry of ``profile['columns']`` becomes one row with the
    column name, its type (inferred, else physical, optionally followed by
    the semantic type in parentheses), its null share/count and example
    values. Entries that are not dicts are ignored.
    """
    table_rows = []
    for col in profile.get("columns") or []:
        if not isinstance(col, dict):
            continue

        kind = col.get("inferred_type") or col.get("physical_type") or "—"
        semantic = col.get("semantic_type")
        if semantic:
            kind = f"{kind} ({semantic})"

        # Prefer the percentage, add the raw count in parentheses when both
        # are known, and fall back to the count alone or an em dash.
        pct = col.get("null_pct")
        count = col.get("null_count")
        if pct is None:
            null_cell = "—" if count is None else str(count)
        else:
            null_cell = _fmt_pct(pct)
            if count is not None:
                null_cell = f"{null_cell} ({count})"

        table_rows.append([
            col.get("name") or "(col)",
            kind,
            null_cell,
            _examples_for(col),
        ])

    if table_rows:
        return model.DataTable(
            header=["Columna", "Tipo", "Nulos", "Ejemplos (no nulos)"],
            rows=table_rows,
            title="Columnas",
        )
    return None


def _describe_block(profile: dict):
    """Build the numeric summary table, or return None when it would be empty.

    A row is produced for every dict column whose ``inferred_type`` is
    ``"numeric"`` and whose ``numeric`` stats are non-empty; the row holds
    the column name followed by mean, median, min, max and std.
    """
    stat_names = ("mean", "median", "min", "max", "std")
    summary_rows = []
    for col in profile.get("columns") or []:
        is_numeric = (isinstance(col, dict)
                      and col.get("inferred_type") == "numeric")
        stats = (col.get("numeric") or {}) if is_numeric else {}
        if stats:
            label = col.get("name") or "(col)"
            summary_rows.append(
                [label, *(_fmt_num(stats.get(stat)) for stat in stat_names)])

    if summary_rows:
        return model.DataTable(header=["Columna", *stat_names],
                               rows=summary_rows,
                               title="Estadística (describe)")
    return None


def build_overview(profile: dict, ctx: dict):
    """Assemble the Overview chapter from the profile and build context.

    Returns None only when the profile has no columns and neither ``ctx``
    nor the profile carries head rows. Otherwise the chapter always opens
    with the head section (table or placeholder note), followed by the
    column dictionary and the numeric summary when those have content.
    """
    profile = profile or {}
    ctx = ctx or {}
    if not (profile.get("columns") or []
            or ctx.get(HEAD_KEY) or profile.get(HEAD_KEY)):
        return None

    blocks = [
        model.Heading(text="Primeras filas (df.head)", level=2),
        _head_block(profile, ctx),
    ]

    # Sections that are dropped, heading included, when they have no rows.
    optional_sections = (
        ("Diccionario de columnas", _columns_block),
        ("Resumen estadístico numérico", _describe_block),
    )
    for heading_text, make_block in optional_sections:
        section = make_block(profile)
        if section is not None:
            blocks.append(model.Heading(text=heading_text, level=2))
            blocks.append(section)

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)