# fn_registry/python/functions/datascience/missingness_rank_bar_figure.py

"""Impure EDA helper: ranked bar figure of missing-value share (`eda` group).

Builds a horizontal bar chart ranking the columns of a dataset by their
percentage of missing values (0-100), largest at the top, each bar labelled with
its ``NN.N%`` at the end. Returns a ready-to-rasterize
``matplotlib.figure.Figure``; it never shows nor saves it.

Impure because it touches matplotlib's rendering machinery. It uses the headless
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
global state and is safe to call repeatedly from a report renderer.
"""

import matplotlib

matplotlib.use("Agg")

from matplotlib.figure import Figure  # noqa: E402

# Muted gray for secondary text (no-data / fallback messages).
_MUTED_TEXT = "#5f6b7a"
# Soft red for the error fallback message.
_ERROR_TEXT = "#b00020"
# Bar fill — a calm blue that reads well on white at report size.
_BAR_COLOR = "#4C72B0"


def _truncate(text, width: int = 22) -> str:
    """Truncate ``text`` to ``width`` chars, appending an ellipsis if cut."""
    s = "" if text is None else str(text)
    if len(s) <= width:
        return s
    if width <= 1:
        return s[:width]
    return s[: width - 1] + "…"


def _message_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
    """Build a placeholder ``Figure`` that shows only ``message``.

    The figure holds one Axes with its axis decorations switched off and a
    single wrapped text item anchored at the center of the Axes (position
    ``(0.5, 0.5)`` in axes-fraction coordinates).

    Args:
        message: Text to display.
        color: Text color; defaults to the muted gray used for secondary text.

    Returns:
        The new ``Figure``. It is neither shown nor saved here.
    """
    figure = Figure(figsize=(6.4, 4.0), dpi=150)
    axes = figure.add_subplot(111)
    axes.axis("off")

    # Centered both horizontally and vertically, in axes-relative coordinates.
    text_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 12,
        "color": color,
        "wrap": True,
        "transform": axes.transAxes,
    }
    axes.text(0.5, 0.5, message, **text_style)

    figure.tight_layout()
    return figure


def _coerce_pct(value) -> float:
    """Coerce one raw percentage into a finite float inside ``[0, 100]``.

    Anything that cannot be turned into a finite float (``None``, non-numeric
    strings, NaN, +/-infinity, ints too large for a float) becomes ``0.0``;
    finite values are clamped into ``[0, 100]``.
    """
    import math  # Function-scope import: leaves the module import block as-is.

    try:
        pct = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: ``float()`` on an int too large to represent.
        return 0.0
    if not math.isfinite(pct):
        return 0.0
    return min(100.0, max(0.0, pct))


def missingness_rank_bar_figure(
    names,
    pcts,
    title: str = "% de valores faltantes por columna",
) -> "matplotlib.figure.Figure":
    """Build a horizontal ranked bar figure of missing-value share per column.

    Pairs each column name with its missing percentage, sorts by percentage
    descending and draws horizontal bars with the largest at the top. The X axis
    is pinned to ``[0, 100]`` so bars are comparable across reports, each bar is
    annotated with its ``NN.N%`` at the end, and the Y tick labels are truncated
    to 22 chars.

    The function is defensive: empty/mismatched/non-numeric input never raises.
    When there is nothing to draw it returns a ``Figure`` carrying a centered
    "sin datos faltantes" message, and any unexpected error while building the
    chart is caught and turned into a fallback ``Figure`` carrying the error
    text.

    Args:
        names: ``list`` or ``tuple`` of column names. May be empty. Items are
            stringified and truncated for display; the originals are not
            mutated.
        pcts: ``list`` or ``tuple`` parallel to ``names`` of missing-value
            percentages in ``[0, 100]``. Values that are not representable as
            a finite float (``None``, non-numeric, NaN, infinities, oversized
            ints) are coerced to ``0.0``; finite values are clamped into
            ``[0, 100]`` so every bar and label fits the pinned axis. Only the
            first ``min(len(names), len(pcts))`` items are used, so a length
            mismatch never crashes.
        title: Figure title, truncated to 60 chars. A falsy title draws no
            title. Default "% de valores faltantes por columna".

    Returns:
        A ``matplotlib.figure.Figure`` with a single horizontal-bar Axes (or a
        message-only figure, see above). The caller is responsible for
        rasterizing/closing it.
    """
    try:
        if (
            not isinstance(names, (list, tuple))
            or not isinstance(pcts, (list, tuple))
            or not names
            or not pcts
        ):
            return _message_figure("sin datos faltantes")

        # Pair names with sanitized percentages; ``zip`` stops at the shorter
        # input. Both inputs are non-empty here, so at least one pair exists.
        pairs = [(name, _coerce_pct(pct)) for name, pct in zip(names, pcts)]

        # Sort by percentage descending (stable, so ties keep input order), then
        # reverse: barh draws bottom-up, so the largest must come last to end
        # up at the top.
        pairs.sort(key=lambda p: p[1], reverse=True)
        ordered = list(reversed(pairs))

        labels = [_truncate(name, 22) for name, _ in ordered]
        values = [val for _, val in ordered]
        y_pos = list(range(len(ordered)))

        # Height scales with the number of bars so dense reports stay readable,
        # bounded to [2.4, 14.0] inches.
        height = max(2.4, min(0.4 * len(ordered) + 1.2, 14.0))
        fig = Figure(figsize=(6.4, height), dpi=150)
        ax = fig.add_subplot(111)

        ax.barh(y_pos, values, color=_BAR_COLOR, edgecolor="white")
        ax.set_yticks(y_pos)
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlim(0, 100)
        ax.set_xlabel("% faltante", fontsize=9)

        # Annotate each bar with its percentage at the end of the bar. Near the
        # right edge the anchor is capped at 99 and the text is right-aligned
        # so the label does not run past the axis limit.
        for y, val in zip(y_pos, values):
            ax.text(
                min(val + 1.5, 99.0),
                y,
                f"{val:.1f}%",
                va="center",
                ha="left" if val < 90 else "right",
                fontsize=7,
                color="#202020",
            )

        if title:
            ax.set_title(_truncate(title, 60), fontsize=12, loc="left", pad=10)

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _message_figure(f"error al dibujar barras: {exc}", color=_ERROR_TEXT)