# fn_registry/python/functions/datascience/automatic_eda/text_layout.py

"""Shared text-measurement helpers for the AutomaticEDA renderers.

Both renderers flow content top-to-bottom and must know, *before* placing a
block, how much vertical space it will take — that is what guarantees nothing is
cut: a unit either fits in the remaining space or moves to the next page/slide
whole. Measuring proportional text exactly in matplotlib/pptx is impractical, so
we use a deterministic character-grid estimate (chars-per-line from an average
glyph width) which slightly over-estimates and is therefore safe: it never
claims something fits when it would overflow.

Wrapping is word-aware (``textwrap``) and additionally hard-splits any single
token longer than the line so a 200-character value still wraps instead of
overflowing — that is wrapping, not loss: every character is still rendered.
"""

from __future__ import annotations

import re
import textwrap


def avg_char_width_in(fontsize_pt: float) -> float:
    """Estimate the mean glyph advance, in inches, at ``fontsize_pt``.

    The advance is taken as half the point size (the conservative mean this
    module assumes for proportional sans-serif faces) and then converted from
    points to inches at 72 points per inch.
    """
    advance_pt = 0.5 * fontsize_pt
    return advance_pt / 72.0


def line_height_in(fontsize_pt: float, leading: float = 1.32) -> float:
    """Return the height of one text line in inches.

    ``leading`` is the multiplier applied to the font size; the product is in
    points and is converted to inches at 72 points per inch.
    """
    height_pt = leading * fontsize_pt
    return height_pt / 72.0


def chars_per_line(width_in: float, fontsize_pt: float) -> int:
    """Return the number of average-width glyphs that fit in ``width_in``.

    The glyph width comes from ``avg_char_width_in(fontsize_pt)``. The quotient
    is truncated with ``int`` and the result is never less than 1. When the
    glyph width is zero or negative, a fixed fallback of 80 columns is
    returned instead of dividing.
    """
    glyph_width = avg_char_width_in(fontsize_pt)
    # A non-positive glyph width cannot be divided by meaningfully; use the
    # fixed 80-column fallback.
    if glyph_width <= 0:
        return 80
    fitted = int(width_in / glyph_width)
    # Always report at least one column so callers can wrap with the result.
    return fitted if fitted > 1 else 1


def wrap(text: str, max_chars: int) -> list:
    """Break ``text`` into display lines no longer than ``max_chars``.

    Each ``"\\n"`` in the input is a hard break, and every resulting segment is
    word-wrapped independently with ``textwrap``. Tokens longer than the column
    are split mid-token rather than left to overflow; hyphens are not treated
    as break points. Whitespace characters are normalized to spaces and
    whitespace at wrap boundaries is dropped by ``textwrap``.

    ``None`` is treated as the empty string and any other value is passed
    through ``str``. A ``max_chars`` below 1 is raised to 1. An empty or
    whitespace-only segment yields one ``""`` entry, so the result always has
    at least one line and callers can still reserve a row.
    """
    width = 1 if max_chars < 1 else max_chars
    source = str(text) if text is not None else ""
    wrapper = textwrap.TextWrapper(
        width=width,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=True,
        drop_whitespace=True,
    )
    lines: list = []
    # ``str.split`` always returns at least one segment, and each segment
    # contributes at least one entry, so ``lines`` is never empty on return.
    for segment in source.split("\n"):
        # ``TextWrapper.wrap`` returns [] for empty or all-whitespace input;
        # keep a blank row in that case.
        lines.extend(wrapper.wrap(segment) or [""])
    return lines


# One ``*emphasis*`` span: the opening asterisk must not follow a word
# character and must be followed by non-whitespace; the closing asterisk must
# follow non-whitespace and must not precede a word character. These guards
# leave literal asterisks such as ``a * b``, ``2*3*4`` or a trailing footnote
# ``*`` untouched.
_EM_SPAN_RE = re.compile(r"(?<!\w)\*(?=\S)([^*\n]+)(?<=\S)\*(?!\w)")


def strip_inline_md(text: str) -> str:
    """Strip a tiny subset of inline markdown markers, keeping the text.

    Removes ``**bold**`` / ``__bold__`` / ``*em*`` / `` `code` `` markers so the
    content is preserved without trying to style spans (which the line-grid
    layout cannot do).

    The ``**``, ``__`` and backtick markers are removed wherever they occur.
    Single asterisks are removed only when they form a matched pair tightly
    wrapping text on one line (``*like this*``), so an asterisk used as an
    operator or footnote mark is kept.

    ``None`` yields ``""``; any other value is converted with ``str`` first, so
    falsy values such as ``0`` are rendered rather than dropped.
    """
    if text is None:
        return ""
    s = str(text)
    # Multi-character markers go first so that ``***x***`` collapses to
    # ``*x*`` and is then handled by the emphasis pattern below.
    for marker in ("**", "__", "`"):
        s = s.replace(marker, "")
    return _EM_SPAN_RE.sub(r"\1", s)


def parse_md_table(lines: list):
    """Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None.

    Every line, after stripping surrounding whitespace, must start and end with
    a pipe; otherwise the input is not a pipe table and None is returned. None
    is also returned when ``lines`` is empty.

    Exactly one pipe is removed from each edge of a line before splitting, so
    empty first or last cells (``| 1 ||``) are kept as ``""`` and rows keep
    their column count. Cell text is whitespace-stripped.

    The first line is the header. If the line right after it is a markdown
    separator row (every cell made only of ``-``, ``:`` and spaces, with at
    least one ``-``), that row is discarded. Returns ``(header, rows)`` where
    ``header`` is a list of cell strings and ``rows`` is a list of such lists.
    """
    table = []
    for line in lines:
        row = line.strip()
        if not (row.startswith("|") and row.endswith("|")):
            return None
        # Slice off only the outermost pipes; ``str.strip("|")`` would also
        # swallow the delimiters of empty edge cells and shift the columns.
        table.append([cell.strip() for cell in row[1:-1].split("|")])
    if not table:
        return None
    header, *body = table
    # Drop a markdown separator row (all cells are dashes/colons).
    separator_chars = set("-: ")
    if body and all(set(c) <= separator_chars and "-" in c for c in body[0]):
        body = body[1:]
    return header, body