fn_registry/python/functions/datascience/automatic_eda/text_layout.py

"""Shared text-measurement helpers for the AutomaticEDA renderers.

Both renderers flow content top-to-bottom and must know, *before* placing a
block, how much vertical space it will take — that is what guarantees nothing is
cut: a unit either fits in the remaining space or moves to the next page/slide
whole. Measuring proportional text exactly in matplotlib/pptx is impractical, so
we use a deterministic character-grid estimate (chars-per-line from an average
glyph width) which slightly over-estimates and is therefore safe: it never
claims something fits when it would overflow.

Wrapping is word-aware (``textwrap``) and additionally hard-splits any single
token longer than the line so a 200-character value still wraps instead of
overflowing — that is wrapping, not loss: every character is still rendered.
"""

from __future__ import annotations

import re
import textwrap

# Inline span markers: ``**bold**`` / ``__bold__`` (rendered bold) and
# `` `code` `` (markers removed, not styled). Matched non-greedily so the
# shortest balanced pair wins. ``.`` does not match a newline in this pattern,
# so a span never crosses a line break. Unbalanced leftovers found outside a
# matched span are stripped afterwards (see ``_strip_leftover_markers``) so the
# visible text of those plain fragments matches ``strip_inline_md``.
_INLINE_SPAN_RE = re.compile(r"(\*\*.+?\*\*|__.+?__|`.+?`)")

# Glossary term span: ``[[term:key]]visible text[[/term]]``. Group 1 is the
# ``key`` (letters, digits and underscores only); group 2 is the visible text,
# which may itself contain ``**bold**`` and, because of ``re.S``, newlines. The
# visible text is kept and tagged with ``key`` so the renderers can turn each
# appearance into a clickable jump to the glossary entry.
_TERM_SPAN_RE = re.compile(r"\[\[term:([A-Za-z0-9_]+)\]\](.*?)\[\[/term\]\]",
                           re.S)
# The opening marker on its own; used to remove unbalanced/leftover openers.
_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")


def avg_char_width_in(fontsize_pt: float) -> float:
    """Estimate the mean glyph advance, in inches, at ``fontsize_pt``.

    The estimate is half of the point size (a conservative mean advance width
    for proportional sans-serif fonts), converted from points to inches at
    72 points per inch.
    """
    points_per_inch = 72.0
    mean_advance_pt = 0.5 * fontsize_pt
    return mean_advance_pt / points_per_inch


def line_height_in(fontsize_pt: float, leading: float = 1.32) -> float:
    """Return the height of one text line in inches.

    The line height is the font size scaled by ``leading`` (the line-spacing
    multiplier), converted from points to inches at 72 points per inch.
    """
    line_height_pt = leading * fontsize_pt
    return line_height_pt / 72.0


def chars_per_line(width_in: float, fontsize_pt: float) -> int:
    """Return how many average-width glyphs fit across ``width_in`` inches.

    The glyph width comes from :func:`avg_char_width_in`. A non-positive glyph
    width (for example a zero font size) falls back to 80 columns, and the
    result is never smaller than 1 so callers always get a usable column count.
    """
    glyph_width = avg_char_width_in(fontsize_pt)
    if glyph_width <= 0:
        # No meaningful glyph width to divide by: use a fixed fallback.
        return 80
    fitting = int(width_in / glyph_width)
    return fitting if fitting > 1 else 1


def wrap(text: str, max_chars: int) -> list:
    """Break ``text`` into lines no longer than ``max_chars`` characters.

    Wrapping is word-aware; a token longer than the column is hard-split so it
    cannot overflow. Newlines already present in ``text`` act as hard breaks,
    and each blank (or whitespace-only) source line yields one empty output
    line. ``None`` is treated as empty text and any other value is converted
    with ``str``. A ``max_chars`` below 1 is clamped to 1. The result always
    has at least one entry so callers can still reserve a row.
    """
    width = 1 if max_chars < 1 else max_chars
    source = "" if text is None else str(text)

    # One reusable wrapper; hyphens are not treated as break opportunities.
    wrapper = textwrap.TextWrapper(
        width=width,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=True,
        drop_whitespace=True,
    )

    result: list = []
    for segment in source.split("\n"):
        # ``wrapper.wrap`` returns [] for empty/whitespace-only input; keep a
        # placeholder line so the hard break is still visible.
        result.extend(wrapper.wrap(segment) or [""])
    return result or [""]


def strip_inline_md(text: str) -> str:
    """Strip the supported inline markers from ``text``, keeping the content.

    Removed markers:

    * glossary term markers ``[[term:key]]`` / ``[[/term]]`` (balanced or
      leftover) — the visible inner text is kept;
    * ``**`` and ``__`` (bold) and the backtick (code).

    Single-character emphasis markers (``*em*`` / ``_em_``) are NOT markers
    here: a lone ``*`` or ``_`` is kept verbatim.

    ``None`` yields ``""``; any other value is converted with ``str`` first, so
    a falsy non-string such as ``0`` is rendered as ``"0"`` instead of being
    silently dropped (the same normalisation :func:`wrap` and
    :func:`parse_inline_bold` apply).
    """
    s = "" if text is None else str(text)
    if not s:
        return ""
    # Drop glossary term markers, keeping the visible inner text.
    s = _TERM_SPAN_RE.sub(lambda m: m.group(2), s)
    s = _TERM_OPEN_RE.sub("", s)      # leftover unbalanced open marker.
    s = s.replace("[[/term]]", "")    # leftover unbalanced close marker.
    # Order matters: two-character markers are removed before the backtick.
    for marker in ("**", "__", "`"):
        s = s.replace(marker, "")
    return s


def _strip_term_markers(s: str) -> str:
    """Delete every glossary term marker from ``s``, leaving the text intact.

    Both the opening ``[[term:key]]`` markers and the closing ``[[/term]]``
    markers are removed, whether or not they are paired.
    """
    without_openers = _TERM_OPEN_RE.sub("", s)
    return without_openers.replace("[[/term]]", "")


def _strip_leftover_markers(s: str) -> str:
    """Drop any unbalanced inline markers from a plain (non-span) fragment.

    Keeps the visible text identical to :func:`strip_inline_md` even when a
    ``**`` / ``__`` / `` ` `` has no matching closing marker.
    """
    for marker in ("**", "__", "`"):
        s = s.replace(marker, "")
    return s


def parse_inline_bold(text: str):
    """Split ``text`` into ordered ``(fragment, is_bold)`` pairs.

    * ``**...**`` and ``__...__`` spans become bold fragments, without their
      delimiters.
    * A backtick code span keeps its text, loses the backticks, and is not bold.
    * Text between spans is emitted non-bold after unbalanced ``**`` / ``__`` /
      backtick markers are removed from it.

    Neighbouring fragments of the same weight are merged and empty fragments
    are dropped. ``None`` or empty input returns ``[]``.

    NOTE(review): the text *inside* a matched span is emitted verbatim, so
    markers nested in a span (``**a__b**`` or a code span holding ``__x__``)
    stay visible here although :func:`strip_inline_md` would remove them. The
    concatenated fragments equal ``strip_inline_md(text)`` only when spans
    contain no further markers.
    """
    source = "" if text is None else str(text)
    if not source:
        return []
    fragments = []

    def _push(piece: str, bold: bool) -> None:
        if piece == "":
            return
        if fragments and fragments[-1][1] == bold:
            previous_text, _ = fragments[-1]
            fragments[-1] = (previous_text + piece, bold)
            return
        fragments.append((piece, bold))

    cursor = 0
    for match in _INLINE_SPAN_RE.finditer(source):
        start, end = match.span()
        if start > cursor:
            _push(_strip_leftover_markers(source[cursor:start]), False)
        token = match.group(0)
        if token.startswith("`"):
            # Code span: one-character delimiters, never bold.
            _push(token[1:-1], False)
        else:
            # ``**...**`` or ``__...__``: two-character delimiters, bold.
            _push(token[2:-2], True)
        cursor = end
    if cursor < len(source):
        _push(_strip_leftover_markers(source[cursor:]), False)
    return fragments


def _hard_split(word: str, max_chars: int):
    """Split a single long token into <= max_chars chunks (never loses chars)."""
    return [word[i:i + max_chars] for i in range(0, len(word), max_chars)] or [""]


def wrap_rich(text: str, max_chars: int):
    """Word-wrap ``text`` to ``max_chars`` while preserving inline bold spans.

    Returns ``list[list[(fragment, is_bold)]]`` — one inner list of styled
    fragments per output line; concatenating an inner list's fragment texts is
    the visible line. The styled fragments come from :func:`parse_inline_bold`.

    Only a space separates words and only ``"\\n"`` forces a line break. A word
    may be made of several differently styled pieces: in ``**Note**: value``
    the bold ``Note`` and the plain ``:`` touch, so they stay glued together
    and no space is invented between them. The separator space between two
    words is attached to the first piece of the following word.

    A word longer than ``max_chars`` starts on a fresh line and is hard-split,
    piece by piece, so no line exceeds ``max_chars``. A ``max_chars`` below 1
    is clamped to 1. Input with no visible words returns ``[[("", False)]]``.
    """
    if max_chars < 1:
        max_chars = 1
    spans = parse_inline_bold(text)
    if not spans:
        return [[("", False)]]

    # Tokenise into words. Each word is a list of (text, bold) pieces that are
    # NOT separated by whitespace in the source; ``None`` marks a forced break.
    words = []
    open_word = None  # word that the next piece would still be glued to
    for frag, bold in spans:
        for li, part in enumerate(frag.split("\n")):
            if li > 0:
                words.append(None)
                open_word = None
            for wi, chunk in enumerate(part.split(" ")):
                if wi > 0:
                    # A space was crossed: whatever follows starts a new word.
                    open_word = None
                if chunk == "":
                    continue
                if open_word is None:
                    open_word = []
                    words.append(open_word)
                open_word.append((chunk, bold))

    lines = []          # list[list[(seg, bold)]]
    cur = []            # styled segments of the line being built
    cur_len = 0         # visible length of ``cur``

    def _append(piece: str, bold: bool) -> None:
        # Add ``piece`` to the current line, merging same-weight neighbours.
        nonlocal cur_len
        if cur and cur[-1][1] == bold:
            cur[-1] = (cur[-1][0] + piece, bold)
        else:
            cur.append((piece, bold))
        cur_len += len(piece)

    def _flush() -> None:
        # Close the current line; an empty one still reserves a row.
        nonlocal cur, cur_len
        lines.append(cur or [("", False)])
        cur = []
        cur_len = 0

    for word in words:
        if word is None:  # forced newline
            _flush()
            continue
        word_len = sum(len(piece) for piece, _ in word)
        if word_len > max_chars:
            # Over-long word: start on a fresh line and fill whole lines.
            if cur:
                _flush()
            for piece, bold in word:
                while piece:
                    if cur_len >= max_chars:
                        _flush()
                    room = max_chars - cur_len
                    _append(piece[:room], bold)
                    piece = piece[room:]
            continue
        if cur and cur_len + 1 + word_len > max_chars:
            _flush()
        glue = " " if cur else ""
        for k, (piece, bold) in enumerate(word):
            _append(glue + piece if k == 0 else piece, bold)
    if cur:
        _flush()
    return lines or [[("", False)]]


def parse_inline_rich(text: str):
    """Split ``text`` into ordered ``(fragment, is_bold, term_key)`` triples.

    Glossary spans ``[[term:key]]visible[[/term]]`` are located first. The
    ``visible`` part of each span and the text between spans are then handed to
    :func:`parse_inline_bold` (after any remaining term markers are removed),
    so bold handling is the same as there. Fragments from inside a span carry
    its ``key``; all others carry ``None``.

    Empty fragments are dropped and neighbouring fragments with the same
    ``(is_bold, term_key)`` are merged. ``None`` or empty input returns ``[]``.
    """
    source = "" if text is None else str(text)
    if not source:
        return []
    fragments = []

    def _push(piece: str, bold: bool, term) -> None:
        if piece == "":
            return
        if fragments:
            last_text, last_bold, last_term = fragments[-1]
            if last_bold == bold and last_term == term:
                fragments[-1] = (last_text + piece, bold, term)
                return
        fragments.append((piece, bold, term))

    def _push_segment(segment: str, term) -> None:
        # Bold-parse a segment once stray term markers are gone.
        cleaned = _strip_term_markers(segment)
        for piece, bold in parse_inline_bold(cleaned):
            _push(piece, bold, term)

    cursor = 0
    for match in _TERM_SPAN_RE.finditer(source):
        start, end = match.span()
        if start > cursor:
            _push_segment(source[cursor:start], None)
        key, visible = match.group(1), match.group(2)
        _push_segment(visible, key)
        cursor = end
    if cursor < len(source):
        _push_segment(source[cursor:], None)
    return fragments


def wrap_rich_terms(text: str, max_chars: int):
    """Word-wrap ``text`` keeping the bold flag and glossary term key.

    Returns ``list[list[(fragment, is_bold, term_key)]]`` — one inner list per
    output line; concatenating an inner list's fragment texts is the visible
    line. The styled fragments come from :func:`parse_inline_rich`.

    Only a space separates words and only ``"\\n"`` forces a line break. A word
    may be made of several differently styled pieces: in
    ``[[term:k]]mean[[/term]],`` the term text and the comma touch, so they
    stay glued together and no space is invented between them. The separator
    space between two words is attached to the first piece of the following
    word and therefore carries that piece's bold flag and term key.

    A word longer than ``max_chars`` starts on a fresh line and is hard-split,
    piece by piece, so no line exceeds ``max_chars``. A ``max_chars`` below 1
    is clamped to 1. Input with no visible words returns
    ``[[("", False, None)]]``.
    """
    if max_chars < 1:
        max_chars = 1
    spans = parse_inline_rich(text)
    if not spans:
        return [[("", False, None)]]

    # Tokenise into words. Each word is a list of (text, bold, term) pieces
    # that are NOT separated by whitespace in the source; ``None`` marks a
    # forced line break.
    words = []
    open_word = None  # word that the next piece would still be glued to
    for frag, bold, term in spans:
        for li, part in enumerate(frag.split("\n")):
            if li > 0:
                words.append(None)
                open_word = None
            for wi, chunk in enumerate(part.split(" ")):
                if wi > 0:
                    # A space was crossed: whatever follows starts a new word.
                    open_word = None
                if chunk == "":
                    continue
                if open_word is None:
                    open_word = []
                    words.append(open_word)
                open_word.append((chunk, bold, term))

    lines = []
    cur = []            # styled segments of the line being built
    cur_len = 0         # visible length of ``cur``

    def _append(piece: str, bold: bool, term) -> None:
        # Add ``piece`` to the current line, merging same-style neighbours.
        nonlocal cur_len
        if cur and cur[-1][1] == bold and cur[-1][2] == term:
            cur[-1] = (cur[-1][0] + piece, bold, term)
        else:
            cur.append((piece, bold, term))
        cur_len += len(piece)

    def _flush() -> None:
        # Close the current line; an empty one still reserves a row.
        nonlocal cur, cur_len
        lines.append(cur or [("", False, None)])
        cur = []
        cur_len = 0

    for word in words:
        if word is None:  # forced newline
            _flush()
            continue
        word_len = sum(len(piece) for piece, _, _ in word)
        if word_len > max_chars:
            # Over-long word: start on a fresh line and fill whole lines.
            if cur:
                _flush()
            for piece, bold, term in word:
                while piece:
                    if cur_len >= max_chars:
                        _flush()
                    room = max_chars - cur_len
                    _append(piece[:room], bold, term)
                    piece = piece[room:]
            continue
        if cur and cur_len + 1 + word_len > max_chars:
            _flush()
        glue = " " if cur else ""
        for k, (piece, bold, term) in enumerate(word):
            _append(glue + piece if k == 0 else piece, bold, term)
    if cur:
        _flush()
    return lines or [[("", False, None)]]


def parse_md_table(lines: list):
    """Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None.

    Every line (after stripping surrounding whitespace) must start and end
    with ``|``; otherwise, or when ``lines`` is empty, ``None`` is returned.
    The first line is the header and the remaining lines are the body rows;
    each is a list of cell strings with surrounding whitespace removed.

    Exactly one delimiting pipe is removed from each end of a line, so empty
    cells at the edges are preserved even when written without spaces:
    ``||2|`` parses to ``["", "2"]`` and ``|1||`` to ``["1", ""]``.

    An optional markdown separator row (``|---|:--:|``) directly after the
    header is dropped.
    """
    separator_chars = set("-: ")
    cells_rows = []
    for ln in lines:
        s = ln.strip()
        if not (s.startswith("|") and s.endswith("|")):
            return None
        # Slice instead of ``strip("|")``: stripping would also swallow the
        # pipes that delimit an empty first/last cell and shift the columns.
        cells_rows.append([c.strip() for c in s[1:-1].split("|")])
    if not cells_rows:
        return None
    header, *body = cells_rows
    # Drop a markdown separator row (all cells are dashes/colons).
    if body and all(set(c) <= separator_chars and "-" in c for c in body[0]):
        body = body[1:]
    return header, body