Files
fn_registry/python/functions/datascience/automatic_eda/text_layout.py
T
egutierrez d1a3d58a6b feat(eda): motor AutomaticEDA fase 4a — render fixes + keep-together + glosario clicable
Mejoras transversales del motor de render (no del contenido de capítulos):

1. Fix negrita pisa texto (PDF): _place_rich_lines mide el ancho REAL de cada
   span con las métricas de fuente del renderer (peso correcto) en vez del
   grid de ancho medio; negrita y normal en la misma línea ya no se solapan.
2. Zebra striping: filas pares sombreadas (#f6f8fa) en DataTable (PDF + PPTX),
   coherente al partir tablas largas (índice de fila lógico, no por página).
3. Keep-together: bloque Group nuevo; el renderer mide el grupo entero y lo
   mueve completo a la página/slide siguiente si no cabe, y encoge la figura
   (height_in) para dejar sitio a su título y texto. num_distr lo usa.
4. Caption siempre visible en toda figura PPTX (fallback al heading); la figura
   reserva el alto de su caption para que ambos quepan en el mismo slide.
5. Portada construida al final (con resumen agregado del análisis vía
   ctx['document_summary']) pero colocada primera por build_document.
6. Glosario: capítulo nuevo (último) + GlossaryCollector en ctx; los capítulos
   registran términos y marcan apariciones con [[term:key]]...[[/term]]. Links
   clicables reales: PDF (PyMuPDF, link GOTO) y PPTX (slide-jump nativo).
   Enganchado "entropía" en cat_distr como ejemplo end-to-end.

Funciones reutilizables delegadas a fn-constructor (tag eda):
- add_pdf_internal_links_py_datascience (PyMuPDF)
- pptx_link_run_to_slide_py_datascience (slide-jump)

Contrato docs/automatic_eda_contract.md actualizado (§1/§3/§5 + §11 nueva) con
la API de glosario, keep-together y zebra para la siguiente fase. PyMuPDF
declarado en pyproject. Suite verde (90 tests); golden titanic verificado.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 17:35:19 +02:00

375 lines
14 KiB
Python

"""Shared text-measurement helpers for the AutomaticEDA renderers.
Both renderers flow content top-to-bottom and must know, *before* placing a
block, how much vertical space it will take — that is what guarantees nothing is
cut: a unit either fits in the remaining space or moves to the next page/slide
whole. Measuring proportional text exactly in matplotlib/pptx is impractical, so
we use a deterministic character-grid estimate (chars-per-line from an average
glyph width) which slightly over-estimates and is therefore safe: it never
claims something fits when it would overflow.
Wrapping is word-aware (``textwrap``) and additionally hard-splits any single
token longer than the line so a 200-character value still wraps instead of
overflowing — that is wrapping, not loss: every character is still rendered.
"""
from __future__ import annotations
import re
import textwrap
# Inline span markers: ``**bold**`` / ``__bold__`` (rendered bold) and
# `` `code` `` (markers removed, not styled). Matched non-greedily so the
# shortest balanced pair wins. Unbalanced leftovers are stripped afterwards so
# the visible text matches ``strip_inline_md`` exactly.
_INLINE_SPAN_RE = re.compile(r"(\*\*.+?\*\*|__.+?__|`.+?`)")
# Glossary term span: ``[[term:key]]texto visible[[/term]]``. The visible text
# (which may itself contain ``**bold**``) is kept and tagged with ``key`` so the
# renderers can turn each appearance into a clickable jump to the glossary entry.
_TERM_SPAN_RE = re.compile(r"\[\[term:([A-Za-z0-9_]+)\]\](.*?)\[\[/term\]\]",
re.S)
_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")
def avg_char_width_in(fontsize_pt: float) -> float:
    """Estimate the mean glyph advance, in inches, of a sans-serif font.

    The estimate takes half of the point size as a conservative average
    advance width for proportional sans fonts and converts points to inches
    (72 points per inch).
    """
    points_per_inch = 72.0
    mean_advance_pt = 0.5 * fontsize_pt
    return mean_advance_pt / points_per_inch
def line_height_in(fontsize_pt: float, leading: float = 1.32) -> float:
    """Return the height of one text line in inches.

    ``leading`` is the line-spacing multiplier applied to the font size; the
    product (in points) is converted to inches at 72 points per inch.
    """
    height_pt = leading * fontsize_pt
    return height_pt / 72.0
def chars_per_line(width_in: float, fontsize_pt: float) -> int:
    """Return how many average-width glyphs fit in ``width_in`` inches.

    The glyph width comes from :func:`avg_char_width_in`. A non-positive glyph
    width (e.g. a zero font size) falls back to 80 columns, and the result is
    never smaller than 1 so callers always get a usable column count.
    """
    glyph_width = avg_char_width_in(fontsize_pt)
    if glyph_width <= 0:
        # Cannot divide by a non-positive width; use a fixed fallback column count.
        return 80
    return max(1, int(width_in / glyph_width))
def wrap(text: str, max_chars: int) -> list:
    """Word-wrap ``text`` into lines of at most ``max_chars`` characters.

    Newlines already present in the input act as hard breaks. Tokens longer
    than the column are hard-split instead of overflowing, and hyphens are not
    treated as break points. ``None`` is treated as the empty string and any
    other value is converted with ``str``. An empty (or whitespace-only) input
    line produces one empty output line so callers can still reserve a row.
    ``max_chars`` below 1 is clamped to 1.
    """
    width = max_chars if max_chars >= 1 else 1
    source = str(text) if text is not None else ""

    # One wrapper reused for every physical line; break_long_words guarantees
    # that no single token can exceed the column.
    wrapper = textwrap.TextWrapper(
        width=width,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=True,
        drop_whitespace=True,
    )

    result: list = []
    for physical_line in source.split("\n"):
        # The wrapper returns [] for empty/blank input; keep a placeholder row.
        result.extend(wrapper.wrap(physical_line) or [""])
    return result if result else [""]
def strip_inline_md(text: str) -> str:
    """Return ``text`` with the supported inline markers removed.

    Removed markers are the glossary term wrappers
    (``[[term:key]]`` ... ``[[/term]]``, balanced or not) and the inline
    markdown markers ``**``, ``__`` and the backtick. Only the markers are
    dropped; the text they enclose is kept. Single ``*`` / ``_`` characters are
    *not* markers and are left untouched.

    ``None`` yields the empty string. Any other value is converted with
    ``str`` — including falsy ones such as ``0`` — so the result agrees with
    :func:`wrap` and :func:`parse_inline_bold`, which render such values too.
    """
    if text is None:
        return ""
    s = str(text)
    # Drop glossary term markers, keeping the visible inner text.
    s = _TERM_SPAN_RE.sub(lambda m: m.group(2), s)
    s = _TERM_OPEN_RE.sub("", s)  # leftover unbalanced open marker.
    s = s.replace("[[/term]]", "")  # leftover unbalanced close marker.
    for marker in ("**", "__", "`"):
        s = s.replace(marker, "")
    return s
def _strip_term_markers(s: str) -> str:
    """Delete every glossary term marker from ``s``, keeping the enclosed text.

    Opening markers (``[[term:key]]``) are removed first, then every closing
    ``[[/term]]``; it does not matter whether the markers were balanced.
    """
    without_open = _TERM_OPEN_RE.sub("", s)
    return without_open.replace("[[/term]]", "")
def _strip_leftover_markers(s: str) -> str:
"""Drop any unbalanced inline markers from a plain (non-span) fragment.
Keeps the visible text identical to :func:`strip_inline_md` even when a
``**`` / ``__`` / `` ` `` has no matching closing marker.
"""
for marker in ("**", "__", "`"):
s = s.replace(marker, "")
return s
def parse_inline_bold(text: str):
    """Split ``text`` into ``[(fragment, is_bold), ...]`` preserving order.

    ``**...**`` and ``__...__`` spans become bold fragments; `` `code` `` keeps
    its text without the backticks and is not bold; everything else is emitted
    as a non-bold fragment. Markers are never part of a fragment: unbalanced
    markers outside a span, markers nested *inside* a span (e.g. the ``__`` in
    ``**a__b**``) and glossary term markers (``[[term:key]]`` / ``[[/term]]``)
    are all stripped, so the concatenation of the fragment texts matches
    :func:`strip_inline_md` of the input. Adjacent fragments of the same weight
    are merged and empty fragments are dropped.

    ``None`` and the empty string yield ``[]``; any other value is converted
    with ``str``.
    """
    s = "" if text is None else str(text)
    if not s:
        return []
    # Term markers are never visible text; remove them up front so this parser
    # agrees with strip_inline_md even when called on text that carries them.
    s = _strip_term_markers(s)
    out = []

    def _emit(fragment: str, bold: bool) -> None:
        # Skip empty pieces and merge with the previous fragment when the
        # weight is unchanged.
        if fragment == "":
            return
        if out and out[-1][1] == bold:
            out[-1] = (out[-1][0] + fragment, bold)
        else:
            out.append((fragment, bold))

    pos = 0
    for m in _INLINE_SPAN_RE.finditer(s):
        if m.start() > pos:
            _emit(_strip_leftover_markers(s[pos:m.start()]), False)
        tok = m.group(0)
        if tok[0] == "`":
            # `code`: one-character delimiters, not styled.
            inner, bold = tok[1:-1], False
        else:
            # **bold** or __bold__: two-character delimiters.
            inner, bold = tok[2:-2], True
        # The span body may itself contain other markers; strip them so no
        # marker character ever reaches the rendered text.
        _emit(_strip_leftover_markers(inner), bold)
        pos = m.end()
    if pos < len(s):
        _emit(_strip_leftover_markers(s[pos:]), False)
    return out
def _hard_split(word: str, max_chars: int):
"""Split a single long token into <= max_chars chunks (never loses chars)."""
return [word[i:i + max_chars] for i in range(0, len(word), max_chars)] or [""]
def _split_styled_words(spans):
    """Group styled spans into unbreakable words, marking hard line breaks.

    ``spans`` is a list of tuples ``(text, *style)``. The result is a list whose
    items are either ``None`` (a forced line break, one per ``"\\n"``) or a word:
    a non-empty list of ``(text, style_tuple)`` pieces. A word only ends at a
    space or a newline, so text that touches a style boundary without
    whitespace (``**Note**:``) stays one word made of several pieces instead of
    being separated by a spurious space. Runs of spaces collapse.
    """
    words = []
    pending = []  # pieces of the word currently being assembled

    def _close_word() -> None:
        if pending:
            words.append(list(pending))
            pending.clear()

    for text, *style in spans:
        style = tuple(style)
        for line_no, line in enumerate(text.split("\n")):
            if line_no > 0:
                _close_word()
                words.append(None)
            for chunk_no, chunk in enumerate(line.split(" ")):
                if chunk_no > 0:
                    # A space was crossed: whatever was pending is complete.
                    _close_word()
                if chunk:
                    pending.append((chunk, style))
    _close_word()
    return words


def _hard_split_styled(word, max_chars: int):
    """Cut an over-long styled word into chunks of at most ``max_chars`` chars.

    ``word`` is a list of ``(text, style)`` pieces; each returned chunk is again
    such a list. Pieces are sliced as needed and keep their style, and no
    character is lost. ``max_chars`` must be >= 1.
    """
    chunks = [[]]
    room = max_chars
    for text, style in word:
        while text:
            if room == 0:
                chunks.append([])
                room = max_chars
            head, text = text[:room], text[room:]
            chunks[-1].append((head, style))
            room -= len(head)
    return chunks


def _wrap_styled_spans(spans, max_chars: int, blank):
    """Word-wrap styled spans to ``max_chars`` columns.

    ``spans`` is a list of ``(text, *style)`` tuples and ``blank`` is the tuple
    used for an empty line. Returns one list of ``(text, *style)`` fragments per
    output line; adjacent fragments with equal style are merged and the single
    separating space between two words is attached to the following word's
    first fragment. Words longer than the column are hard-split. At least one
    line is always returned.
    """
    if max_chars < 1:
        max_chars = 1
    lines = []
    current = []  # words (lists of pieces) on the line being built
    current_len = 0

    def _flush() -> None:
        nonlocal current, current_len
        merged = []
        for k, word in enumerate(current):
            for j, (text, style) in enumerate(word):
                piece = " " + text if (k > 0 and j == 0) else text
                if merged and merged[-1][1] == style:
                    merged[-1] = (merged[-1][0] + piece, style)
                else:
                    merged.append((piece, style))
        lines.append([(text, *style) for text, style in merged] or [blank])
        current = []
        current_len = 0

    for word in _split_styled_words(spans):
        if word is None:  # forced newline (emits a blank line if nothing pending)
            _flush()
            continue
        word_len = sum(len(text) for text, _ in word)
        if word_len > max_chars:
            if current:
                _flush()
            chunks = _hard_split_styled(word, max_chars)
            # Every full chunk is a line of its own; the remainder starts the
            # next line so following words can still join it.
            for chunk in chunks[:-1]:
                current = [chunk]
                _flush()
            current = [chunks[-1]]
            current_len = sum(len(text) for text, _ in chunks[-1])
            continue
        needed = word_len if current_len == 0 else current_len + 1 + word_len
        if current_len != 0 and needed > max_chars:
            _flush()
            current = [word]
            current_len = word_len
        else:
            current.append(word)
            current_len = needed
    if current:
        _flush()
    return lines or [[blank]]


def wrap_rich(text: str, max_chars: int):
    """Word-wrap ``text`` to ``max_chars`` while preserving inline bold spans.

    Returns ``list[list[(fragment, is_bold)]]`` — one inner list of styled
    fragments per output line; concatenating an inner list's fragment texts is
    the visible line. Spans come from :func:`parse_inline_bold`. Wrapping is
    word-aware, honors ``"\\n"`` as a hard break and hard-splits over-long
    tokens, so no line exceeds ``max_chars``. Words are delimited by whitespace
    only: a bold boundary inside a word (``**Note**:``, ``(**x**)``) does not
    introduce a space. Empty input yields ``[[("", False)]]``.
    """
    return _wrap_styled_spans(parse_inline_bold(text), max_chars, ("", False))


def parse_inline_rich(text: str):
    """Split ``text`` into ``[(fragment, is_bold, term_key), ...]``.

    Extends :func:`parse_inline_bold` with glossary term spans
    ``[[term:key]]visible[[/term]]``: the inner ``visible`` text is parsed for
    bold as usual and every resulting fragment carries ``term_key`` so the
    renderers can make it clickable. Text outside a term span gets
    ``term_key = None``. Unbalanced term markers are stripped. Adjacent
    fragments with the same (bold, term) are merged and empty fragments are
    dropped. ``None`` and the empty string yield ``[]``.
    """
    s = "" if text is None else str(text)
    if not s:
        return []
    out = []

    def _emit(fragment: str, bold: bool, term) -> None:
        if fragment == "":
            return
        if out and out[-1][1] == bold and out[-1][2] == term:
            out[-1] = (out[-1][0] + fragment, bold, term)
        else:
            out.append((fragment, bold, term))

    def _emit_bolded(segment: str, term) -> None:
        # Reuse the bold parser on a term-marker-free segment.
        for frag, bold in parse_inline_bold(_strip_term_markers(segment)):
            _emit(frag, bold, term)

    pos = 0
    for m in _TERM_SPAN_RE.finditer(s):
        if m.start() > pos:
            _emit_bolded(s[pos:m.start()], None)
        _emit_bolded(m.group(2), m.group(1))
        pos = m.end()
    if pos < len(s):
        _emit_bolded(s[pos:], None)
    return out


def wrap_rich_terms(text: str, max_chars: int):
    """Like :func:`wrap_rich` but preserving glossary term keys per fragment.

    Returns ``list[list[(fragment, is_bold, term_key)]]`` — one inner list per
    output line, with spans taken from :func:`parse_inline_rich`. Wrapping is
    word-aware, honors ``"\\n"`` as a hard break and hard-splits over-long
    tokens so no line exceeds ``max_chars``. Words are delimited by whitespace
    only, so punctuation glued to a term or bold span (``[[term:k]]x[[/term]].``)
    stays attached to it. Empty input yields ``[[("", False, None)]]``.
    """
    return _wrap_styled_spans(
        parse_inline_rich(text), max_chars, ("", False, None)
    )
def parse_md_table(lines: list):
    """Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None.

    Every line (after trimming surrounding whitespace) must start and end with
    ``|``; otherwise, or when ``lines`` is empty, ``None`` is returned. Exactly
    one delimiting pipe is removed from each side before splitting, so empty
    first/last cells are preserved (``|| 2 |`` -> ``["", "2"]``). Cell text is
    stripped of surrounding whitespace.

    The first line is the header. An optional separator row right after it —
    every cell made only of ``-``, ``:`` and spaces, with at least one ``-`` —
    is dropped. Returns ``(header, rows)`` where ``rows`` is a list of cell
    lists.
    """
    cells_rows = []
    for ln in lines:
        s = ln.strip()
        if not (s.startswith("|") and s.endswith("|")):
            return None
        # Slice off one pipe per side; str.strip("|") would also swallow the
        # pipes that delimit empty edge cells and shift the columns.
        cells_rows.append([c.strip() for c in s[1:-1].split("|")])
    if not cells_rows:
        return None
    header, body = cells_rows[0], cells_rows[1:]
    # Drop a markdown separator row (all cells are dashes/colons).
    if body and all(set(c) <= set("-: ") and "-" in c for c in body[0]):
        body = body[1:]
    return header, body