feat(eda): negrita inline real (**bold**) en renderers AutomaticEDA

El render de Markdown del motor AutomaticEDA quitaba los marcadores **negrita**
sin aplicar estilo. Ahora los spans **bold**/__bold__ se renderizan en negrita
real, de forma aditiva y sin romper el anti-corte:

- text_layout.py: parse_inline_bold() tokeniza spans preservando el texto
  visible (== strip_inline_md) y wrap_rich() envuelve por palabras a max_chars
  conservando el flag de negrita por segmento (la anchura visible no cambia, así
  que la paginación es idéntica).
- render_pdf_impl.py: _place_rich_lines() dibuja cada segmento con su fontweight
  avanzando x por el mismo grid de caracteres que usa el wrap (párrafos+bullets).
- render_pptx_impl.py: _add_rich_text() usa runs nativos de python-pptx con
  font.bold por segmento (negrita real de PowerPoint).
- bold_render_test.py: helpers puros (no-overflow, bold preservado, marcadores
  desbalanceados) + e2e que abre el .pptx y confirma un run con font.bold True.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-30 16:08:16 +02:00
parent 5eaf3f662e
commit f5b30b23dc
4 changed files with 334 additions and 18 deletions
@@ -0,0 +1,113 @@
"""Tests for inline-bold rendering (**bold**) in the AutomaticEDA engine.
Covers the pure helpers (parse_inline_bold / wrap_rich) and an end-to-end PPTX
check that a ``**bold**`` span is rendered with NATIVE PowerPoint bold
(``run.font.bold is True``) while no line overflows the wrap width (no-cut).
"""
import os
import sys
import pytest
# Make the engine importable as a package (datascience.automatic_eda).
_HERE = os.path.dirname(os.path.abspath(__file__))
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) # python/functions
if _FUNCTIONS not in sys.path:
sys.path.insert(0, _FUNCTIONS)
from datascience.automatic_eda import model # noqa: E402
from datascience.automatic_eda import text_layout as tl # noqa: E402
from datascience.automatic_eda import render_pptx # noqa: E402
# --------------------------------------------------------------------------- #
# Pure helpers.
# --------------------------------------------------------------------------- #
def test_parse_inline_bold_marks_spans_and_preserves_visible_text():
src = "**Estacionariedad:** serie no estacionaria con `code` y normal."
segs = tl.parse_inline_bold(src)
# Visible text equals strip_inline_md (no characters lost, markers removed).
visible = "".join(s for s, _ in segs)
assert visible == tl.strip_inline_md(src)
# The span "Estacionariedad:" is flagged bold; the rest is not.
bold_text = "".join(s for s, b in segs if b)
assert "Estacionariedad:" in bold_text
assert "serie no estacionaria" not in bold_text
def test_parse_inline_bold_handles_unbalanced_markers():
# An unbalanced ** must not crash and must be stripped (matches strip_inline_md).
segs = tl.parse_inline_bold("texto **sin cierre aqui")
visible = "".join(s for s, _ in segs)
assert visible == "texto sin cierre aqui"
assert not any(b for _, b in segs) # nothing rendered bold.
def test_wrap_rich_never_overflows_and_keeps_bold():
text = ("**Segmento premium.** Clientes de alto gasto y baja frecuencia con "
"ticket medio elevado y recurrencia anual estable a lo largo del año.")
max_chars = 30
lines = tl.wrap_rich(text, max_chars)
# No visible line exceeds max_chars (no-cut: the renderer measures these).
for ln in lines:
visible = "".join(s for s, _ in ln)
assert len(visible) <= max_chars, f"línea desborda: {visible!r}"
# At least one segment is bold and it is the span content.
bold_segs = [s for ln in lines for s, b in ln if b]
assert any("Segmento premium." in s for s in bold_segs)
def test_wrap_rich_hard_splits_long_token():
long = "x" * 50
lines = tl.wrap_rich(f"**{long}**", 20)
for ln in lines:
assert len("".join(s for s, _ in ln)) <= 20
# The whole long token is preserved across the split lines.
joined = "".join(s for ln in lines for s, _ in ln)
assert joined == long
# --------------------------------------------------------------------------- #
# End-to-end: PPTX renders **bold** as a real bold run.
# --------------------------------------------------------------------------- #
def _has_pptx():
try:
import pptx # noqa: F401
return True
except Exception: # noqa: BLE001
return False
@pytest.mark.skipif(not _has_pptx(), reason="python-pptx no instalado")
def test_pptx_renders_bold_span_as_native_bold_run(tmp_path):
from pptx import Presentation
doc = [model.Chapter(
id="t", title="Negrita", version="1.0.0",
blocks=[model.Markdown(
text="Frase con **PALABRACLAVE** resaltada y texto normal después.")],
)]
out = str(tmp_path / "bold.pptx")
res = render_pptx(doc, out, {"title": "T"})
assert res.get("path") == out
assert os.path.exists(out)
prs = Presentation(out)
bold_texts = []
all_text = []
for slide in prs.slides:
for shape in slide.shapes:
if not shape.has_text_frame:
continue
for para in shape.text_frame.paragraphs:
for run in para.runs:
all_text.append(run.text)
if run.font.bold:
bold_texts.append(run.text)
# The bold span text appears in a run with font.bold True (native bold).
assert any("PALABRACLAVE" in t for t in bold_texts), \
f"no se encontró run bold con el span; bold={bold_texts}"
# And the surrounding plain text is NOT bold (markers did not bleed).
assert any("resaltada" in t for t in all_text)
assert not any("resaltada" in t for t in bold_texts)
@@ -169,6 +169,38 @@ def _place_text_lines(st: _PdfState, lines: list, fs: float, color: str,
st.y += lh
def _place_rich_lines(st: _PdfState, rich_lines: list, fs: float, color: str,
indent: float = 0.0, prefixes=None) -> None:
"""Draw pre-wrapped lines of styled segments (bold spans rendered bold).
Each line is ``[(text, is_bold), ...]``. Segments are placed left-to-right,
advancing x by the deterministic character grid (same metric the wrapper
used), so a bold span is rendered with ``fontweight='bold'`` without
changing the line's measured width — the no-cut guarantee is preserved.
``prefixes`` is an optional ``(first_line, other_lines)`` pair (e.g. a
bullet) drawn before the segments.
"""
lh = tl.line_height_in(fs)
cw = tl.avg_char_width_in(fs)
for idx, segs in enumerate(rich_lines):
_ensure_space(st, lh)
x = _ML + indent
if prefixes is not None:
prefix = prefixes[0] if idx == 0 else prefixes[1]
if prefix:
st.fig.text(_xf(x), _yf(st.y), prefix, fontsize=fs, color=color,
ha="left", va="top")
x += cw * len(prefix)
for seg_text, is_bold in segs:
if seg_text == "":
continue
st.fig.text(_xf(x), _yf(st.y), seg_text, fontsize=fs, color=color,
ha="left", va="top",
fontweight="bold" if is_bold else "normal")
x += cw * len(seg_text)
st.y += lh
def _place_markdown(st: _PdfState, block) -> None:
raw = getattr(block, "text", "") or ""
md_lines = str(raw).split("\n")
@@ -208,29 +240,25 @@ def _place_markdown(st: _PdfState, block) -> None:
i += 1
continue
if stripped.startswith("- ") or stripped.startswith("* "):
content = tl.strip_inline_md(stripped[2:])
content = stripped[2:] # keep inline markers for bold rendering.
bullet_chars = tl.chars_per_line(_USABLE_W - 0.22, _FS_BODY)
wrapped = tl.wrap(content, bullet_chars)
first = True
for w in wrapped:
prefix = "" if first else " "
_place_text_lines(st, [prefix + w], _FS_BODY, _INK,
indent=0.0)
first = False
rich = tl.wrap_rich(content, bullet_chars)
_place_rich_lines(st, rich, _FS_BODY, _INK,
prefixes=("", " "))
i += 1
continue
# Plain paragraph (gather following plain lines into one paragraph).
para = [tl.strip_inline_md(stripped)]
para = [stripped] # keep inline markers; wrap_rich renders **bold**.
j = i + 1
while j < n:
nxt = md_lines[j].strip()
if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")):
break
para.append(tl.strip_inline_md(nxt))
para.append(nxt)
j += 1
text = " ".join(para)
max_chars = tl.chars_per_line(_USABLE_W, _FS_BODY)
_place_text_lines(st, tl.wrap(text, max_chars), _FS_BODY, _INK)
_place_rich_lines(st, tl.wrap_rich(text, max_chars), _FS_BODY, _INK)
i = j
st.y += _GAP
@@ -151,6 +151,42 @@ def _add_text(st: _PptxState, lines: list, fs: float, color, bold=False,
st.y += height
def _add_rich_text(st: _PptxState, rich_lines: list, fs: float, color,
indent=0.0, bullet=False) -> None:
"""Add pre-wrapped lines of styled segments as one paragraph per line.
Each line is ``[(text, is_bold), ...]``; every segment becomes its own run
so ``**bold**`` spans render with native PowerPoint bold (``run.font.bold``)
without affecting the measured height (one paragraph per pre-wrapped line).
"""
lh = tl.line_height_in(fs)
height = lh * len(rich_lines) + 0.05
_ensure(st, height)
box = st.slide.shapes.add_textbox(
Inches(_ML + indent), Inches(st.y), Inches(_USABLE_W - indent),
Inches(height))
tf = box.text_frame
tf.word_wrap = True
first = True
for segs in rich_lines:
p = tf.paragraphs[0] if first else tf.add_paragraph()
first = False
if bullet:
r0 = p.add_run()
r0.text = ""
r0.font.size = Pt(fs)
r0.font.color.rgb = _rgb(color)
for seg_text, is_bold in segs:
if seg_text == "":
continue
run = p.add_run()
run.text = seg_text
run.font.size = Pt(fs)
run.font.bold = bool(is_bold)
run.font.color.rgb = _rgb(color)
st.y += height
def _place_heading(st: _PptxState, block) -> None:
level = max(1, min(3, int(getattr(block, "level", 1) or 1)))
fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level]
@@ -196,22 +232,23 @@ def _place_markdown(st: _PptxState, block) -> None:
i += 1
continue
if stripped.startswith("- ") or stripped.startswith("* "):
content = tl.strip_inline_md(stripped[2:])
lines = tl.wrap(content, tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY))
_add_text(st, lines, _FS_BODY, _INK, bullet=True)
content = stripped[2:] # keep inline markers for bold rendering.
rich = tl.wrap_rich(content,
tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY))
_add_rich_text(st, rich, _FS_BODY, _INK, bullet=True)
i += 1
continue
para = [tl.strip_inline_md(stripped)]
para = [stripped] # keep inline markers; wrap_rich renders **bold**.
j = i + 1
while j < n:
nxt = md_lines[j].strip()
if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")):
break
para.append(tl.strip_inline_md(nxt))
para.append(nxt)
j += 1
text = " ".join(para)
_add_text(st, tl.wrap(text, tl.chars_per_line(_USABLE_W, _FS_BODY)),
_FS_BODY, _INK)
_add_rich_text(st, tl.wrap_rich(text, tl.chars_per_line(_USABLE_W, _FS_BODY)),
_FS_BODY, _INK)
i = j
st.y += _GAP
@@ -15,8 +15,15 @@ overflowing — that is wrapping, not loss: every character is still rendered.
from __future__ import annotations
import re
import textwrap
# Inline span markers: ``**bold**`` / ``__bold__`` (rendered bold) and
# `` `code` `` (markers removed, not styled). Matched non-greedily so the
# shortest balanced pair wins. Unbalanced leftovers are stripped afterwards so
# the visible text matches ``strip_inline_md`` exactly.
_INLINE_SPAN_RE = re.compile(r"(\*\*.+?\*\*|__.+?__|`.+?`)")
def avg_char_width_in(fontsize_pt: float) -> float:
"""Approximate average glyph width in inches for a sans-serif font.
@@ -84,6 +91,137 @@ def strip_inline_md(text: str) -> str:
return s
def _strip_leftover_markers(s: str) -> str:
"""Drop any unbalanced inline markers from a plain (non-span) fragment.
Keeps the visible text identical to :func:`strip_inline_md` even when a
``**`` / ``__`` / `` ` `` has no matching closing marker.
"""
for marker in ("**", "__", "`"):
s = s.replace(marker, "")
return s
def parse_inline_bold(text: str):
"""Split ``text`` into ``[(fragment, is_bold), ...]`` preserving order.
``**...**`` and ``__...__`` spans become bold fragments (markers removed);
`` `code` `` keeps its text without the backticks and is not bold; any other
text is emitted verbatim with unbalanced markers stripped. The concatenation
of all fragment texts equals :func:`strip_inline_md` of the input — so the
*visible* characters (and therefore line wrapping) are unchanged; only the
bold flag is added. Adjacent fragments of the same weight are merged.
"""
s = "" if text is None else str(text)
if not s:
return []
out = []
def _emit(fragment: str, bold: bool) -> None:
if fragment == "":
return
if out and out[-1][1] == bold:
out[-1] = (out[-1][0] + fragment, bold)
else:
out.append((fragment, bold))
pos = 0
for m in _INLINE_SPAN_RE.finditer(s):
if m.start() > pos:
_emit(_strip_leftover_markers(s[pos:m.start()]), False)
tok = m.group(0)
if tok.startswith("**") and tok.endswith("**"):
_emit(tok[2:-2], True)
elif tok.startswith("__") and tok.endswith("__"):
_emit(tok[2:-2], True)
else: # `code`
_emit(tok[1:-1], False)
pos = m.end()
if pos < len(s):
_emit(_strip_leftover_markers(s[pos:]), False)
return out
def _hard_split(word: str, max_chars: int):
"""Split a single long token into <= max_chars chunks (never loses chars)."""
return [word[i:i + max_chars] for i in range(0, len(word), max_chars)] or [""]
def wrap_rich(text: str, max_chars: int):
"""Word-wrap ``text`` to ``max_chars`` while preserving inline bold spans.
Returns ``list[list[(fragment, is_bold)]]`` — one inner list of styled
fragments per output line; concatenating an inner list's fragment texts is
the visible line. Wrapping is word-aware and hard-splits over-long tokens, so
no line exceeds ``max_chars`` (the renderers measure these very lines, so the
no-cut guarantee holds). Bold spans never widen a line: only the bold flag is
carried, the visible width is identical to :func:`wrap`.
"""
if max_chars < 1:
max_chars = 1
spans = parse_inline_bold(text)
if not spans:
return [[("", False)]]
# Flatten to (word, is_bold) tokens, honoring hard newlines as line breaks.
# A token list of None marks a forced line break.
tokens = [] # each: (word, bold) or ("\n", None)
for frag, bold in spans:
parts = frag.split("\n")
for pi, part in enumerate(parts):
if pi > 0:
tokens.append(("\n", None))
for word in part.split(" "):
if word == "":
continue
tokens.append((word, bold))
lines = [] # list[list[(seg, bold)]]
cur = [] # list[(word, bold)]
cur_len = 0
def _flush():
nonlocal cur, cur_len
# Merge adjacent same-weight words (with separating spaces) into segments.
merged = []
for k, (word, bold) in enumerate(cur):
piece = word if k == 0 else " " + word
if merged and merged[-1][1] == bold:
merged[-1] = (merged[-1][0] + piece, bold)
else:
merged.append((piece, bold))
lines.append(merged or [("", False)])
cur = []
cur_len = 0
for word, bold in tokens:
if bold is None: # forced newline
_flush()
continue
if len(word) > max_chars:
if cur:
_flush()
chunks = _hard_split(word, max_chars)
for ci, chunk in enumerate(chunks):
if ci < len(chunks) - 1:
lines.append([(chunk, bold)])
else:
cur = [(chunk, bold)]
cur_len = len(chunk)
continue
add = len(word) if cur_len == 0 else cur_len + 1 + len(word)
if cur_len != 0 and add > max_chars:
_flush()
cur = [(word, bold)]
cur_len = len(word)
else:
cur.append((word, bold))
cur_len = add
if cur:
_flush()
return lines or [[("", False)]]
def parse_md_table(lines: list):
"""Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None.