feat(eda): negrita inline real (**bold**) en renderers AutomaticEDA

El render de Markdown del motor AutomaticEDA quitaba los marcadores **negrita** sin aplicar estilo. Ahora los spans **bold**/__bold__ se renderizan en negrita real, de forma aditiva y sin romper el anti-corte:

- text_layout.py: parse_inline_bold() tokeniza spans preservando el texto visible (== strip_inline_md) y wrap_rich() envuelve por palabras a max_chars conservando el flag de negrita por segmento (la anchura visible no cambia, así que la paginación es idéntica).
- render_pdf_impl.py: _place_rich_lines() dibuja cada segmento con su fontweight avanzando x por el mismo grid de caracteres que usa el wrap (párrafos+bullets).
- render_pptx_impl.py: _add_rich_text() usa runs nativos de python-pptx con font.bold por segmento (negrita real de PowerPoint).
- bold_render_test.py: helpers puros (no-overflow, bold preservado, marcadores desbalanceados) + e2e que abre el .pptx y confirma un run con font.bold True.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 16:08:16 +02:00
parent 5eaf3f662e
commit f5b30b23dc
4 changed files with 334 additions and 18 deletions
@@ -0,0 +1,113 @@
+"""Tests for inline-bold rendering (**bold**) in the AutomaticEDA engine.
+
+Covers the pure helpers (parse_inline_bold / wrap_rich) and an end-to-end PPTX
+check that a ``**bold**`` span is rendered with NATIVE PowerPoint bold
+(``run.font.bold is True``) while no line overflows the wrap width (no-cut).
+"""
+
+import os
+import sys
+
+import pytest
+
+# Make the engine importable as a package (datascience.automatic_eda).
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))  # python/functions
+if _FUNCTIONS not in sys.path:
+    sys.path.insert(0, _FUNCTIONS)
+
+from datascience.automatic_eda import model  # noqa: E402
+from datascience.automatic_eda import text_layout as tl  # noqa: E402
+from datascience.automatic_eda import render_pptx  # noqa: E402
+
+
+# --------------------------------------------------------------------------- #
+# Pure helpers.
+# --------------------------------------------------------------------------- #
+def test_parse_inline_bold_marks_spans_and_preserves_visible_text():
+    src = "**Estacionariedad:** serie no estacionaria con `code` y normal."
+    segs = tl.parse_inline_bold(src)
+    # Visible text equals strip_inline_md (no characters lost, markers removed).
+    visible = "".join(s for s, _ in segs)
+    assert visible == tl.strip_inline_md(src)
+    # The span "Estacionariedad:" is flagged bold; the rest is not.
+    bold_text = "".join(s for s, b in segs if b)
+    assert "Estacionariedad:" in bold_text
+    assert "serie no estacionaria" not in bold_text
+
+
+def test_parse_inline_bold_handles_unbalanced_markers():
+    # An unbalanced ** must not crash and must be stripped (matches strip_inline_md).
+    segs = tl.parse_inline_bold("texto **sin cierre aqui")
+    visible = "".join(s for s, _ in segs)
+    assert visible == "texto sin cierre aqui"
+    assert not any(b for _, b in segs)  # nothing rendered bold.
+
+
+def test_wrap_rich_never_overflows_and_keeps_bold():
+    text = ("**Segmento premium.** Clientes de alto gasto y baja frecuencia con "
+            "ticket medio elevado y recurrencia anual estable a lo largo del año.")
+    max_chars = 30
+    lines = tl.wrap_rich(text, max_chars)
+    # No visible line exceeds max_chars (no-cut: the renderer measures these).
+    for ln in lines:
+        visible = "".join(s for s, _ in ln)
+        assert len(visible) <= max_chars, f"línea desborda: {visible!r}"
+    # At least one segment is bold and it is the span content.
+    bold_segs = [s for ln in lines for s, b in ln if b]
+    assert any("Segmento premium." in s for s in bold_segs)
+
+
+def test_wrap_rich_hard_splits_long_token():
+    long = "x" * 50
+    lines = tl.wrap_rich(f"**{long}**", 20)
+    for ln in lines:
+        assert len("".join(s for s, _ in ln)) <= 20
+    # The whole long token is preserved across the split lines.
+    joined = "".join(s for ln in lines for s, _ in ln)
+    assert joined == long
+
+
+# --------------------------------------------------------------------------- #
+# End-to-end: PPTX renders **bold** as a real bold run.
+# --------------------------------------------------------------------------- #
+def _has_pptx():
+    try:
+        import pptx  # noqa: F401
+        return True
+    except Exception:  # noqa: BLE001
+        return False
+
+
+@pytest.mark.skipif(not _has_pptx(), reason="python-pptx no instalado")
+def test_pptx_renders_bold_span_as_native_bold_run(tmp_path):
+    from pptx import Presentation
+
+    doc = [model.Chapter(
+        id="t", title="Negrita", version="1.0.0",
+        blocks=[model.Markdown(
+            text="Frase con **PALABRACLAVE** resaltada y texto normal después.")],
+    )]
+    out = str(tmp_path / "bold.pptx")
+    res = render_pptx(doc, out, {"title": "T"})
+    assert res.get("path") == out
+    assert os.path.exists(out)
+
+    prs = Presentation(out)
+    bold_texts = []
+    all_text = []
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if not shape.has_text_frame:
+                continue
+            for para in shape.text_frame.paragraphs:
+                for run in para.runs:
+                    all_text.append(run.text)
+                    if run.font.bold:
+                        bold_texts.append(run.text)
+    # The bold span text appears in a run with font.bold True (native bold).
+    assert any("PALABRACLAVE" in t for t in bold_texts), \
+        f"no se encontró run bold con el span; bold={bold_texts}"
+    # And the surrounding plain text is NOT bold (markers did not bleed).
+    assert any("resaltada" in t for t in all_text)
+    assert not any("resaltada" in t for t in bold_texts)
@@ -169,6 +169,38 @@ def _place_text_lines(st: _PdfState, lines: list, fs: float, color: str,
        st.y += lh


+def _place_rich_lines(st: _PdfState, rich_lines: list, fs: float, color: str,
+                      indent: float = 0.0, prefixes=None) -> None:
+    """Draw pre-wrapped lines of styled segments (bold spans rendered bold).
+
+    Each line is ``[(text, is_bold), ...]``. Segments are placed left-to-right,
+    advancing x by the deterministic character grid (same metric the wrapper
+    used), so a bold span is rendered with ``fontweight='bold'`` without
+    changing the line's measured width — the no-cut guarantee is preserved.
+    ``prefixes`` is an optional ``(first_line, other_lines)`` pair (e.g. a
+    bullet) drawn before the segments.
+    """
+    lh = tl.line_height_in(fs)
+    cw = tl.avg_char_width_in(fs)
+    for idx, segs in enumerate(rich_lines):
+        _ensure_space(st, lh)
+        x = _ML + indent
+        if prefixes is not None:
+            prefix = prefixes[0] if idx == 0 else prefixes[1]
+            if prefix:
+                st.fig.text(_xf(x), _yf(st.y), prefix, fontsize=fs, color=color,
+                            ha="left", va="top")
+                x += cw * len(prefix)
+        for seg_text, is_bold in segs:
+            if seg_text == "":
+                continue
+            st.fig.text(_xf(x), _yf(st.y), seg_text, fontsize=fs, color=color,
+                        ha="left", va="top",
+                        fontweight="bold" if is_bold else "normal")
+            x += cw * len(seg_text)
+        st.y += lh
+
+
 def _place_markdown(st: _PdfState, block) -> None:
    raw = getattr(block, "text", "") or ""
    md_lines = str(raw).split("\n")
@@ -208,29 +240,25 @@ def _place_markdown(st: _PdfState, block) -> None:
            i += 1
            continue
        if stripped.startswith("- ") or stripped.startswith("* "):
-            content = tl.strip_inline_md(stripped[2:])
+            content = stripped[2:]  # keep inline markers for bold rendering.
            bullet_chars = tl.chars_per_line(_USABLE_W - 0.22, _FS_BODY)
-            wrapped = tl.wrap(content, bullet_chars)
-            first = True
-            for w in wrapped:
-                prefix = "•  " if first else "   "
-                _place_text_lines(st, [prefix + w], _FS_BODY, _INK,
-                                  indent=0.0)
-                first = False
+            rich = tl.wrap_rich(content, bullet_chars)
+            _place_rich_lines(st, rich, _FS_BODY, _INK,
+                              prefixes=("•  ", "   "))
            i += 1
            continue
        # Plain paragraph (gather following plain lines into one paragraph).
-        para = [tl.strip_inline_md(stripped)]
+        para = [stripped]  # keep inline markers; wrap_rich renders **bold**.
        j = i + 1
        while j < n:
            nxt = md_lines[j].strip()
            if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")):
                break
-            para.append(tl.strip_inline_md(nxt))
+            para.append(nxt)
            j += 1
        text = " ".join(para)
        max_chars = tl.chars_per_line(_USABLE_W, _FS_BODY)
-        _place_text_lines(st, tl.wrap(text, max_chars), _FS_BODY, _INK)
+        _place_rich_lines(st, tl.wrap_rich(text, max_chars), _FS_BODY, _INK)
        i = j
    st.y += _GAP

@@ -151,6 +151,42 @@ def _add_text(st: _PptxState, lines: list, fs: float, color, bold=False,
    st.y += height


+def _add_rich_text(st: _PptxState, rich_lines: list, fs: float, color,
+                   indent=0.0, bullet=False) -> None:
+    """Add pre-wrapped lines of styled segments as one paragraph per line.
+
+    Each line is ``[(text, is_bold), ...]``; every segment becomes its own run
+    so ``**bold**`` spans render with native PowerPoint bold (``run.font.bold``)
+    without affecting the measured height (one paragraph per pre-wrapped line).
+    """
+    lh = tl.line_height_in(fs)
+    height = lh * len(rich_lines) + 0.05
+    _ensure(st, height)
+    box = st.slide.shapes.add_textbox(
+        Inches(_ML + indent), Inches(st.y), Inches(_USABLE_W - indent),
+        Inches(height))
+    tf = box.text_frame
+    tf.word_wrap = True
+    first = True
+    for segs in rich_lines:
+        p = tf.paragraphs[0] if first else tf.add_paragraph()
+        first = False
+        if bullet:
+            r0 = p.add_run()
+            r0.text = "•  "
+            r0.font.size = Pt(fs)
+            r0.font.color.rgb = _rgb(color)
+        for seg_text, is_bold in segs:
+            if seg_text == "":
+                continue
+            run = p.add_run()
+            run.text = seg_text
+            run.font.size = Pt(fs)
+            run.font.bold = bool(is_bold)
+            run.font.color.rgb = _rgb(color)
+    st.y += height
+
+
 def _place_heading(st: _PptxState, block) -> None:
    level = max(1, min(3, int(getattr(block, "level", 1) or 1)))
    fs = {1: _FS_H1, 2: _FS_H2, 3: _FS_H3}[level]
@@ -196,22 +232,23 @@ def _place_markdown(st: _PptxState, block) -> None:
            i += 1
            continue
        if stripped.startswith("- ") or stripped.startswith("* "):
-            content = tl.strip_inline_md(stripped[2:])
-            lines = tl.wrap(content, tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY))
-            _add_text(st, lines, _FS_BODY, _INK, bullet=True)
+            content = stripped[2:]  # keep inline markers for bold rendering.
+            rich = tl.wrap_rich(content,
+                                tl.chars_per_line(_USABLE_W - 0.3, _FS_BODY))
+            _add_rich_text(st, rich, _FS_BODY, _INK, bullet=True)
            i += 1
            continue
-        para = [tl.strip_inline_md(stripped)]
+        para = [stripped]  # keep inline markers; wrap_rich renders **bold**.
        j = i + 1
        while j < n:
            nxt = md_lines[j].strip()
            if nxt == "" or nxt.startswith(("|", "#", "- ", "* ")):
                break
-            para.append(tl.strip_inline_md(nxt))
+            para.append(nxt)
            j += 1
        text = " ".join(para)
-        _add_text(st, tl.wrap(text, tl.chars_per_line(_USABLE_W, _FS_BODY)),
-                  _FS_BODY, _INK)
+        _add_rich_text(st, tl.wrap_rich(text, tl.chars_per_line(_USABLE_W, _FS_BODY)),
+                       _FS_BODY, _INK)
        i = j
    st.y += _GAP

@@ -15,8 +15,15 @@ overflowing — that is wrapping, not loss: every character is still rendered.

 from __future__ import annotations

+import re
 import textwrap

+# Inline span markers: ``**bold**`` / ``__bold__`` (rendered bold) and
+# `` `code` `` (markers removed, not styled). Matched non-greedily so the
+# shortest balanced pair wins. Unbalanced leftovers are stripped afterwards so
+# the visible text matches ``strip_inline_md`` exactly.
+_INLINE_SPAN_RE = re.compile(r"(\*\*.+?\*\*|__.+?__|`.+?`)")
+

 def avg_char_width_in(fontsize_pt: float) -> float:
    """Approximate average glyph width in inches for a sans-serif font.
@@ -84,6 +91,137 @@ def strip_inline_md(text: str) -> str:
    return s


+def _strip_leftover_markers(s: str) -> str:
+    """Drop any unbalanced inline markers from a plain (non-span) fragment.
+
+    Keeps the visible text identical to :func:`strip_inline_md` even when a
+    ``**`` / ``__`` / `` ` `` has no matching closing marker.
+    """
+    for marker in ("**", "__", "`"):
+        s = s.replace(marker, "")
+    return s
+
+
+def parse_inline_bold(text: str):
+    """Split ``text`` into ``[(fragment, is_bold), ...]`` preserving order.
+
+    ``**...**`` and ``__...__`` spans become bold fragments (markers removed);
+    `` `code` `` keeps its text without the backticks and is not bold; any other
+    text is emitted verbatim with unbalanced markers stripped. The concatenation
+    of all fragment texts equals :func:`strip_inline_md` of the input — so the
+    *visible* characters (and therefore line wrapping) are unchanged; only the
+    bold flag is added. Adjacent fragments of the same weight are merged.
+    """
+    s = "" if text is None else str(text)
+    if not s:
+        return []
+    out = []
+
+    def _emit(fragment: str, bold: bool) -> None:
+        if fragment == "":
+            return
+        if out and out[-1][1] == bold:
+            out[-1] = (out[-1][0] + fragment, bold)
+        else:
+            out.append((fragment, bold))
+
+    pos = 0
+    for m in _INLINE_SPAN_RE.finditer(s):
+        if m.start() > pos:
+            _emit(_strip_leftover_markers(s[pos:m.start()]), False)
+        tok = m.group(0)
+        if tok.startswith("**") and tok.endswith("**"):
+            _emit(tok[2:-2], True)
+        elif tok.startswith("__") and tok.endswith("__"):
+            _emit(tok[2:-2], True)
+        else:  # `code`
+            _emit(tok[1:-1], False)
+        pos = m.end()
+    if pos < len(s):
+        _emit(_strip_leftover_markers(s[pos:]), False)
+    return out
+
+
+def _hard_split(word: str, max_chars: int):
+    """Split a single long token into <= max_chars chunks (never loses chars)."""
+    return [word[i:i + max_chars] for i in range(0, len(word), max_chars)] or [""]
+
+
+def wrap_rich(text: str, max_chars: int):
+    """Word-wrap ``text`` to ``max_chars`` while preserving inline bold spans.
+
+    Returns ``list[list[(fragment, is_bold)]]`` — one inner list of styled
+    fragments per output line; concatenating an inner list's fragment texts is
+    the visible line. Wrapping is word-aware and hard-splits over-long tokens, so
+    no line exceeds ``max_chars`` (the renderers measure these very lines, so the
+    no-cut guarantee holds). Bold markers add no width; only the flag is carried.
+    NOTE(review): a span glued to text ("**a**b") is re-joined as "a b" (extra space).
+    """
+    if max_chars < 1:
+        max_chars = 1
+    spans = parse_inline_bold(text)
+    if not spans:
+        return [[("", False)]]
+
+    # Flatten to (word, is_bold) tokens, honoring hard newlines as line breaks.
+    # A token whose flag is None, i.e. ("\n", None), marks a forced line break.
+    tokens = []  # each: (word, bold) or ("\n", None)
+    for frag, bold in spans:
+        parts = frag.split("\n")
+        for pi, part in enumerate(parts):
+            if pi > 0:
+                tokens.append(("\n", None))
+            for word in part.split(" "):
+                if word == "":
+                    continue
+                tokens.append((word, bold))
+
+    lines = []          # list[list[(seg, bold)]]
+    cur = []            # list[(word, bold)]
+    cur_len = 0
+
+    def _flush():
+        nonlocal cur, cur_len
+        # Merge adjacent same-weight words (with separating spaces) into segments.
+        merged = []
+        for k, (word, bold) in enumerate(cur):
+            piece = word if k == 0 else " " + word
+            if merged and merged[-1][1] == bold:
+                merged[-1] = (merged[-1][0] + piece, bold)
+            else:
+                merged.append((piece, bold))
+        lines.append(merged or [("", False)])
+        cur = []
+        cur_len = 0
+
+    for word, bold in tokens:
+        if bold is None:  # forced newline
+            _flush()
+            continue
+        if len(word) > max_chars:
+            if cur:
+                _flush()
+            chunks = _hard_split(word, max_chars)
+            for ci, chunk in enumerate(chunks):
+                if ci < len(chunks) - 1:
+                    lines.append([(chunk, bold)])
+                else:
+                    cur = [(chunk, bold)]
+                    cur_len = len(chunk)
+            continue
+        add = len(word) if cur_len == 0 else cur_len + 1 + len(word)
+        if cur_len != 0 and add > max_chars:
+            _flush()
+            cur = [(word, bold)]
+            cur_len = len(word)
+        else:
+            cur.append((word, bold))
+            cur_len = add
+    if cur:
+        _flush()
+    return lines or [[("", False)]]
+
+
 def parse_md_table(lines: list):
    """Parse consecutive ``| a | b |`` lines into ``(header, rows)`` or None.