From f5b30b23dc5af0c96caafc4f3dde1e75d6efc8b1 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 16:08:16 +0200 Subject: [PATCH] feat(eda): negrita inline real (**bold**) en renderers AutomaticEDA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit El render de Markdown del motor AutomaticEDA quitaba los marcadores **negrita** sin aplicar estilo. Ahora los spans **bold**/__bold__ se renderizan en negrita real, de forma aditiva y sin romper el anti-corte: - text_layout.py: parse_inline_bold() tokeniza spans preservando el texto visible (== strip_inline_md) y wrap_rich() envuelve por palabras a max_chars conservando el flag de negrita por segmento (la anchura visible no cambia, así que la paginación es idéntica). - render_pdf_impl.py: _place_rich_lines() dibuja cada segmento con su fontweight avanzando x por el mismo grid de caracteres que usa el wrap (párrafos+bullets). - render_pptx_impl.py: _add_rich_text() usa runs nativos de python-pptx con font.bold por segmento (negrita real de PowerPoint). - bold_render_test.py: helpers puros (no-overflow, bold preservado, marcadores desbalanceados) + e2e que abre el .pptx y confirma un run con font.bold True. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../automatic_eda/bold_render_test.py | 113 ++++++++++++++ .../automatic_eda/render_pdf_impl.py | 50 +++++-- .../automatic_eda/render_pptx_impl.py | 51 ++++++- .../datascience/automatic_eda/text_layout.py | 138 ++++++++++++++++++ 4 files changed, 334 insertions(+), 18 deletions(-) create mode 100644 python/functions/datascience/automatic_eda/bold_render_test.py diff --git a/python/functions/datascience/automatic_eda/bold_render_test.py b/python/functions/datascience/automatic_eda/bold_render_test.py new file mode 100644 index 00000000..dcd98628 --- /dev/null +++ b/python/functions/datascience/automatic_eda/bold_render_test.py @@ -0,0 +1,113 @@ +"""Tests for inline-bold rendering (**bold**) in the AutomaticEDA engine. 
+ +Covers the pure helpers (parse_inline_bold / wrap_rich) and an end-to-end PPTX +check that a ``**bold**`` span is rendered with NATIVE PowerPoint bold +(``run.font.bold is True``) while no line overflows the wrap width (no-cut). +""" + +import os +import sys + +import pytest + +# Make the engine importable as a package (datascience.automatic_eda). +_HERE = os.path.dirname(os.path.abspath(__file__)) +_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) # python/functions +if _FUNCTIONS not in sys.path: + sys.path.insert(0, _FUNCTIONS) + +from datascience.automatic_eda import model # noqa: E402 +from datascience.automatic_eda import text_layout as tl # noqa: E402 +from datascience.automatic_eda import render_pptx # noqa: E402 + + +# --------------------------------------------------------------------------- # +# Pure helpers. +# --------------------------------------------------------------------------- # +def test_parse_inline_bold_marks_spans_and_preserves_visible_text(): + src = "**Estacionariedad:** serie no estacionaria con `code` y normal." + segs = tl.parse_inline_bold(src) + # Visible text equals strip_inline_md (no characters lost, markers removed). + visible = "".join(s for s, _ in segs) + assert visible == tl.strip_inline_md(src) + # The span "Estacionariedad:" is flagged bold; the rest is not. + bold_text = "".join(s for s, b in segs if b) + assert "Estacionariedad:" in bold_text + assert "serie no estacionaria" not in bold_text + + +def test_parse_inline_bold_handles_unbalanced_markers(): + # An unbalanced ** must not crash and must be stripped (matches strip_inline_md). + segs = tl.parse_inline_bold("texto **sin cierre aqui") + visible = "".join(s for s, _ in segs) + assert visible == "texto sin cierre aqui" + assert not any(b for _, b in segs) # nothing rendered bold. 
+ + +def test_wrap_rich_never_overflows_and_keeps_bold(): + text = ("**Segmento premium.** Clientes de alto gasto y baja frecuencia con " + "ticket medio elevado y recurrencia anual estable a lo largo del año.") + max_chars = 30 + lines = tl.wrap_rich(text, max_chars) + # No visible line exceeds max_chars (no-cut: the renderer measures these). + for ln in lines: + visible = "".join(s for s, _ in ln) + assert len(visible) <= max_chars, f"línea desborda: {visible!r}" + # At least one segment is bold and it is the span content. + bold_segs = [s for ln in lines for s, b in ln if b] + assert any("Segmento premium." in s for s in bold_segs) + + +def test_wrap_rich_hard_splits_long_token(): + long = "x" * 50 + lines = tl.wrap_rich(f"**{long}**", 20) + for ln in lines: + assert len("".join(s for s, _ in ln)) <= 20 + # The whole long token is preserved across the split lines. + joined = "".join(s for ln in lines for s, _ in ln) + assert joined == long + + +# --------------------------------------------------------------------------- # +# End-to-end: PPTX renders **bold** as a real bold run. 
def _place_rich_lines(st: _PdfState, rich_lines: list, fs: float, color: str,
                      indent: float = 0.0, prefixes=None) -> None:
    """Draw pre-wrapped styled lines onto the PDF page, one text call per segment.

    Args:
        st: Renderer state; ``st.fig`` receives the text and ``st.y`` is
            advanced by one line height after every line.
        rich_lines: One entry per output line, each ``[(text, is_bold), ...]``.
        fs: Font size passed to ``fontsize`` and to the ``tl`` metrics.
        color: Text colour used for the prefix and for every segment.
        indent: Extra left offset added to the left margin ``_ML``.
        prefixes: Optional ``(first_line, other_lines)`` pair drawn before the
            segments (e.g. a bullet glyph and its continuation indent).

    The horizontal cursor advances by ``tl.avg_char_width_in(fs)`` per
    character, not by the rendered glyph extent.
    NOTE(review): with a proportional font a wide or bold segment may drift
    from that grid -- confirm the output visually.
    """
    line_h = tl.line_height_in(fs)
    char_w = tl.avg_char_width_in(fs)
    for row, segments in enumerate(rich_lines):
        # Must run before st.y is read: it may move the cursor (e.g. new page).
        _ensure_space(st, line_h)
        cursor = _ML + indent

        lead = None
        if prefixes is not None:
            lead = prefixes[1] if row else prefixes[0]
        if lead:
            st.fig.text(_xf(cursor), _yf(st.y), lead, fontsize=fs,
                        color=color, ha="left", va="top")
            cursor += char_w * len(lead)

        for fragment, strong in segments:
            if fragment != "":
                weight = "bold" if strong else "normal"
                st.fig.text(_xf(cursor), _yf(st.y), fragment, fontsize=fs,
                            color=color, ha="left", va="top",
                            fontweight=weight)
                cursor += char_w * len(fragment)

        st.y += line_h
def _add_rich_text(st: _PptxState, rich_lines: list, fs: float, color,
                   indent=0.0, bullet=False) -> None:
    """Add pre-wrapped styled lines to the slide as one paragraph per line.

    Args:
        st: Renderer state; the textbox is added to ``st.slide`` at ``st.y``
            and ``st.y`` is advanced by the box height afterwards.
        rich_lines: One entry per output line, each ``[(text, is_bold), ...]``.
            Every non-empty segment becomes its own run so ``run.font.bold``
            can be set per segment.
        fs: Font size in points, applied to every run.
        color: Colour value handed to ``_rgb`` for every run.
        indent: Extra left offset (inches); also subtracted from the box width.
        bullet: When true the lines form ONE list item: the first line is
            prefixed with ``"• "`` and continuation lines with two spaces.

    The bullet glyph is emitted only on the first wrapped line so a long item
    does not look like several separate bullets; this mirrors the PDF
    renderer's ``prefixes=("• ", "  ")`` convention.
    """
    lh = tl.line_height_in(fs)
    # Box height is derived from the pre-wrapped line count plus a small pad.
    height = lh * len(rich_lines) + 0.05
    _ensure(st, height)
    box = st.slide.shapes.add_textbox(
        Inches(_ML + indent), Inches(st.y), Inches(_USABLE_W - indent),
        Inches(height))
    tf = box.text_frame
    tf.word_wrap = True
    for idx, segs in enumerate(rich_lines):
        # A new text frame already owns one empty paragraph; reuse it first.
        p = tf.paragraphs[0] if idx == 0 else tf.add_paragraph()
        if bullet:
            r0 = p.add_run()
            # Glyph on the first line only; same-width lead on continuations.
            r0.text = "• " if idx == 0 else "  "
            r0.font.size = Pt(fs)
            r0.font.color.rgb = _rgb(color)
        for seg_text, is_bold in segs:
            if seg_text == "":
                continue
            run = p.add_run()
            run.text = seg_text
            run.font.size = Pt(fs)
            run.font.bold = bool(is_bold)
            run.font.color.rgb = _rgb(color)
    st.y += height
import re

# Inline span markers: ``**bold**`` / ``__bold__`` (rendered bold) and
# `` `code` `` (markers removed, not styled). Matched non-greedily so the
# shortest balanced pair wins. Unbalanced leftovers are stripped afterwards.
_INLINE_SPAN_RE = re.compile(r"(\*\*.+?\*\*|__.+?__|`.+?`)")

# A wrap token is either a hard newline or a maximal run of characters that
# contains neither a space nor a newline (one visible "word").
_RICH_TOKEN_RE = re.compile(r"\n|[^ \n]+")


def _strip_leftover_markers(s: str) -> str:
    """Remove every ``**``, ``__`` and backtick occurrence from ``s``.

    Used on fragments where any marker still present is unbalanced or nested
    and must not reach the rendered output.
    """
    for marker in ("**", "__", "`"):
        s = s.replace(marker, "")
    return s


def parse_inline_bold(text: str) -> list:
    """Split ``text`` into ``[(fragment, is_bold), ...]`` preserving order.

    ``**...**`` and ``__...__`` spans become bold fragments with the markers
    removed; `` `code` `` keeps its text without the backticks and is not
    bold; all other text is emitted with unbalanced markers stripped.
    Markers nested inside a bold span (e.g. ``**`col`**``) are stripped too,
    so no marker character leaks into a bold fragment. Adjacent fragments of
    the same weight are merged and empty fragments are dropped.

    Returns an empty list for ``None`` or an empty string.
    NOTE(review): the visible text is intended to match ``strip_inline_md``;
    that function is not visible here -- confirm for nested markers.
    """
    s = "" if text is None else str(text)
    if not s:
        return []
    out = []

    def _emit(fragment: str, bold: bool) -> None:
        if fragment == "":
            return
        if out and out[-1][1] == bold:
            out[-1] = (out[-1][0] + fragment, bold)
        else:
            out.append((fragment, bold))

    pos = 0
    for m in _INLINE_SPAN_RE.finditer(s):
        if m.start() > pos:
            _emit(_strip_leftover_markers(s[pos:m.start()]), False)
        tok = m.group(0)
        if tok.startswith(("**", "__")):
            # The regex guarantees the matching two-character closing marker.
            _emit(_strip_leftover_markers(tok[2:-2]), True)
        else:  # `code`: content kept verbatim, only the backticks go.
            _emit(tok[1:-1], False)
        pos = m.end()
    if pos < len(s):
        _emit(_strip_leftover_markers(s[pos:]), False)
    return out


def _hard_split(word: str, max_chars: int):
    """Split a single long token into <= max_chars chunks (never loses chars)."""
    return [word[i:i + max_chars] for i in range(0, len(word), max_chars)] or [""]


def wrap_rich(text: str, max_chars: int) -> list:
    """Word-wrap ``text`` to ``max_chars`` while preserving inline bold spans.

    Returns ``list[list[(fragment, is_bold)]]``: one inner list of styled
    fragments per output line; joining an inner list's fragment texts gives
    the visible line. Words are separated on spaces, ``"\\n"`` forces a line
    break, and a word longer than ``max_chars`` is hard-split, so no visible
    line exceeds ``max_chars``. ``max_chars`` below 1 is treated as 1.

    Words are taken from the *visible* text (markers already removed), not
    from each span separately. A span boundary without whitespace, such as
    ``**Nota**:`` or ``(**x**)``, therefore stays glued and no space is
    invented between the bold and the plain part. The single space that joins
    two words on a line carries the weight of the word that follows it.
    """
    if max_chars < 1:
        max_chars = 1
    spans = parse_inline_bold(text)
    if not spans:
        return [[("", False)]]

    # Visible text plus a parallel per-character bold flag.
    visible = "".join(frag for frag, _ in spans)
    weights = []
    for frag, bold in spans:
        weights.extend([bold] * len(frag))

    def _segments(ranges):
        """Turn word index ranges of one line into merged (text, bold) runs."""
        segs = []
        for k, (start, end) in enumerate(ranges):
            lead = " " if k else ""  # separator before every word but the first
            pos = start
            while pos < end:
                bold = weights[pos]
                stop = pos + 1
                while stop < end and weights[stop] == bold:
                    stop += 1
                piece = lead + visible[pos:stop]
                lead = ""
                if segs and segs[-1][1] == bold:
                    segs[-1] = (segs[-1][0] + piece, bold)
                else:
                    segs.append((piece, bold))
                pos = stop
        return segs or [("", False)]

    lines = []   # finished lines: list[list[(fragment, bold)]]
    cur = []     # (start, end) ranges of the words on the line being built
    cur_len = 0  # visible width of that line, separators included

    def _flush():
        nonlocal cur, cur_len
        lines.append(_segments(cur))
        cur = []
        cur_len = 0

    for m in _RICH_TOKEN_RE.finditer(visible):
        if m.group(0) == "\n":  # forced line break (emits "" if line is empty)
            _flush()
            continue
        start, end = m.span()
        width = end - start
        if width > max_chars:
            # Over-long word: close the current line, emit full-width chunks
            # as their own lines and let the last chunk start a new line.
            if cur:
                _flush()
            offset = start
            chunks = _hard_split(visible[start:end], max_chars)
            for chunk in chunks[:-1]:
                lines.append(_segments([(offset, offset + len(chunk))]))
                offset += len(chunk)
            cur = [(offset, end)]
            cur_len = end - offset
            continue
        needed = width if cur_len == 0 else cur_len + 1 + width
        if cur_len != 0 and needed > max_chars:
            _flush()
            cur = [(start, end)]
            cur_len = width
        else:
            cur.append((start, end))
            cur_len = needed
    if cur:
        _flush()
    return lines or [[("", False)]]