merge: 4b analisis_llm — dedup Diccionario de datos + Datos personales (verificado met)

2026-06-30 18:14:17 +02:00
parent 32054ad781 7045f37554
commit 43821ab11d
2 changed files with 58 additions and 5 deletions
@@ -42,7 +42,11 @@ from __future__ import annotations

 from .. import model

-CHAPTER_VERSION = "1.0.0"
+# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables
+# no longer carry a ``title`` (the section Heading labels them once, per the
+# OVERVIEW pattern in the contract). The data dictionary's third column header
+# already reads "Significado de negocio".
+CHAPTER_VERSION = "1.1.0"
 CHAPTER_ID = "analisis_llm"
 CHAPTER_TITLE = "Análisis LLM"

@@ -118,6 +122,11 @@ def _dictionary_block(llm: dict):
    Columns: Columna / Descripción / Significado de negocio / Unidad. The
    paginator splits this by rows repeating the header and wraps long cells, so a
    long dictionary (many columns) never gets cut.
+
+    The block carries **no** ``title``: the section is labelled once by the
+    ``Heading`` that ``build_analisis_llm`` appends right before it (the canonical
+    OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would
+    print "Diccionario de datos" twice in a row.
    """
    entries = llm.get("dictionary")
    if not isinstance(entries, (list, tuple)) or not entries:
@@ -137,7 +146,7 @@ def _dictionary_block(llm: dict):
        ])
    if not rows:
        return None
-    return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
+    return model.DataTable(header=header, rows=rows)


 def _analyses_blocks(llm: dict) -> list:
@@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list:


 def _pii_block(llm: dict):
-    """DataTable for PII/GDPR findings, or None if absent/empty."""
+    """DataTable for PII/GDPR findings, or None if absent/empty.
+
+    Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in
+    ``build_analisis_llm`` labels the section once); it keeps its ``note`` (the
+    "detection is only indicative" caveat), which the renderers print under the table.
+    """
    entries = llm.get("pii")
    if not isinstance(entries, (list, tuple)) or not entries:
        return None
@@ -176,7 +190,7 @@ def _pii_block(llm: dict):
    if not rows:
        return None
    return model.DataTable(
-        header=header, rows=rows, title="Datos personales (PII / RGPD)",
+        header=header, rows=rows,
        note="detección automática orientativa — revisar antes de tratar los datos")


@@ -24,7 +24,7 @@ from pptx import Presentation
 from datascience.automatic_eda.chapters.analisis_llm import (
    build_analisis_llm, CHAPTER_VERSION)
 from datascience.automatic_eda.chapters_registry import build_document
-from datascience.automatic_eda.model import Chapter, DataTable
+from datascience.automatic_eda.model import Chapter, DataTable, Heading
 from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
 from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx

@@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx():
        assert "DESCTOKEN" in ptx


+def test_sin_rotulos_duplicados_y_significado_de_negocio():
+    """The dictionary / PII sections must be labelled ONCE.
+
+    Regression for the duplicated 'Diccionario de datos' and 'Datos personales
+    (PII / RGPD)' headings (each section used to print its label twice: a Heading
+    plus the DataTable's own title). The fix drops the DataTable title and keeps
+    a single Heading — the OVERVIEW pattern. The data-dictionary column header is
+    also pinned to the exact text 'Significado de negocio'.
+    """
+    ch = build_analisis_llm(_profile(), {})
+    assert ch is not None
+
+    # Structure: each label appears as exactly one Heading; no DataTable has a title.
+    headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
+    assert headings.count("Diccionario de datos") == 1
+    assert headings.count("Datos personales (PII / RGPD)") == 1
+    for b in ch.blocks:
+        if isinstance(b, DataTable):
+            assert not b.title, f"DataTable should not duplicate the label: {b.title!r}"
+
+    # Pin the full dictionary header; column three must be 'Significado de negocio'.
+    dicts = [b for b in ch.blocks if isinstance(b, DataTable) and "Descripción" in b.header]
+    assert dicts, "expected the data-dictionary DataTable"
+    assert dicts[0].header == ["Columna", "Descripción", "Significado de negocio", "Unidad"]
+
+    # The PII table (found by its exact header) keeps its 'orientativa' caveat note.
+    pii = [b for b in ch.blocks if isinstance(b, DataTable) and b.header == ["Columna", "Tipo", "Severidad"]]
+    assert pii and pii[0].note and "orientativa" in pii[0].note
+
+    # Render: each label appears exactly once in the extracted PDF text. This
+    # assumes this chapter is the only one in the document emitting either label.
+    with tempfile.TemporaryDirectory() as d:
+        out_pdf = os.path.join(d, "eda.pdf")
+        render_automatic_eda_pdf(_profile(), out_pdf, {"title": "EDA — ventas"})
+        txt = _pdf_text(out_pdf)
+        assert txt.count("Diccionario de datos") == 1
+        assert txt.count("Datos personales") == 1
+
+
 def test_orden_capitulo_junto_a_overview():
    chapters = build_document(_profile(), {})
    ids = [c.id for c in chapters]