diff --git a/python/functions/datascience/automatic_eda/chapters/analisis_llm.py b/python/functions/datascience/automatic_eda/chapters/analisis_llm.py index e182e6a0..f1a8511c 100644 --- a/python/functions/datascience/automatic_eda/chapters/analisis_llm.py +++ b/python/functions/datascience/automatic_eda/chapters/analisis_llm.py @@ -42,7 +42,11 @@ from __future__ import annotations from .. import model -CHAPTER_VERSION = "1.0.0" +# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables +# no longer carry a ``title`` (the section Heading labels them once, per the +# OVERVIEW pattern in the contract). The data-dictionary column already reads +# "Significado de negocio". +CHAPTER_VERSION = "1.1.0" CHAPTER_ID = "analisis_llm" CHAPTER_TITLE = "Análisis LLM" @@ -118,6 +122,11 @@ def _dictionary_block(llm: dict): Columns: Columna / Descripción / Significado de negocio / Unidad. The paginator splits this by rows repeating the header and wraps long cells, so a long dictionary (many columns) never gets cut. + + The block carries **no** ``title``: the section is labelled once by the + ``Heading`` that ``build_analisis_llm`` appends right before it (the canonical + OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would + print "Diccionario de datos" twice in a row. """ entries = llm.get("dictionary") if not isinstance(entries, (list, tuple)) or not entries: @@ -137,7 +146,7 @@ def _dictionary_block(llm: dict): ]) if not rows: return None - return model.DataTable(header=header, rows=rows, title="Diccionario de datos") + return model.DataTable(header=header, rows=rows) def _analyses_blocks(llm: dict) -> list: @@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list: def _pii_block(llm: dict): - """DataTable for PII/GDPR findings, or None if absent/empty.""" + """DataTable for PII/GDPR findings, or None if absent/empty. + + Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in + ``build_analisis_llm`` labels the section once); it keeps its ``note`` with + the orientative-detection caveat, which the renderers print under the table. + """ entries = llm.get("pii") if not isinstance(entries, (list, tuple)) or not entries: return None @@ -176,7 +190,7 @@ def _pii_block(llm: dict): if not rows: return None return model.DataTable( - header=header, rows=rows, title="Datos personales (PII / RGPD)", + header=header, rows=rows, note="detección automática orientativa — revisar antes de tratar los datos") diff --git a/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py b/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py index 2b32470a..56b884e5 100644 --- a/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py +++ b/python/functions/datascience/automatic_eda/chapters/analisis_llm_test.py @@ -24,7 +24,7 @@ from pptx import Presentation from datascience.automatic_eda.chapters.analisis_llm import ( build_analisis_llm, CHAPTER_VERSION) from datascience.automatic_eda.chapters_registry import build_document -from datascience.automatic_eda.model import Chapter, DataTable +from datascience.automatic_eda.model import Chapter, DataTable, Heading from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx @@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx(): assert "DESCTOKEN" in ptx +def test_sin_rotulos_duplicados_y_significado_de_negocio(): + """The dictionary / PII sections must be labelled ONCE. + + Regression for the duplicated 'Diccionario de datos' and 'Datos personales + (PII / RGPD)' headings (each section used to print its label twice: a Heading + plus the DataTable's own title). The fix drops the DataTable title and keeps + a single Heading — the OVERVIEW pattern. The data-dictionary column header is + also pinned to the exact text 'Significado de negocio'. + """ + ch = build_analisis_llm(_profile(), {}) + assert ch is not None + + # Structure: section labels come from Headings; tables carry no title. + headings = [b.text for b in ch.blocks if isinstance(b, Heading)] + assert headings.count("Diccionario de datos") == 1 + assert headings.count("Datos personales (PII / RGPD)") == 1 + for b in ch.blocks: + if isinstance(b, DataTable): + assert not b.title, f"DataTable should not duplicate the label: {b.title!r}" + + # The data dictionary's third column reads exactly 'Significado de negocio'. + dicts = [b for b in ch.blocks if isinstance(b, DataTable) and "Descripción" in b.header] + assert dicts, "expected the data-dictionary DataTable" + assert dicts[0].header == ["Columna", "Descripción", "Significado de negocio", "Unidad"] + + # The PII table keeps its orientative-detection note. + pii = [b for b in ch.blocks if isinstance(b, DataTable) and b.header == ["Columna", "Tipo", "Severidad"]] + assert pii and pii[0].note and "orientativa" in pii[0].note + + # Render: each label appears exactly once across the whole document (the only + # 'Diccionario de datos' / 'Datos personales' producer is this chapter). + with tempfile.TemporaryDirectory() as d: + out_pdf = os.path.join(d, "eda.pdf") + render_automatic_eda_pdf(_profile(), out_pdf, {"title": "EDA — ventas"}) + txt = _pdf_text(out_pdf) + assert txt.count("Diccionario de datos") == 1 + assert txt.count("Datos personales") == 1 + + def test_orden_capitulo_junto_a_overview(): chapters = build_document(_profile(), {}) ids = [c.id for c in chapters]