"""LLM analysis chapter (ANÁLISIS LLM) — the interpretive layer, next to overview.

Third reference chapter for AutomaticEDA. Renders the ``llm`` block that the
``eda`` group function ``eda_llm_insights`` already produced and stored in the
``TableProfile`` — it does NOT call the LLM nor recompute anything. The block is
turned into clean, markdown-style document blocks so it reads as a real chapter
(table summary, row meaning, data dictionary, suggested analyses, cleaning
suggestions, PII findings) and, crucially, **nothing is ever cut** in PDF or
PPTX:

* Prose (summary, row meaning) → ``Markdown`` blocks the renderers wrap to whole
  lines, so no word is lost no matter how long the text is.
* The data dictionary and PII findings → ``DataTable`` blocks the paginator
  splits by rows (repeating the header) and whose long cells wrap inside their
  column — wide, multi-row tables never overflow a page/slide.
* Cleaning suggestions and suggested analyses → ``Markdown`` bullet lists; each
  item is a whole line the renderer wraps, never truncated mid-entry.

Position: this chapter is declared in ``chapters_registry.CHAPTER_ORDER`` right
after ``overview`` so the interpretation sits next to the table preview, as the
user asked ("it goes next to the overview"). The registry change that ships with
this module moves the ``"analisis_llm"`` entry from its previous slot after
``"modelos"`` to the slot directly after ``"overview"``.

Data source: the ``llm`` dict produced by ``eda_llm_insights`` (group ``eda``),
read from ``profile['llm']`` (or ``ctx['llm']`` as a fallback).
Shape::

    {
      "summary": str,                # what the table is, 2-3 sentences
      "row_meaning": str,            # what one row represents / granularity
      "dictionary": [ {"column","description","business_meaning","unit"} ],
      "pii": [ {"column","kind","severity"} ],
      "cleaning": [str],             # cleaning / transformation suggestions
      "analyses": [str],             # suggested questions / analyses / hypotheses
    }

Contract: build_analisis_llm(profile, ctx) -> Chapter | None ;
CHAPTER_VERSION = "x.y.z".
Reads everything defensively (``.get``) and returns ``None`` when the profile
carries no usable LLM block (e.g. ``profile_table`` ran without ``run_llm``), so
the chapter is simply omitted from the document.
"""

from __future__ import annotations

from .. import model

CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "analisis_llm"
CHAPTER_TITLE = "Análisis LLM"

# Key under which eda_llm_insights stores its interpretive block in the profile.
LLM_KEY = "llm"

# Placeholder shown in the first ("Columna") cell when an entry names no column.
_NO_COLUMN = "—"


def _clean_text(value) -> str:
    """Coerce a value to a single trimmed line (collapse inner newlines).

    Used for bullet items and table cells so each one stays a single unit the
    renderer wraps; never drops content, only normalizes whitespace.
    """
    # str.split() without arguments discards leading/trailing whitespace and
    # treats every whitespace run (newlines included) as one separator, so a
    # blank input naturally yields "".
    return " ".join(model._safe_str(value).split())


def _para(value) -> str:
    """Coerce a value to trimmed prose, preserving paragraph breaks.

    Runs of spaces/tabs inside a line are collapsed to one space, and runs of
    blank lines are collapsed to a single blank line.
    """
    text = model._safe_str(value).strip()
    if not text:
        return ""
    kept: list[str] = []
    for raw_line in text.splitlines():
        line = " ".join(raw_line.split())
        # Keep a blank line only as a separator right after a non-blank line.
        if line or (kept and kept[-1]):
            kept.append(line)
    return "\n".join(kept).strip()


def _bullets(items) -> str:
    """Build a markdown bullet list from a sequence of strings.

    Each item becomes one ``- ...`` line (a whole, wrappable unit). A bare
    string is treated as a one-item list; empty items are skipped; anything
    that is not a str, list or tuple yields "".
    """
    if isinstance(items, str):
        items = [items]
    if not isinstance(items, (list, tuple)):
        return ""
    cleaned = (_clean_text(item) for item in items)
    return "\n".join(f"- {text}" for text in cleaned if text)


def _prose_section(title: str, value) -> list:
    """Return [Heading, Markdown] for a prose field, or [] when it is empty."""
    text = _para(value)
    if not text:
        return []
    return [model.Heading(text=title, level=2), model.Markdown(text=text)]


def _bullet_section(title: str, items) -> list:
    """Return [Heading, Markdown] for a bullet-list field, or [] when empty."""
    bullets = _bullets(items)
    if not bullets:
        return []
    return [model.Heading(text=title, level=2), model.Markdown(text=bullets)]


def _table_rows(entries, keys, bare_index: int) -> list:
    """Turn a list of LLM entries into table rows, one cell per key.

    Args:
        entries: Value read from the LLM block; anything other than a
            list/tuple produces no rows.
        keys: Dict keys to read from each entry, in column order. The first
            key is the column-name cell.
        bare_index: Cell that receives the text of an entry that is not a
            dict (e.g. a bare string), so such entries are shown, not lost.

    Returns:
        A list of rows (lists of str). Entries whose cells are all empty are
        skipped so the table never shows a content-free row; an empty first
        cell is replaced by a dash placeholder.
    """
    if not isinstance(entries, (list, tuple)):
        return []
    rows = []
    for entry in entries:
        if isinstance(entry, dict):
            cells = [_clean_text(entry.get(key)) for key in keys]
        else:
            cells = [""] * len(keys)
            cells[bare_index] = _clean_text(entry)
        if not any(cells):
            continue  # nothing to show for this entry
        cells[0] = cells[0] or _NO_COLUMN
        rows.append(cells)
    return rows


def _summary_blocks(llm: dict) -> list:
    """Heading + prose for the table summary, or [] if absent."""
    return _prose_section("Resumen de la tabla", llm.get("summary"))


def _row_meaning_blocks(llm: dict) -> list:
    """Heading + prose for what one row represents, or [] if absent."""
    return _prose_section("Significado de una fila", llm.get("row_meaning"))


def _dictionary_block(llm: dict):
    """DataTable for the data dictionary, or None if absent/empty.

    Columns: Columna / Descripción / Significado de negocio / Unidad. A bare
    (non-dict) entry is shown as a description row with a dash in the column
    cell. Entries with no content at all are skipped, and the table is omitted
    when no row remains.
    """
    rows = _table_rows(
        llm.get("dictionary"),
        ("column", "description", "business_meaning", "unit"),
        bare_index=1,
    )
    if not rows:
        return None
    header = ["Columna", "Descripción", "Significado de negocio", "Unidad"]
    return model.DataTable(header=header, rows=rows, title="Diccionario de datos")


def _analyses_blocks(llm: dict) -> list:
    """Heading + bullet list of suggested analyses, or [] if absent."""
    return _bullet_section("Análisis sugeridos", llm.get("analyses"))


def _cleaning_blocks(llm: dict) -> list:
    """Heading + bullet list of cleaning suggestions, or [] if absent."""
    return _bullet_section("Limpieza sugerida", llm.get("cleaning"))


def _pii_block(llm: dict):
    """DataTable for PII/GDPR findings, or None if absent/empty.

    Columns: Columna / Tipo / Severidad. A bare (non-dict) entry is kept as a
    row whose column cell carries its text, instead of being dropped, so a
    finding reported as a plain string is still listed. Entries with no
    content at all are skipped.
    """
    rows = _table_rows(llm.get("pii"), ("column", "kind", "severity"), bare_index=0)
    if not rows:
        return None
    return model.DataTable(
        header=["Columna", "Tipo", "Severidad"],
        rows=rows,
        title="Datos personales (PII / RGPD)",
        note="detección automática orientativa — revisar antes de tratar los datos",
    )


def _llm_from(source) -> dict | None:
    """Return the non-empty ``llm`` dict stored in *source*, else None.

    *source* may be None or any object; only objects exposing a callable
    ``get`` are queried, so an unexpected profile/ctx type does not raise
    ``AttributeError`` here.
    """
    getter = getattr(source, "get", None)
    if not callable(getter):
        return None
    candidate = getter(LLM_KEY)
    if isinstance(candidate, dict) and candidate:
        return candidate
    return None


def build_analisis_llm(profile: dict, ctx: dict):
    """Build the LLM analysis Chapter, or None if there is no LLM block.

    Consumes ``profile['llm']`` (the block produced by ``eda_llm_insights``,
    group ``eda``). Falls back to ``ctx['llm']`` when the profile has no LLM
    block or only an empty one, so an empty placeholder in the profile does
    not hide a real block in the context.

    Args:
        profile: Table profile mapping; may be None.
        ctx: Build context mapping; may be None.

    Returns:
        A ``model.Chapter`` with the sections that have content, in the order
        summary, row meaning, data dictionary, suggested analyses, cleaning
        suggestions, PII findings; or ``None`` when no LLM block is present or
        it carries no usable content, so the chapter is omitted rather than
        rendered empty.
    """
    llm = _llm_from(profile) or _llm_from(ctx)
    if llm is None:
        return None

    blocks: list = []
    blocks += _summary_blocks(llm)
    blocks += _row_meaning_blocks(llm)

    dict_block = _dictionary_block(llm)
    if dict_block is not None:
        blocks.append(model.Heading(text="Diccionario de datos", level=2))
        blocks.append(dict_block)

    blocks += _analyses_blocks(llm)
    blocks += _cleaning_blocks(llm)

    pii_block = _pii_block(llm)
    if pii_block is not None:
        blocks.append(model.Heading(text="Datos personales (PII / RGPD)", level=2))
        blocks.append(pii_block)

    if not blocks:
        return None  # LLM block present but every field empty → omit chapter.

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)