Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d412522db9 |
@@ -1,221 +0,0 @@
|
||||
"""LLM analysis chapter (ANÁLISIS LLM) — the interpretive layer, next to overview.
|
||||
|
||||
Third reference chapter for AutomaticEDA. Renders the ``llm`` block that the
|
||||
``eda`` group function ``eda_llm_insights`` already produced and stored in the
|
||||
``TableProfile`` — it does NOT call the LLM nor recompute anything. The block is
|
||||
turned into clean, markdown-style document blocks so it reads as a real chapter
|
||||
(table summary, row meaning, data dictionary, suggested analyses, cleaning
|
||||
suggestions, PII findings) and, crucially, **nothing is ever cut** in PDF or
|
||||
PPTX:
|
||||
|
||||
* Prose (summary, row meaning) → ``Markdown`` blocks the renderers wrap to whole
|
||||
lines, so no word is lost no matter how long the text is.
|
||||
* The data dictionary and PII findings → ``DataTable`` blocks the paginator
|
||||
splits by rows (repeating the header) and whose long cells wrap inside their
|
||||
column — wide, multi-row tables never overflow a page/slide.
|
||||
* Cleaning suggestions and suggested analyses → ``Markdown`` bullet lists; each
|
||||
item is a whole line the renderer wraps, never truncated mid-entry.
|
||||
|
||||
Position: this chapter is declared in ``chapters_registry.CHAPTER_ORDER`` right
|
||||
after ``overview`` so the interpretation sits next to the table preview, as the
|
||||
user asked ("va junto al overview").
|
||||
|
||||
Data source: the ``llm`` dict produced by ``eda_llm_insights`` (group ``eda``),
|
||||
read from ``profile['llm']`` (or ``ctx['llm']`` as a fallback). Shape::
|
||||
|
||||
{
|
||||
"summary": str, # what the table is, 2-3 sentences
|
||||
"row_meaning": str, # what one row represents / granularity
|
||||
"dictionary": [ {"column","description","business_meaning","unit"} ],
|
||||
"pii": [ {"column","kind","severity"} ],
|
||||
"cleaning": [str], # cleaning / transformation suggestions
|
||||
"analyses": [str], # suggested questions / analyses / hypotheses
|
||||
}
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
Reads everything defensively (``.get``) and NEVER raises; returns ``None`` when
|
||||
the profile carries no LLM block (e.g. ``profile_table`` ran without
|
||||
``run_llm``), so the chapter is simply omitted from the document.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "analisis_llm"
|
||||
CHAPTER_TITLE = "Análisis LLM"
|
||||
|
||||
# Key under which eda_llm_insights stores its interpretive block in the profile.
|
||||
LLM_KEY = "llm"
|
||||
|
||||
|
||||
def _clean_text(value) -> str:
    """Return ``value`` as one whitespace-normalized line.

    The value is stringified with ``model._safe_str``; every run of whitespace
    (newlines included) becomes a single space and both ends are trimmed, so a
    bullet item or a table cell stays on one logical line. No words are
    removed — only whitespace is normalized.
    """
    # ``str.split()`` without arguments ignores leading/trailing whitespace and
    # yields nothing for blank input, so the join also covers the empty case.
    return " ".join(model._safe_str(value).split())
|
||||
|
||||
|
||||
def _para(value) -> str:
    """Return ``value`` as trimmed prose that keeps its paragraph breaks.

    Inside each line, runs of spaces/tabs collapse to one space. Blank lines
    are kept as paragraph separators, but consecutive blank lines collapse to
    a single one and leading blank lines are dropped.
    """
    raw = model._safe_str(value).strip()
    if not raw:
        return ""
    kept = []
    for line in raw.splitlines():
        squeezed = " ".join(line.split())
        # A blank line is only kept when it follows a non-blank one.
        if not squeezed and (not kept or kept[-1] == ""):
            continue
        kept.append(squeezed)
    return "\n".join(kept).strip()
|
||||
|
||||
|
||||
def _bullets(items) -> str:
    """Render a sequence of strings as a markdown bullet list.

    A bare string is treated as a one-item list; anything that is neither a
    string, list nor tuple yields ``""``. Each non-empty item (after
    ``_clean_text`` normalization) becomes one ``- ...`` line.
    """
    if isinstance(items, str):
        items = [items]
    elif not isinstance(items, (list, tuple)):
        return ""
    cleaned = (_clean_text(entry) for entry in items)
    return "\n".join(f"- {text}" for text in cleaned if text)
|
||||
|
||||
|
||||
def _summary_blocks(llm: dict) -> list:
    """Return ``[Heading, Markdown]`` for the table summary, or ``[]``.

    Reads ``llm["summary"]`` through ``_para``; an empty result means the
    section is omitted entirely.
    """
    summary = _para(llm.get("summary"))
    if summary:
        return [
            model.Heading(text="Resumen de la tabla", level=2),
            model.Markdown(text=summary),
        ]
    return []
|
||||
|
||||
|
||||
def _row_meaning_blocks(llm: dict) -> list:
    """Return ``[Heading, Markdown]`` describing one row, or ``[]``.

    Reads ``llm["row_meaning"]`` through ``_para``; an empty result means the
    section is omitted entirely.
    """
    meaning = _para(llm.get("row_meaning"))
    if meaning:
        return [
            model.Heading(text="Significado de una fila", level=2),
            model.Markdown(text=meaning),
        ]
    return []
|
||||
|
||||
|
||||
def _dictionary_block(llm: dict):
    """Build the data-dictionary ``DataTable``, or ``None`` when absent/empty.

    Reads ``llm["dictionary"]``, expected to be a non-empty list/tuple. Dict
    entries contribute their ``column`` / ``description`` /
    ``business_meaning`` / ``unit`` values (a missing column name becomes
    ``"—"``); any non-dict entry is tolerated and shown as a description-only
    row.
    """
    entries = llm.get("dictionary")
    if not (isinstance(entries, (list, tuple)) and entries):
        return None

    detail_keys = ("description", "business_meaning", "unit")
    rows = []
    for entry in entries:
        if isinstance(entry, dict):
            name = _clean_text(entry.get("column")) or "—"
            rows.append([name, *(_clean_text(entry.get(k)) for k in detail_keys)])
        else:
            # Be tolerant: a bare string still shows up as a description row.
            rows.append(["—", _clean_text(entry), "", ""])

    # Every entry of the non-empty sequence produced a row, so ``rows`` cannot
    # be empty at this point.
    return model.DataTable(
        header=["Columna", "Descripción", "Significado de negocio", "Unidad"],
        rows=rows,
        title="Diccionario de datos",
    )
|
||||
|
||||
|
||||
def _analyses_blocks(llm: dict) -> list:
    """Return ``[Heading, Markdown]`` with the suggested analyses, or ``[]``.

    ``llm["analyses"]`` is rendered as a markdown bullet list by ``_bullets``;
    when that yields nothing the section is omitted.
    """
    listing = _bullets(llm.get("analyses"))
    if listing:
        return [
            model.Heading(text="Análisis sugeridos", level=2),
            model.Markdown(text=listing),
        ]
    return []
|
||||
|
||||
|
||||
def _cleaning_blocks(llm: dict) -> list:
    """Return ``[Heading, Markdown]`` with the cleaning suggestions, or ``[]``.

    ``llm["cleaning"]`` is rendered as a markdown bullet list by ``_bullets``;
    when that yields nothing the section is omitted.
    """
    listing = _bullets(llm.get("cleaning"))
    if listing:
        return [
            model.Heading(text="Limpieza sugerida", level=2),
            model.Markdown(text=listing),
        ]
    return []
|
||||
|
||||
|
||||
def _pii_block(llm: dict):
    """Build the PII/GDPR findings ``DataTable``, or ``None`` when absent/empty.

    Reads ``llm["pii"]``, expected to be a non-empty list/tuple of dicts with
    ``column`` / ``kind`` / ``severity``. Non-dict entries are skipped; if no
    usable entry remains the block is omitted.
    """
    entries = llm.get("pii")
    if not (isinstance(entries, (list, tuple)) and entries):
        return None

    rows = [
        [
            _clean_text(finding.get("column")) or "—",
            _clean_text(finding.get("kind")),
            _clean_text(finding.get("severity")),
        ]
        for finding in entries
        if isinstance(finding, dict)
    ]
    if not rows:
        return None

    return model.DataTable(
        header=["Columna", "Tipo", "Severidad"],
        rows=rows,
        title="Datos personales (PII / RGPD)",
        note="detección automática orientativa — revisar antes de tratar los datos",
    )
|
||||
|
||||
|
||||
def build_analisis_llm(profile: dict, ctx: dict):
    """Build the LLM analysis Chapter, or None if there is no LLM block.

    Consumes ``profile['llm']`` (the block produced by ``eda_llm_insights``,
    group ``eda``); falls back to ``ctx['llm']`` when the profile entry is not
    a dict. Returns ``None`` when no LLM block is present or it carries no
    usable content, so the chapter is omitted rather than rendering an empty
    section.

    Args:
        profile: the table profile; ``None`` or any non-dict value is treated
            as an empty profile.
        ctx: the build context; ``None`` or any non-dict value is treated as
            an empty context.

    Returns:
        A ``model.Chapter`` with the rendered blocks, or ``None``.
    """
    # Guard by type, not truthiness: a truthy non-dict (e.g. a string) would
    # otherwise reach ``.get`` and raise, breaking the never-raises contract.
    profile = profile if isinstance(profile, dict) else {}
    ctx = ctx if isinstance(ctx, dict) else {}

    llm = profile.get(LLM_KEY)
    if not isinstance(llm, dict):
        llm = ctx.get(LLM_KEY)
    if not isinstance(llm, dict) or not llm:
        return None

    blocks: list = []
    blocks += _summary_blocks(llm)
    blocks += _row_meaning_blocks(llm)

    dict_block = _dictionary_block(llm)
    if dict_block is not None:
        blocks.append(model.Heading(text="Diccionario de datos", level=2))
        blocks.append(dict_block)

    blocks += _analyses_blocks(llm)
    blocks += _cleaning_blocks(llm)

    pii_block = _pii_block(llm)
    if pii_block is not None:
        blocks.append(model.Heading(text="Datos personales (PII / RGPD)", level=2))
        blocks.append(pii_block)

    if not blocks:
        return None  # LLM block present but every field empty → omit chapter.

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -1,190 +0,0 @@
|
||||
"""Tests for the ANÁLISIS LLM chapter — DoD: golden + edges + anti-cut.
|
||||
|
||||
Self-contained: builds a synthetic TableProfile carrying an ``llm`` block (the
|
||||
shape ``eda_llm_insights`` produces) so the suite is fast and deterministic — no
|
||||
DuckDB and no LLM call. Verifies:
|
||||
|
||||
* golden — ``build_analisis_llm`` yields the chapter and the full document
|
||||
renders to PDF *and* PPTX with the summary, a suggested analysis, a cleaning
|
||||
suggestion and a dictionary column all present;
|
||||
* order — the chapter sits immediately after ``overview`` (user requirement);
|
||||
* edges — a profile with no ``llm`` block (or None/empty/malformed) returns
|
||||
``None`` and never raises;
|
||||
* anti-cut — a long dictionary (40 rows) and a 150-char cleaning suggestion are
|
||||
rendered to PDF and PPTX without losing a single row or word.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.chapters.analisis_llm import (
|
||||
build_analisis_llm, CHAPTER_VERSION)
|
||||
from datascience.automatic_eda.chapters_registry import build_document
|
||||
from datascience.automatic_eda.model import Chapter, DataTable
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _profile() -> dict:
|
||||
return {
|
||||
"table": "ventas",
|
||||
"source": "/data/ventas.csv",
|
||||
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||
"n_rows": 1000,
|
||||
"n_cols": 2,
|
||||
"quality_score": 92.5,
|
||||
"columns": [
|
||||
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"null_count": 0,
|
||||
"numeric": {"mean": 42.5, "median": 40.0, "min": 1.0,
|
||||
"max": 100.0, "std": 12.3}},
|
||||
{"name": "categoria", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "null_count": 0,
|
||||
"categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
|
||||
],
|
||||
"llm": {
|
||||
"summary": "Tabla de ventas por producto. Token SUMMARYTOKEN.",
|
||||
"row_meaning": "Cada fila es una venta. Token ROWTOKEN.",
|
||||
"dictionary": [
|
||||
{"column": "precio", "description": "Precio unitario DESCTOKEN",
|
||||
"business_meaning": "Ingreso por unidad", "unit": "EUR"},
|
||||
{"column": "categoria", "description": "Familia de producto",
|
||||
"business_meaning": "Segmento comercial", "unit": ""},
|
||||
],
|
||||
"pii": [{"column": "categoria", "kind": "ninguno", "severity": "low"}],
|
||||
"cleaning": ["Quitar nulos de precio CLEANTOKEN",
|
||||
"Normalizar mayusculas en categoria"],
|
||||
"analyses": ["Estudiar relacion precio-categoria ANALYSISTOKEN",
|
||||
"Detectar outliers de precio"],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path`` as one line.

    Pages are joined with a space so the last word of one page and the first
    word of the next are never fused into a single token; all whitespace runs
    are then collapsed to a single space.
    """
    pages = PdfReader(path).pages
    txt = " ".join((pg.extract_text() or "") for pg in pages)
    return re.sub(r"\s+", " ", txt)
|
||||
|
||||
|
||||
def _pptx_text(path: str) -> str:
    """Return all text found in the PPTX at ``path`` as one line.

    Collects the text of every text frame and of every table cell (row by
    row), joins the pieces with spaces and collapses whitespace runs.
    """
    pieces = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                pieces.append(shape.text_frame.text)
            if shape.has_table:
                for row in shape.table.rows:
                    pieces.extend(cell.text for cell in row.cells)
    return re.sub(r"\s+", " ", " ".join(pieces))
|
||||
|
||||
|
||||
def test_golden_build_y_render_pdf_pptx():
    """Golden path: the chapter builds and its content reaches PDF and PPTX."""
    profile = _profile()
    chapter = build_analisis_llm(profile, {})
    assert chapter is not None
    assert chapter.id == "analisis_llm"
    assert chapter.version == CHAPTER_VERSION
    assert chapter.blocks  # non-empty.

    # The user's required content: summary, suggested analyses, cleaning,
    # plus one data-dictionary cell.
    tokens = ("SUMMARYTOKEN", "ANALYSISTOKEN", "CLEANTOKEN", "DESCTOKEN")

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        pdf_result = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA — ventas"})
        assert pdf_result["path"] == pdf_path and os.path.exists(pdf_path)
        assert "analisis_llm" in [c["id"] for c in pdf_result["chapters"]]
        pdf_text = _pdf_text(pdf_path)
        for token in tokens:
            assert token in pdf_text

        pptx_path = os.path.join(workdir, "eda.pptx")
        pptx_result = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA — ventas"})
        assert pptx_result["path"] == pptx_path and os.path.exists(pptx_path)
        assert "analisis_llm" in [c["id"] for c in pptx_result["chapters"]]
        pptx_text = _pptx_text(pptx_path)
        for token in tokens:
            assert token in pptx_text
|
||||
|
||||
|
||||
def test_orden_capitulo_junto_a_overview():
    """The LLM chapter must sit immediately after ``overview``."""
    chapter_ids = [chapter.id for chapter in build_document(_profile(), {})]
    assert "overview" in chapter_ids and "analisis_llm" in chapter_ids
    # User requirement: the LLM chapter sits right after overview.
    overview_pos = chapter_ids.index("overview")
    assert chapter_ids.index("analisis_llm") == overview_pos + 1
|
||||
|
||||
|
||||
def test_edge_sin_llm_devuelve_none():
    """Missing, empty or malformed LLM blocks yield ``None`` without raising."""
    # No llm block at all.
    without_llm = dict(_profile())
    del without_llm["llm"]
    assert build_analisis_llm(without_llm, {}) is None

    # None / empty / malformed never raise and yield None.
    malformed_calls = (
        (None, None),
        ({}, {}),
        ({"llm": {}}, {}),
        ({"llm": "not-a-dict"}, {}),
    )
    for profile, ctx in malformed_calls:
        assert build_analisis_llm(profile, ctx) is None

    # All-empty fields → omitted (no blocks).
    all_empty = {"llm": {"summary": "", "dictionary": [], "cleaning": [],
                         "analyses": [], "pii": [], "row_meaning": ""}}
    assert build_analisis_llm(all_empty, {}) is None
|
||||
|
||||
|
||||
def test_edge_llm_via_ctx_fallback():
    """The LLM block is picked up from ``ctx`` when the profile has none."""
    # The block may arrive in ctx instead of the profile.
    profile = {key: val for key, val in _profile().items() if key != "llm"}
    context = {"llm": {"summary": "Resumen via ctx CTXTOKEN."}}
    chapter = build_analisis_llm(profile, context)
    assert chapter is not None and chapter.id == "analisis_llm"
|
||||
|
||||
|
||||
def test_anti_cortes_diccionario_largo_y_limpieza_larga():
    """A 40-row dictionary and a long cleaning line render without cuts."""
    long_clean = ("Lorem ipsum dolor sit amet consectetur adipiscing elit sed do "
                  "eiusmod tempor incididunt ut labore et dolore magna aliqua "
                  "reprehenderit voluptate velit esse cillum dolore")
    dictionary = []
    for i in range(40):
        dictionary.append({
            "column": f"col_{i}",
            "description": f"Descripcion larga numero {i} con bastante texto para "
                           f"forzar el wrap dentro de la celda fila{i}",
            "business_meaning": f"Significado de negocio {i}",
            "unit": "u",
        })
    profile = {
        "table": "t", "n_rows": 1, "n_cols": 1, "columns": [],
        "llm": {"summary": "S", "dictionary": dictionary,
                "cleaning": [long_clean], "analyses": ["A"]},
    }
    chapter = build_analisis_llm(profile, {})
    assert chapter is not None

    # Structure: the dictionary DataTable keeps ALL 40 rows — none dropped on
    # construction (the renderers then split it by rows, repeating the header).
    row_counts = [len(b.rows) for b in chapter.blocks if isinstance(b, DataTable)]
    assert 40 in row_counts

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "x.pdf")
        render_automatic_eda_pdf([chapter], pdf_path, {"write_manifest": False})
        # 40 wide rows + a long cleaning line cannot fit one page → it spills,
        # which is exactly the no-cut behaviour (paginate, never truncate).
        assert len(PdfReader(pdf_path).pages) > 1
        pdf_text = _pdf_text(pdf_path)
        # The long cleaning suggestion is wrapped word-by-word, not truncated.
        for word in ("Lorem", "incididunt", "reprehenderit", "voluptate", "cillum"):
            assert word in pdf_text

        pptx_path = os.path.join(workdir, "x.pptx")
        pptx_result = render_automatic_eda_pptx(
            [chapter], pptx_path, {"write_manifest": False})
        assert pptx_result["n_slides"] > 1  # table + long text spill across slides.
        pptx_text = _pptx_text(pptx_path)
        for word in ("Lorem", "reprehenderit", "voluptate"):
            assert word in pptx_text
|
||||
@@ -0,0 +1,266 @@
|
||||
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
||||
|
||||
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
||||
chapter answers, in Spanish and as tables, the three things the user asked for:
|
||||
|
||||
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
|
||||
their weights (completeness, validity, consistency) before any number, plus a
|
||||
table-level summary (global score and aggregates).
|
||||
2. **Scores por columna** — a table with, per column, the total quality score and
|
||||
its breakdown into completeness / validity / consistency.
|
||||
3. **Problemas en español** — a second table listing, per column, the readable
|
||||
issues in Spanish (kept separate from the type ``flags``).
|
||||
|
||||
The breakdown and the issues are NOT recomputed here: they come from the registry
|
||||
function ``column_quality_score`` (group ``eda``), which already derives
|
||||
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
|
||||
This chapter is render-only — it consumes that function and lays the result out
|
||||
as model blocks; the renderers paginate tables (splitting by rows, repeating the
|
||||
header) and wrap long cells so nothing is ever cut.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
# Reuse the registry's pure quality function (group ``eda``). Import defensively:
|
||||
# if the package cannot be imported for any reason the chapter degrades to the
|
||||
# per-column ``quality_score`` already present in the profile instead of failing.
|
||||
try: # pragma: no cover - import wiring
|
||||
from ...column_quality_score import column_quality_score as _column_quality_score
|
||||
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
||||
_column_quality_score = None
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "calidad"
|
||||
CHAPTER_TITLE = "Calidad"
|
||||
|
||||
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
|
||||
# consistency 0.2. Kept here only to render the human explanation; the actual
|
||||
# numbers always come from the function so the two never drift in computation.
|
||||
_CRITERIA_INTRO = (
|
||||
"La calidad de cada columna es un score de 0 a 100 que combina tres "
|
||||
"criterios, cada uno con un peso:\n\n"
|
||||
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
|
||||
"ni vacíos). Una columna con muchos nulos baja de score.\n"
|
||||
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
|
||||
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
|
||||
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
|
||||
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
|
||||
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
|
||||
"Los problemas detectados por columna se listan en español más abajo."
|
||||
)
|
||||
|
||||
# Cap for the joined issues cell so a single row never grows taller than a page;
|
||||
# the remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||
_ISSUES_MAXLEN = 160
|
||||
|
||||
|
||||
def _fmt_score(value) -> str:
|
||||
"""Format a 0-100 score as ``NN / 100`` (or a placeholder)."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
text = f"{num:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text} / 100"
|
||||
|
||||
|
||||
def _fmt_unit_pct(value) -> str:
|
||||
"""Format a 0-1 fraction as a percentage (``95%``)."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value) * 100:.0f}%"
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _quality_of(col: dict) -> dict:
    """Return the quality dict for one column, never raising.

    When the registry function ``column_quality_score`` was imported, its
    result is returned as long as it is a dict. If the function is missing,
    raises, or returns something else, the fallback dict is built from the
    column's pre-computed ``quality_score`` with an empty breakdown
    (``completeness`` / ``validity`` / ``consistency`` set to ``None``) and no
    issues. A non-dict ``col`` is treated as an empty column.
    """
    column = col if isinstance(col, dict) else {}
    scorer = _column_quality_score
    if scorer is not None:
        try:
            computed = scorer(column)
        except Exception:  # noqa: BLE001 - degrade instead of aborting.
            computed = None
        if isinstance(computed, dict):
            return computed
    # Fallback: only the final score is available pre-computed in the profile.
    fallback = {"score": column.get("quality_score")}
    for criterion in ("completeness", "validity", "consistency"):
        fallback[criterion] = None
    fallback["issues"] = []
    return fallback
|
||||
|
||||
|
||||
def _join_issues(issues) -> str:
    """Join issue strings into one ``"; "``-separated cell, capping its length.

    Blank items are dropped. The first issue is always kept; each further
    issue is added only while the running length (counting the two-character
    separator) stays within ``_ISSUES_MAXLEN``. Once an issue would exceed the
    cap, the rest are replaced by a final ``(+N más)`` marker, where N counts
    the issues left out. Returns ``""`` for a non-list/tuple or empty input.
    """
    if not isinstance(issues, (list, tuple)) or not issues:
        return ""
    texts = [t for t in (model._safe_str(item).strip() for item in issues) if t]
    if not texts:
        return ""

    kept = [texts[0]]
    length = len(texts[0])
    for position, text in enumerate(texts[1:], start=1):
        cost = len(text) + 2  # the issue plus its "; " separator
        if length + cost > _ISSUES_MAXLEN:
            kept.append(f"(+{len(texts) - position} más)")
            break
        kept.append(text)
        length += cost
    return "; ".join(kept)
|
||||
|
||||
|
||||
def _columns_with_quality(profile: dict):
    """Yield ``(column, quality)`` pairs for the dict entries of ``columns``.

    Entries of ``profile["columns"]`` that are not dicts are skipped; a
    missing or empty ``columns`` value yields nothing.
    """
    yield from (
        (column, _quality_of(column))
        for column in (profile.get("columns") or [])
        if isinstance(column, dict)
    )
|
||||
|
||||
|
||||
def _summary_block(profile: dict, evaluated: list):
    """Build the table-level ``KVTable`` with the quality summary.

    Rows, in order: the global score, the number of evaluated columns, the
    mean of each criterion (only when at least one column has a numeric
    value for it), the number of columns with issues, and — when present in
    the profile — duplicate rows, global null cells, and the counts of
    constant and all-null columns.
    """
    qualities = [quality for _, quality in evaluated]
    rows = [
        ("Calidad global", _fmt_score(profile.get("quality_score"))),
        ("Columnas evaluadas", str(len(evaluated))),
    ]

    criteria = (
        ("Completitud media", "completeness"),
        ("Validez media", "validity"),
        ("Consistencia media", "consistency"),
    )
    for label, key in criteria:
        numbers = [q.get(key) for q in qualities
                   if isinstance(q.get(key), (int, float))]
        if numbers:
            rows.append((label, _fmt_unit_pct(sum(numbers) / len(numbers))))

    with_issues = sum(1 for q in qualities if q.get("issues"))
    rows.append(("Columnas con problemas", str(with_issues)))

    # Extra table-wide quality signals already in the profile, when present.
    percentages = (
        ("Filas duplicadas", "duplicate_pct"),
        ("Celdas nulas (global)", "null_cell_pct"),
    )
    for label, key in percentages:
        value = profile.get(key)
        if value is not None:
            rows.append((label, _fmt_unit_pct_or_pct(value)))

    column_lists = (
        ("Columnas constantes", "constant_cols"),
        ("Columnas 100% nulas", "all_null_cols"),
    )
    for label, key in column_lists:
        names = profile.get(key)
        if isinstance(names, (list, tuple)) and names:
            rows.append((label, str(len(names))))

    return model.KVTable(rows=rows, title="Resumen de calidad")
|
||||
|
||||
|
||||
def _fmt_unit_pct_or_pct(value) -> str:
|
||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
pct = num * 100 if num <= 1.0 else num
|
||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text}%"
|
||||
|
||||
|
||||
def _scores_block(evaluated: list):
    """Build the per-column scores ``DataTable``, or ``None`` if there are no rows.

    Each row holds the column name, its total score and the completeness /
    validity / consistency breakdown. Rows are sorted by ascending score so
    the worst columns come first; a non-numeric score sorts last.
    """
    def sort_key(pair):
        score = pair[1].get("score")
        # 101.0 is above the 0-100 range, pushing unscored columns to the end.
        return score if isinstance(score, (int, float)) else 101.0

    rows = [
        [
            column.get("name") or "(col)",
            _fmt_score(quality.get("score")),
            _fmt_unit_pct(quality.get("completeness")),
            _fmt_unit_pct(quality.get("validity")),
            _fmt_unit_pct(quality.get("consistency")),
        ]
        for column, quality in sorted(evaluated, key=sort_key)
    ]
    if not rows:
        return None
    return model.DataTable(
        header=["Columna", "Calidad", "Completitud", "Validez", "Consistencia"],
        rows=rows,
        title="Scores de calidad por columna",
        note="0 = peor, 100 = mejor; ordenado de peor a mejor",
    )
|
||||
|
||||
|
||||
def _issues_block(evaluated: list):
    """Build the per-column issues ``DataTable``, or a ``Note`` if none exist.

    Only columns whose issues join into a non-empty cell get a row; when no
    column has issues, a note stating so is returned instead of a table.
    """
    rows = []
    for column, quality in evaluated:
        cell = _join_issues(quality.get("issues"))
        if not cell:
            continue
        rows.append([column.get("name") or "(col)", cell])

    if rows:
        return model.DataTable(
            header=["Columna", "Problemas detectados (español)"],
            rows=rows,
            title="Problemas de calidad por columna",
        )
    return model.Note(
        "No se detectaron problemas de calidad en las columnas evaluadas.")
|
||||
|
||||
|
||||
def build_calidad(profile: dict, ctx: dict):
    """Build the data-quality Chapter, or ``None`` if there are no columns.

    A ``None`` or non-dict profile is treated as empty. The chapter holds the
    criteria explanation, the table-level summary, the per-column scores
    table and the issues table (or a note when no issues were found).
    """
    profile = profile or {}
    if not isinstance(profile, dict):
        profile = {}
    ctx = ctx or {}  # normalized for the builder contract; not read below

    evaluated = list(_columns_with_quality(profile))
    if not evaluated:
        return None  # no columns to score -> chapter does not apply.

    blocks = [
        model.Heading(text="Cómo se calcula la calidad", level=2),
        model.Markdown(text=_CRITERIA_INTRO),
        _summary_block(profile, evaluated),
        model.Heading(text="Scores por columna", level=2),
    ]
    scores_table = _scores_block(evaluated)
    if scores_table is not None:
        blocks.append(scores_table)
    blocks.extend([
        model.Heading(text="Problemas detectados", level=2),
        _issues_block(evaluated),
    ])

    return model.Chapter(
        id=CHAPTER_ID,
        title=CHAPTER_TITLE,
        version=CHAPTER_VERSION,
        blocks=blocks,
    )
|
||||
@@ -0,0 +1,194 @@
|
||||
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that the chapter explains the quality criteria, shows
|
||||
per-column scores with the completeness/validity/consistency breakdown, lists the
|
||||
issues in Spanish (separate from the type flags), returns None when it does not
|
||||
apply, and that a wide profile with long names renders to PDF and PPTX without
|
||||
cutting any cell text (long content wraps, it is never truncated).
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.chapters.calidad import (
|
||||
build_calidad,
|
||||
CHAPTER_VERSION,
|
||||
)
|
||||
from datascience.automatic_eda import build_document, render_pdf, render_pptx
|
||||
|
||||
|
||||
def _profile() -> dict:
|
||||
"""A small profile with one column per quality problem (nulls, outliers,
|
||||
constant, high-cardinality id) plus one clean column."""
|
||||
return {
|
||||
"table": "demo",
|
||||
"quality_score": 72.5,
|
||||
"duplicate_pct": 0.04,
|
||||
"null_cell_pct": 0.11,
|
||||
"constant_cols": ["flag_const"],
|
||||
"all_null_cols": [],
|
||||
"columns": [
|
||||
{"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
|
||||
"numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
|
||||
"quality_score": 60},
|
||||
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
|
||||
"unique_pct": 0.98, "quality_score": 80},
|
||||
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
|
||||
"flags": ["constant"], "quality_score": 50},
|
||||
{"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}, "quality_score": 100},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _tables(chapter):
|
||||
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
|
||||
|
||||
|
||||
def _scores_table(chapter):
    """Return the first data table whose title contains ``"Scores"``, or ``None``."""
    matches = (t for t in _tables(chapter) if "Scores" in (t.title or ""))
    return next(matches, None)
|
||||
|
||||
|
||||
def _issues_table(chapter):
    """Return the first data table whose title contains ``"Problemas"``, or ``None``."""
    matches = (t for t in _tables(chapter) if "Problemas" in (t.title or ""))
    return next(matches, None)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_chapter_estructura_y_version():
    """The chapter has the expected id, version and block kinds."""
    chapter = build_calidad(_profile(), {})
    assert chapter is not None
    assert chapter.id == "calidad"
    assert chapter.version == CHAPTER_VERSION
    # intro heading + markdown criteria + summary kv + scores table + issues table
    present = [block.kind for block in chapter.blocks]
    assert "markdown" in present and "kv_table" in present and "data_table" in present
|
||||
|
||||
|
||||
def test_golden_intro_explica_criterios_y_pesos():
    """The intro markdown names the three criteria and their weights."""
    chapter = build_calidad(_profile(), {})
    markdown_blocks = [b for b in chapter.blocks if b.kind == "markdown"]
    intro = markdown_blocks[0].text
    expected = ("Completitud", "Validez", "Consistencia", "50%", "30%", "20%")
    for needle in expected:
        assert needle in intro, f"falta {needle!r} en la intro de criterios"
|
||||
|
||||
|
||||
def test_golden_scores_incluyen_desglose_por_criterio():
    """The scores table has the per-criterion header and one row per profile column."""
    ch = build_calidad(_profile(), {})
    scores = _scores_table(ch)
    assert scores is not None
    assert scores.header == ["Columna", "Calidad", "Completitud",
                             "Validez", "Consistencia"]
    # 4 columns scored, none dropped.
    assert len(scores.rows) == 4
    names = {row[0] for row in scores.rows}
    assert names == {"edad", "nombre", "flag_const", "limpia"}
|
||||
|
||||
|
||||
def test_golden_issues_en_espanol_separados_de_flags():
    """The issues table describes problems in Spanish and does not echo raw flags."""
    ch = build_calidad(_profile(), {})
    issues = _issues_table(ch)
    assert issues is not None
    flat = " | ".join(" ".join(r) for r in issues.rows)
    assert "nulos" in flat                          # completeness issue (ES)
    assert "outliers" in flat                       # validity issue (ES)
    assert "columna constante" in flat
    assert "posible id de alta cardinalidad" in flat
    # The raw type flag string must NOT leak as a "problem". A plain substring
    # test cannot detect that: "constant" is a prefix of the expected
    # "constante", so the previous `... not in flat or ... in flat` check was
    # always true. Match the flag only as a standalone word instead.
    assert re.search(r"\bconstant\b", flat) is None, (
        "raw 'constant' flag leaked into the issues table"
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edges
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_edge_none_vacio_sin_columnas_devuelve_none():
    """``build_calidad`` returns ``None`` for missing, empty, column-less or non-dict profiles."""
    cases = [
        (None, None),
        ({}, {}),
        ({"columns": []}, {}),
        ("not a dict", {}),
    ]
    for profile, ctx in cases:
        # The message identifies which degenerate input produced a chapter.
        assert build_calidad(profile, ctx) is None, (
            f"expected None for profile={profile!r}, ctx={ctx!r}"
        )
|
||||
|
||||
|
||||
def test_edge_perfil_limpio_sin_problemas_usa_nota():
    """A profile with no nulls and no outliers yields a note instead of an issues table."""
    prof = {
        "quality_score": 100,
        "columns": [
            {"name": "a", "inferred_type": "float", "null_pct": 0.0,
             "numeric": {"outlier_pct": 0.0}},
            {"name": "b", "inferred_type": "float", "null_pct": 0.0,
             "numeric": {"outlier_pct": 0.0}},
        ],
    }
    ch = build_calidad(prof, {})
    assert ch is not None
    assert _issues_table(ch) is None                # no issues table
    notes = [b for b in ch.blocks if b.kind == "note"]
    # Two asserts so "no note at all" and "wrong note text" fail differently.
    assert notes, "expected a note block when no issues are detected"
    assert "No se detectaron problemas" in notes[0].text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Anti-cut: a wide profile with long names renders without truncation
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _wide_profile(ncols: int = 22) -> dict:
|
||||
cols = [
|
||||
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
|
||||
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
|
||||
"inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
|
||||
]
|
||||
for k in range(ncols - 2):
|
||||
cols.append({
|
||||
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
|
||||
"inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
|
||||
"numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
|
||||
})
|
||||
return {"table": "ancha", "quality_score": 70.0, "columns": cols}
|
||||
|
||||
|
||||
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
    """Render the calidad chapter of a wide profile and check long names survive.

    The chapter is rendered to both PDF and PPTX inside a temporary directory;
    the extracted text of each output must contain the long column name in
    full once whitespace introduced by wrapping is removed.
    """
    prof = _wide_profile(22)
    full = build_document(prof, {"dataset_name": "ancha"})
    assert any(c.id == "calidad" for c in full)
    # Render ONLY the calidad chapter so the anti-cut assertions are scoped to
    # this chapter (other chapters, e.g. portada, legitimately contain '…').
    chapters = [c for c in full if c.id == "calidad"]
    long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
    # Everything that reads the rendered files stays inside the ``with`` block:
    # the directory and its contents are removed on exit.
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "q.pdf")
        pptx = os.path.join(d, "q.pptx")
        rp = render_pdf(chapters, pdf, {"title": "EDA"})
        render_pptx(chapters, pptx, {"title": "EDA"})
        assert os.path.exists(pdf), "PDF output was not written"
        assert os.path.exists(pptx), "PPTX output was not written"
        # The wide table forces pagination across several pages/slides.
        assert (rp or {}).get("n_pages", 0) >= 2

        # PDF: the long name survives whole once wraps (spaces/newlines) removed,
        # and there is no truncation marker.
        pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
        assert "…" not in pdf_txt, "ellipsis truncation marker found in the PDF"
        assert "..." not in pdf_txt, "'...' truncation marker found in the PDF"
        norm = re.sub(r"\s+", "", pdf_txt)
        assert long_name in norm, "el nombre largo se cortó en el PDF"

        # PPTX: long name present in some cell, untruncated.
        allt = []
        for slide in Presentation(pptx).slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    allt.append(shape.text_frame.text)
                if shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            allt.append(cell.text)
        joined = re.sub(r"\s+", "", "\n".join(allt))
        assert long_name in joined, "el nombre largo se cortó en el PPTX"
|
||||
@@ -28,12 +28,12 @@ from . import model
|
||||
CHAPTER_ORDER = [
|
||||
"portada", # cover
|
||||
"overview", # df.head + columns/types/nulls/examples + describe
|
||||
"analisis_llm", # LLM interpretation — sits next to overview (user request)
|
||||
"num_distr", # numeric distributions
|
||||
"cat_distr", # categorical distributions
|
||||
"calidad", # data quality
|
||||
"correlacion", # correlations / associations
|
||||
"modelos", # cheap models (PCA/KMeans/outliers)
|
||||
"analisis_llm", # LLM interpretation
|
||||
"timeseries", # time-series analysis
|
||||
"geospatial", # geospatial
|
||||
"agregacion", # aggregations / pivots
|
||||
|
||||
Reference in New Issue
Block a user