Compare commits

..

1 Commits

Author SHA1 Message Date
egutierrez 7045f37554 fix(eda): quita rótulos duplicados en capítulo ANÁLISIS LLM
El capítulo etiquetaba dos secciones por partida doble: un Heading de nivel 2
más el 'title' del propio DataTable, imprimiendo 'Diccionario de datos' y
'Datos personales (PII / RGPD)' dos veces seguidas en PDF y PPTX.

Se elimina el 'title' de ambos DataTable y se conserva el Heading único (el
patrón canónico OVERVIEW del contrato §8: el rótulo lo da el Heading, la tabla
solo repite su cabecera de columnas al paginar). El DataTable de PII mantiene su
'note' orientativa. La columna del diccionario ya lee 'Significado de negocio'.

CHAPTER_VERSION 1.0.0 -> 1.1.0. Test nuevo
test_sin_rotulos_duplicados_y_significado_de_negocio fija: tablas sin title,
cabecera exacta 'Significado de negocio', y cada rótulo una sola vez en el PDF.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 18:07:12 +02:00
12 changed files with 384 additions and 782 deletions
@@ -42,7 +42,11 @@ from __future__ import annotations
from .. import model
CHAPTER_VERSION = "1.0.0"
# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables
# no longer carry a ``title`` (the section Heading labels them once, per the
# OVERVIEW pattern in the contract). The data-dictionary column already reads
# "Significado de negocio".
CHAPTER_VERSION = "1.1.0"
CHAPTER_ID = "analisis_llm"
CHAPTER_TITLE = "Análisis LLM"
@@ -118,6 +122,11 @@ def _dictionary_block(llm: dict):
Columns: Columna / Descripción / Significado de negocio / Unidad. The
paginator splits this by rows repeating the header and wraps long cells, so a
long dictionary (many columns) never gets cut.
The block carries **no** ``title``: the section is labelled once by the
``Heading`` that ``build_analisis_llm`` appends right before it (the canonical
OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would
print "Diccionario de datos" twice in a row.
"""
entries = llm.get("dictionary")
if not isinstance(entries, (list, tuple)) or not entries:
@@ -137,7 +146,7 @@ def _dictionary_block(llm: dict):
])
if not rows:
return None
return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
return model.DataTable(header=header, rows=rows)
def _analyses_blocks(llm: dict) -> list:
@@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list:
def _pii_block(llm: dict):
"""DataTable for PII/GDPR findings, or None if absent/empty."""
"""DataTable for PII/GDPR findings, or None if absent/empty.
Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in
``build_analisis_llm`` labels the section once); it keeps its ``note`` with
the orientative-detection caveat, which the renderers print under the table.
"""
entries = llm.get("pii")
if not isinstance(entries, (list, tuple)) or not entries:
return None
@@ -176,7 +190,7 @@ def _pii_block(llm: dict):
if not rows:
return None
return model.DataTable(
header=header, rows=rows, title="Datos personales (PII / RGPD)",
header=header, rows=rows,
note="detección automática orientativa — revisar antes de tratar los datos")
@@ -24,7 +24,7 @@ from pptx import Presentation
from datascience.automatic_eda.chapters.analisis_llm import (
build_analisis_llm, CHAPTER_VERSION)
from datascience.automatic_eda.chapters_registry import build_document
from datascience.automatic_eda.model import Chapter, DataTable
from datascience.automatic_eda.model import Chapter, DataTable, Heading
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
@@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx():
assert "DESCTOKEN" in ptx
def test_sin_rotulos_duplicados_y_significado_de_negocio():
"""The dictionary / PII sections must be labelled ONCE.
Regression for the duplicated 'Diccionario de datos' and 'Datos personales
(PII / RGPD)' headings (each section used to print its label twice: a Heading
plus the DataTable's own title). The fix drops the DataTable title and keeps
a single Heading — the OVERVIEW pattern. The data-dictionary column header is
also pinned to the exact text 'Significado de negocio'.
"""
ch = build_analisis_llm(_profile(), {})
assert ch is not None
# Structure: section labels come from Headings; tables carry no title.
headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
assert headings.count("Diccionario de datos") == 1
assert headings.count("Datos personales (PII / RGPD)") == 1
for b in ch.blocks:
if isinstance(b, DataTable):
assert not b.title, f"DataTable should not duplicate the label: {b.title!r}"
# The data dictionary's third column reads exactly 'Significado de negocio'.
dicts = [b for b in ch.blocks if isinstance(b, DataTable) and "Descripción" in b.header]
assert dicts, "expected the data-dictionary DataTable"
assert dicts[0].header == ["Columna", "Descripción", "Significado de negocio", "Unidad"]
# The PII table keeps its orientative-detection note.
pii = [b for b in ch.blocks if isinstance(b, DataTable) and b.header == ["Columna", "Tipo", "Severidad"]]
assert pii and pii[0].note and "orientativa" in pii[0].note
# Render: each label appears exactly once across the whole document (the only
# 'Diccionario de datos' / 'Datos personales' producer is this chapter).
with tempfile.TemporaryDirectory() as d:
out_pdf = os.path.join(d, "eda.pdf")
render_automatic_eda_pdf(_profile(), out_pdf, {"title": "EDA — ventas"})
txt = _pdf_text(out_pdf)
assert txt.count("Diccionario de datos") == 1
assert txt.count("Datos personales") == 1
def test_orden_capitulo_junto_a_overview():
chapters = build_document(_profile(), {})
ids = [c.id for c in chapters]
@@ -1,26 +1,22 @@
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
chapter implements the quality model of report 2046:
chapter answers, in Spanish and as tables, the three things the user asked for:
1. **En qué se basa la calidad** — an intro paragraph explaining the two scored
dimensions and their weights (completitud 60%, validez 40%) plus the
table-level row uniqueness, BEFORE any number, and stating explicitly that
outliers are reported as observations and do **not** lower the score. The
criteria terms (calidad de datos, completitud, validez, unicidad de registro)
are hooked into the shared glossary as clickable jumps.
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
their weights (completeness, validity, consistency) before any number, plus a
table-level summary (global score and aggregates).
2. **Scores por columna** — a table with, per column, the total quality score and
its breakdown into completeness / validity (no consistency dimension).
3. **Problemas de calidad** — a table listing ONLY real quality defects
(nulls, empty cells, values not conforming to their type/semantics).
4. **Observaciones analíticas** — a SEPARATE table for outliers, constant
columns, high-cardinality ids and strong skew, with an explicit note that
these do not affect the score.
its breakdown into completeness / validity / consistency.
3. **Problemas en español** — a second table listing, per column, the readable
issues in Spanish (kept separate from the type ``flags``).
The breakdown, issues and observations are NOT recomputed here: they come from
the registry function ``column_quality_score`` (group ``eda``), which derives
``{score, completeness, validity, dimensions, applicable, issues,
observations}`` from the ColumnProfile. This chapter is render-only.
The breakdown and the issues are NOT recomputed here: they come from the registry
function ``column_quality_score`` (group ``eda``), which already derives
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
This chapter is render-only — it consumes that function and lays the result out
as model blocks; the renderers paginate tables (splitting by rows, repeating the
header) and wrap long cells so nothing is ever cut.
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
"""
@@ -37,47 +33,28 @@ try: # pragma: no cover - import wiring
except Exception: # noqa: BLE001 - never let an import error abort the document.
_column_quality_score = None
CHAPTER_VERSION = "2.0.0"
CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "calidad"
CHAPTER_TITLE = "Calidad"
# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
# collector and marked clickable on their first appearance (contract §11.1).
_TERMS = {
"calidad_datos": (
"Calidad de datos (score 0-100)",
"Mide hasta qué punto los datos están presentes y son utilizables tal "
"cual, no si son «buenos para el análisis». Se compone solo de "
"dimensiones medibles automáticamente desde el perfil de la tabla, sin "
"fuente externa de verdad: completitud (60%), validez (40%, cuando es "
"medible) y, a nivel de tabla, unicidad de registro. Los valores "
"atípicos NO bajan la calidad: se listan aparte como observaciones.",
),
"completitud": (
"Completitud",
"Proporción de valores realmente presentes en una columna (1 % de "
"nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
"nulos y vacíos bajan el score porque falta información que debería "
"estar. Pesa el 60% del score de columna.",
),
"validez": (
"Validez",
"Proporción de valores que encajan con su tipo o formato esperado: un "
"número que parsea, una fecha legible, un email con forma de email. Los "
"valores que no parsean a su tipo bajan el score. Si la columna es texto "
"libre sin formato esperado, la validez no se puede medir y el score se "
"basa solo en la completitud. Pesa el 40% del score cuando es medible.",
),
"unicidad_registro": (
"Unicidad de registro",
"A nivel de tabla, las filas duplicadas restan calidad al conjunto "
"(1 % de filas duplicadas). Es distinta de que una columna no-clave "
"repita valores, que no es un defecto de calidad.",
),
}
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
# consistency 0.2. Kept here only to render the human explanation; the actual
# numbers always come from the function so the two never drift in computation.
_CRITERIA_INTRO = (
"La calidad de cada columna es un score de 0 a 100 que combina tres "
"criterios, cada uno con un peso:\n\n"
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
"ni vacíos). Una columna con muchos nulos baja de score.\n"
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
"Los problemas detectados por columna se listan en español más abajo."
)
# Cap for the joined cell so a single row never grows taller than a page; the
# remainder is summarized as "(+N más)" instead of being silently dropped.
# Cap for the joined issues cell so a single row never grows taller than a page;
# the remainder is summarized as "(+N más)" instead of being silently dropped.
_ISSUES_MAXLEN = 160
@@ -105,19 +82,12 @@ def _fmt_unit_pct(value) -> str:
return str(value)
def _fmt_validity(value) -> str:
"""Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
if value is None:
return "n/a"
return _fmt_unit_pct(value)
def _quality_of(col: dict) -> dict:
"""Return the quality dict for a column.
"""Return ``{score, completeness, validity, consistency, issues}`` for a column.
Uses the registry ``column_quality_score`` when available; otherwise falls
back to the per-column ``quality_score`` already in the profile (number only,
empty breakdown/issues/observations). Never raises.
empty breakdown/issues). Never raises.
"""
if not isinstance(col, dict):
col = {}
@@ -128,25 +98,26 @@ def _quality_of(col: dict) -> dict:
return res
except Exception: # noqa: BLE001 - degrade instead of aborting.
pass
# Fallback: only the final score is available pre-computed in the profile.
return {
"score": col.get("quality_score"),
"completeness": None,
"validity": None,
"consistency": None,
"issues": [],
"observations": [],
}
def _join_cells(items) -> str:
"""Join Spanish strings into one cell, truncating overly long lists.
def _join_issues(issues) -> str:
"""Join Spanish issue strings into one cell, truncating overly long lists.
The renderer wraps cell text, but a column with many long entries could make
a single row taller than a whole page; cap the length and append ``(+N más)``
so the count of hidden entries is honest rather than silently lost.
The renderer wraps cell text, but a column with many long issues could make a
single row taller than a whole page; cap the length and append ``(+N más)``
so the count of hidden issues is honest rather than silently lost.
"""
if not isinstance(items, (list, tuple)) or not items:
if not isinstance(issues, (list, tuple)) or not issues:
return ""
parts = [model._safe_str(i).strip() for i in items]
parts = [model._safe_str(i).strip() for i in issues]
parts = [p for p in parts if p]
if not parts:
return ""
@@ -171,33 +142,6 @@ def _columns_with_quality(profile: dict):
yield c, _quality_of(c)
def _fmt_unit_pct_or_pct(value) -> str:
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
try:
num = float(value)
except (TypeError, ValueError):
return model._safe_str(value)
if num != num: # NaN
return ""
pct = num * 100 if num <= 1.0 else num
text = f"{pct:.1f}".rstrip("0").rstrip(".")
return f"{text}%"
def _row_uniqueness(profile: dict):
"""Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown."""
dup = profile.get("duplicate_pct")
if dup is None:
return None
try:
d = float(dup)
except (TypeError, ValueError):
return None
if d > 1.0: # tolerate a 0-100 scale
d = d / 100.0
return max(0.0, min(1.0, 1.0 - d))
def _summary_block(profile: dict, evaluated: list):
"""Table-level KVTable: global score and quality aggregates."""
rows = []
@@ -209,15 +153,14 @@ def _summary_block(profile: dict, evaluated: list):
if isinstance(q.get("completeness"), (int, float))]
vals = [q.get("validity") for _, q in evaluated
if isinstance(q.get("validity"), (int, float))]
cons = [q.get("consistency") for _, q in evaluated
if isinstance(q.get("consistency"), (int, float))]
if comps:
rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
if vals:
rows.append(("Validez media (donde aplica)",
_fmt_unit_pct(sum(vals) / len(vals))))
ru = _row_uniqueness(profile)
if ru is not None:
rows.append(("Unicidad de registro", _fmt_unit_pct(ru)))
rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals))))
if cons:
rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons))))
n_problem = sum(1 for _, q in evaluated if q.get("issues"))
rows.append(("Columnas con problemas", str(n_problem)))
@@ -239,9 +182,22 @@ def _summary_block(profile: dict, evaluated: list):
return model.KVTable(rows=rows, title="Resumen de calidad")
def _fmt_unit_pct_or_pct(value) -> str:
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
try:
num = float(value)
except (TypeError, ValueError):
return model._safe_str(value)
if num != num: # NaN
return ""
pct = num * 100 if num <= 1.0 else num
text = f"{pct:.1f}".rstrip("0").rstrip(".")
return f"{text}%"
def _scores_block(evaluated: list):
"""DataTable with per-column score and its completeness/validity breakdown."""
header = ["Columna", "Calidad", "Completitud", "Validez"]
"""DataTable with per-column score and its three-criteria breakdown."""
header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
rows = []
# Worst columns first so the reader sees the problems at the top.
ordered = sorted(
@@ -254,22 +210,22 @@ def _scores_block(evaluated: list):
col.get("name") or "(col)",
_fmt_score(q.get("score")),
_fmt_unit_pct(q.get("completeness")),
_fmt_validity(q.get("validity")),
_fmt_unit_pct(q.get("validity")),
_fmt_unit_pct(q.get("consistency")),
])
if not rows:
return None
return model.DataTable(header=header, rows=rows,
title="Scores de calidad por columna",
note="0 = peor, 100 = mejor; «n/a» = dimensión no "
"medible; ordenado de peor a mejor")
note="0 = peor, 100 = mejor; ordenado de peor a mejor")
def _issues_block(evaluated: list):
"""DataTable listing ONLY real quality defects per column, or a Note."""
header = ["Columna", "Problemas de calidad (español)"]
"""DataTable listing Spanish issues per column, or a Note when there are none."""
header = ["Columna", "Problemas detectados (español)"]
rows = []
for col, q in evaluated:
joined = _join_cells(q.get("issues"))
joined = _join_issues(q.get("issues"))
if joined:
rows.append([col.get("name") or "(col)", joined])
if not rows:
@@ -279,63 +235,6 @@ def _issues_block(evaluated: list):
title="Problemas de calidad por columna")
def _observations_block(evaluated: list):
"""DataTable listing analytical observations per column, or None.
Observations (outliers, constant columns, ids, strong skew) are NOT quality
defects: they do not affect the score. Returned as a separate table from the
issues so the report never presents a legitimate outlier as a problem.
"""
header = ["Columna", "Observaciones analíticas"]
rows = []
for col, q in evaluated:
joined = _join_cells(q.get("observations"))
if joined:
rows.append([col.get("name") or "(col)", joined])
if not rows:
return None
return model.DataTable(
header=header, rows=rows,
title="Observaciones analíticas por columna",
note="No son defectos de calidad y NO afectan al score; orientan el "
"análisis (atípicos, columnas constantes, identificadores).")
def _term(key: str, label: str, mark: bool) -> str:
"""Render a term as a clickable glossary span when marking is enabled."""
if mark:
return f"[[term:{key}]]**{label}**[[/term]]"
return f"**{label}**"
def _criteria_intro(mark: bool) -> str:
"""Intro paragraph explaining the two scored dimensions and the principle."""
calidad = _term("calidad_datos", "calidad de datos", mark)
completitud = _term("completitud", "Completitud (peso 60%)", mark)
validez = _term("validez", "Validez (peso 40%, cuando es medible)", mark)
unicidad = _term("unicidad_registro", "unicidad de registro", mark)
return (
f"La {calidad} de cada columna es un score de 0 a 100 que combina solo "
"dimensiones medibles desde el perfil de la tabla, sin fuente externa "
"de verdad:\n\n"
f"- {completitud}: proporción de valores presentes (1 % de nulos; en "
"texto, las celdas vacías cuentan como faltantes). Los nulos y vacíos "
"bajan el score.\n"
f"- {validez}: proporción de valores que encajan con su tipo o formato "
"(un número que parsea, una fecha legible, un email con forma de email). "
"Si una columna es texto libre sin formato esperado, la validez no se "
"mide y el score se basa solo en la completitud.\n\n"
f"Score de columna = 100 × (0,6·completitud + 0,4·validez), "
"renormalizado cuando la validez no aplica. A nivel de tabla se añade "
f"la {unicidad} (1 % de filas duplicadas).\n\n"
"**Los valores atípicos (outliers) NO bajan la calidad.** Un valor "
"extremo puede ser real y correcto; detectar atípicos es parte del "
"análisis de la distribución, no un juicio de corrección. Por eso, junto "
"con las columnas constantes y los identificadores, se listan aparte "
"como **observaciones analíticas** que no afectan al score."
)
def build_calidad(profile: dict, ctx: dict):
"""Build the data-quality Chapter, or None if the profile has no columns.
@@ -351,35 +250,17 @@ def build_calidad(profile: dict, ctx: dict):
if not evaluated:
return None # no columns to score -> chapter does not apply.
# Register the criteria terms in the shared glossary (if present) and mark
# their first appearance clickable. Contract §11.1.
glossary = ctx.get("glossary")
mark = False
if isinstance(glossary, model.GlossaryCollector):
for key, (label, definition) in _TERMS.items():
glossary.add(key, label, definition)
mark = True
blocks = [
model.Heading(text="Cómo se calcula la calidad", level=2),
model.Markdown(text=_criteria_intro(mark)),
model.Markdown(text=_CRITERIA_INTRO),
_summary_block(profile, evaluated),
model.Heading(text="Scores por columna", level=2),
]
scores = _scores_block(evaluated)
if scores is not None:
blocks.append(scores)
blocks.append(model.Heading(text="Problemas de calidad", level=2))
blocks.append(model.Heading(text="Problemas detectados", level=2))
blocks.append(_issues_block(evaluated))
observations = _observations_block(evaluated)
if observations is not None:
blocks.append(model.Heading(text="Observaciones analíticas", level=2))
blocks.append(model.Note(
"Las observaciones siguientes NO son defectos de calidad y no "
"afectan al score: son señales para orientar el análisis."))
blocks.append(observations)
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
version=CHAPTER_VERSION, blocks=blocks)
@@ -1,12 +1,11 @@
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut + glossary.
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
and deterministic. Verifies the report-2046 quality model: the chapter explains
the two scored dimensions (completitud 60% / validez 40%), shows per-column
scores without a consistency column, keeps quality DEFECTS (issues) separate
from analytical OBSERVATIONS (outliers, constant, ids), hooks the criteria terms
into the glossary, returns None when it does not apply, and renders a wide
profile to PDF and PPTX without cutting any cell text.
and deterministic. Verifies that the chapter explains the quality criteria, shows
per-column scores with the completeness/validity/consistency breakdown, lists the
issues in Spanish (separate from the type flags), returns None when it does not
apply, and that a wide profile with long names renders to PDF and PPTX without
cutting any cell text (long content wraps, it is never truncated).
"""
import os
@@ -21,30 +20,28 @@ from datascience.automatic_eda.chapters.calidad import (
CHAPTER_VERSION,
)
from datascience.automatic_eda import build_document, render_pdf, render_pptx
from datascience.automatic_eda import model
def _profile() -> dict:
"""A small profile with one column per quality problem (nulls, outliers,
constant, high-cardinality id) plus one clean column. ``outlier_pct`` is in
the 0-100 scale that describe_numeric actually emits."""
constant, high-cardinality id) plus one clean column."""
return {
"table": "demo",
"quality_score": 82.0,
"quality_score": 72.5,
"duplicate_pct": 0.04,
"null_cell_pct": 0.11,
"constant_cols": ["flag_const"],
"all_null_cols": [],
"columns": [
{"name": "edad", "inferred_type": "numeric", "null_pct": 0.2,
"n_rows": 100, "unique_pct": 0.5,
"numeric": {"outlier_pct": 15.0, "min": 0, "max": 99}},
{"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
"numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
"quality_score": 60},
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
"unique_pct": 0.98, "flags": ["possible_id"]},
"unique_pct": 0.98, "quality_score": 80},
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
"unique_pct": 0.01, "flags": ["constant"]},
{"name": "limpia", "inferred_type": "numeric", "null_pct": 0.0,
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
"flags": ["constant"], "quality_score": 50},
{"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
"numeric": {"outlier_pct": 0.0}, "quality_score": 100},
],
}
@@ -53,9 +50,16 @@ def _tables(chapter):
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
def _table_by_title(chapter, needle):
def _scores_table(chapter):
for t in _tables(chapter):
if needle in (t.title or ""):
if "Scores" in (t.title or ""):
return t
return None
def _issues_table(chapter):
for t in _tables(chapter):
if "Problemas" in (t.title or ""):
return t
return None
@@ -69,84 +73,41 @@ def test_golden_chapter_estructura_y_version():
assert ch.id == "calidad"
assert ch.version == CHAPTER_VERSION
kinds = [b.kind for b in ch.blocks]
# intro heading + markdown criteria + summary kv + scores table + issues table
assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds
def test_golden_intro_explica_dos_dimensiones_y_pesos():
def test_golden_intro_explica_criterios_y_pesos():
ch = build_calidad(_profile(), {})
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
for needle in ("Completitud", "Validez", "60%", "40%",
"unicidad de registro"):
for needle in ("Completitud", "Validez", "Consistencia",
"50%", "30%", "20%"):
assert needle in intro, f"falta {needle!r} en la intro de criterios"
# El principio: los outliers NO bajan la calidad.
assert "atípicos" in intro and "NO bajan" in intro
# Ya no se menciona la dimensión consistencia eliminada.
assert "20%" not in intro
def test_golden_scores_sin_columna_consistencia():
def test_golden_scores_incluyen_desglose_por_criterio():
ch = build_calidad(_profile(), {})
scores = _table_by_title(ch, "Scores")
scores = _scores_table(ch)
assert scores is not None
assert scores.header == ["Columna", "Calidad", "Completitud", "Validez"]
assert "Consistencia" not in scores.header
assert scores.header == ["Columna", "Calidad", "Completitud",
"Validez", "Consistencia"]
# 4 columns scored, none dropped.
assert len(scores.rows) == 4
names = {r[0] for r in scores.rows}
assert names == {"edad", "nombre", "flag_const", "limpia"}
def test_golden_outliers_en_observaciones_no_en_problemas():
def test_golden_issues_en_espanol_separados_de_flags():
ch = build_calidad(_profile(), {})
problemas = _table_by_title(ch, "Problemas de calidad")
observaciones = _table_by_title(ch, "Observaciones")
assert problemas is not None
assert observaciones is not None
problemas_txt = " | ".join(" ".join(r) for r in problemas.rows)
observaciones_txt = " | ".join(" ".join(r) for r in observaciones.rows)
# Los nulos SÍ son problema de calidad.
assert "nulos" in problemas_txt
# Los outliers NO aparecen como problema...
assert "atípic" not in problemas_txt and "outlier" not in problemas_txt
# ...sino como observación analítica.
assert "atípic" in observaciones_txt
# Constante e id: observaciones, no problemas.
assert "constante" in observaciones_txt
assert "identificador" in observaciones_txt
assert "constante" not in problemas_txt
def test_golden_score_columna_limpia_es_100():
"""Columna sin nulos, numérica nativa: score 100 aunque tenga (o no) outliers."""
ch = build_calidad(_profile(), {})
scores = _table_by_title(ch, "Scores")
by_name = {r[0]: r for r in scores.rows}
assert by_name["limpia"][1] == "100 / 100"
# edad: 20% nulos -> 100*(0.6*0.8 + 0.4*1.0) = 88; los outliers no bajan nada.
assert by_name["edad"][1] == "88 / 100"
# --------------------------------------------------------------------------- #
# Glosario (contrato §11.1)
# --------------------------------------------------------------------------- #
def test_glosario_registra_los_cuatro_terminos_y_marca_clicable():
glossary = model.GlossaryCollector()
ch = build_calidad(_profile(), {"glossary": glossary})
for key in ("calidad_datos", "completitud", "validez", "unicidad_registro"):
assert glossary.has(key), f"término {key!r} no registrado en el glosario"
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
# Con colector presente, la primera aparición se marca clicable.
assert "[[term:completitud]]" in intro
assert "[[term:validez]]" in intro
assert "[[term:calidad_datos]]" in intro
assert "[[term:unicidad_registro]]" in intro
def test_sin_glosario_no_marca_terminos():
ch = build_calidad(_profile(), {}) # ctx sin glossary
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
assert "[[term:" not in intro
issues = _issues_table(ch)
assert issues is not None
flat = " | ".join(" ".join(r) for r in issues.rows)
assert "nulos" in flat # completeness issue (ES)
assert "outliers" in flat # validity issue (ES)
assert "columna constante" in flat
assert "posible id de alta cardinalidad" in flat
# The raw type flag string must NOT leak as a "problem".
assert "constant" not in flat or "columna constante" in flat
# --------------------------------------------------------------------------- #
@@ -163,17 +124,17 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
prof = {
"quality_score": 100,
"columns": [
{"name": "a", "inferred_type": "numeric", "null_pct": 0.0,
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
{"name": "b", "inferred_type": "numeric", "null_pct": 0.0,
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
{"name": "a", "inferred_type": "float", "null_pct": 0.0,
"numeric": {"outlier_pct": 0.0}},
{"name": "b", "inferred_type": "float", "null_pct": 0.0,
"numeric": {"outlier_pct": 0.0}},
],
}
ch = build_calidad(prof, {})
assert ch is not None
assert _table_by_title(ch, "Problemas de calidad") is None # no issues table
assert _issues_table(ch) is None # no issues table
notes = [b for b in ch.blocks if b.kind == "note"]
assert any("No se detectaron problemas" in n.text for n in notes)
assert notes and "No se detectaron problemas" in notes[0].text
# --------------------------------------------------------------------------- #
@@ -182,42 +143,44 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
def _wide_profile(ncols: int = 22) -> dict:
cols = [
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99,
"flags": ["possible_id"]},
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.01,
"flags": ["constant"]},
"inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
]
for k in range(ncols - 2):
cols.append({
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
"inferred_type": "numeric", "null_pct": 0.1 + (k % 3) * 0.05,
"unique_pct": 0.5,
"numeric": {"outlier_pct": 8.0, "min": 0, "max": 1000},
"inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
"numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
})
return {"table": "ancha", "quality_score": 70.0, "duplicate_pct": 0.0,
"columns": cols}
return {"table": "ancha", "quality_score": 70.0, "columns": cols}
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
prof = _wide_profile(22)
full = build_document(prof, {"dataset_name": "ancha"})
assert any(c.id == "calidad" for c in full)
# Render ONLY the calidad chapter so the anti-cut assertions are scoped to
# this chapter (other chapters, e.g. portada, legitimately contain '…').
chapters = [c for c in full if c.id == "calidad"]
long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
with tempfile.TemporaryDirectory() as d:
pdf = os.path.join(d, "q.pdf")
pptx = os.path.join(d, "q.pptx")
rp = render_pdf(chapters, pdf, {"title": "EDA"})
render_pptx(chapters, pptx, {"title": "EDA"})
rx = render_pptx(chapters, pptx, {"title": "EDA"})
assert os.path.exists(pdf) and os.path.exists(pptx)
# The wide table forces pagination across several pages/slides.
assert (rp or {}).get("n_pages", 0) >= 2
# PDF: the long name survives whole once wraps (spaces/newlines) removed,
# and there is no truncation marker.
pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
assert "" not in pdf_txt and "..." not in pdf_txt
norm = re.sub(r"\s+", "", pdf_txt)
assert long_name in norm, "el nombre largo se cortó en el PDF"
# PPTX: long name present in some cell, untruncated.
allt = []
for s in Presentation(pptx).slides:
for sh in s.shapes:
@@ -4,10 +4,10 @@ name: column_quality_score
kind: function
lang: py
domain: datascience
version: "2.0.0"
version: "1.0.0"
purity: pure
signature: "def column_quality_score(col: dict) -> dict"
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda. Combina completeness (0.6) y validity (0.4) con renormalizacion por aplicabilidad; los outliers, columnas constantes e ids NO bajan el score (van a observations). Devuelve desglose por dimension, issues (defectos) y observations (señales analiticas). Funcion pura, no muta el input."
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda, con desglose completeness/validity/consistency y lista de issues legibles. Funcion pura, no muta el input."
tags: [eda, data-quality, profiling, scoring, datascience]
uses_functions: []
uses_types: []
@@ -17,26 +17,20 @@ error_type: ""
imports: []
example: |
from datascience import column_quality_score
col = {"name": "precio", "inferred_type": "numeric", "null_pct": 0.2,
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 8.0}}
col = {"name": "precio", "inferred_type": "float", "null_pct": 0.2,
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 0.08}}
column_quality_score(col)
# {"score": 88.0, "completeness": 0.8, "validity": 1.0,
# "applicable": ["completeness", "validity"], "issues": ["20% nulos"],
# "observations": ["8% de valores atípicos (z-score>3): ..."]}
# {"score": 86.8, "completeness": 0.8, "validity": 0.92,
# "consistency": 1.0, "issues": ["20% nulos", "8% outliers"]}
tested: true
tests:
- "test_clean_column_high_score"
- "test_weights_60_40_native_type"
- "test_outliers_do_not_penalize_score"
- "test_nulls_lower_score_more_than_outliers"
- "test_validity_from_parse_rate_lowers_score"
- "test_validity_from_match_rate"
- "test_free_text_renormalizes_to_completeness_only"
- "test_all_null_column_scores_zero"
- "test_constant_column_scores_full_and_is_observation"
- "test_high_cardinality_id_scores_full_and_is_observation"
- "test_mostly_null_no_double_counts_validity"
- "test_half_null_lowers_completeness_and_score"
- "test_constant_column_flags_issue"
- "test_empty_dict_does_not_crash"
- "test_outliers_penalize_validity"
- "test_mostly_null_flag_halves_validity"
- "test_high_cardinality_text_flagged_as_id"
- "test_none_values_treated_defensively"
- "test_does_not_mutate_input"
test_file_path: "python/functions/datascience/column_quality_score_test.py"
@@ -44,22 +38,16 @@ file_path: "python/functions/datascience/column_quality_score.py"
params:
- name: col
desc: >
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb /
profile_table). Se leen sus claves de forma defensiva con .get(...) y se
toleran valores None. Claves usadas: null_pct (0-1), n_rows, empty_count
(texto), inferred_type, semantic_type, validity_rate (0-1, lo expone
profile_table al promocionar texto a numero/fecha), match_rate (0-1),
unique_pct (0-1), flags (list[str], reconoce
"constant"/"possible_id"/"high_cardinality") y numeric ({outlier_pct: 0-100,
skew, ...}|None).
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb).
Se leen sus claves de forma defensiva con .get(...) y se toleran valores
None. Claves usadas: null_pct (0-1), inferred_type, semantic_type,
unique_pct (0-1), flags (list[str], reconoce "constant"/"mostly_null"),
numeric ({outlier_pct: 0-1, ...}|None) y match_rate (opcional, 0-1).
output: >
dict con score (float 0-100, 1 decimal), completeness (0-1), validity (0-1 o
None si no aplicable), dimensions ({completeness, validity}), applicable
(list[str] de dimensiones que entraron en el score), issues (list[str] SOLO de
defectos de calidad: nulos, vacios, valores no conformes) y observations
(list[str] de señales analiticas que NO bajan el score: outliers, columna
constante, posible id, asimetria). score = round(100 * (0.6*completeness +
0.4*validity) / pesos_aplicables, 1), renormalizado cuando validity no aplica.
dict con score (float 0-100, redondeado a 1 decimal), completeness (0-1),
validity (0-1), consistency (0-1) e issues (list[str] de descripciones
legibles de los problemas detectados). score = round(100 * (0.5*completeness
+ 0.3*validity + 0.2*consistency), 1).
---
## Ejemplo
@@ -71,71 +59,51 @@ from datascience import column_quality_score
col = {
"name": "precio",
"physical_type": "DOUBLE",
"inferred_type": "numeric",
"inferred_type": "float",
"semantic_type": "",
"count": 800,
"n_rows": 1000,
"null_count": 200,
"null_pct": 0.20,
"distinct_count": 400,
"unique_pct": 0.40,
"flags": [],
"numeric": {"outlier_pct": 8.0, "skew": 0.3},
"numeric": {"outlier_pct": 0.08},
"categorical": None,
"datetime": None,
}
column_quality_score(col)
# {
# "score": 88.0, # 100 * (0.6*0.8 + 0.4*1.0)
# "completeness": 0.8, # 1 - 0.20
# "validity": 1.0, # numerica nativa: el tipo es conforme
# "dimensions": {"completeness": 0.8, "validity": 1.0},
# "applicable": ["completeness", "validity"],
# "issues": ["20% nulos"], # SOLO defectos de calidad
# "observations": ["8% de valores atípicos (z-score>3): ..."], # NO bajan score
# "score": 86.8,
# "completeness": 0.8, # 1 - 0.20
# "validity": 0.92, # 1 - min(0.08, 0.3)
# "consistency": 1.0,
# "issues": ["20% nulos", "8% outliers"],
# }
```
## Cuando usarla
Cuando hayas perfilado una tabla con el grupo `eda` (p.ej.
`summarize_table_duckdb` / `profile_table`) y necesites un numero 0-100 por
columna para ordenar/priorizar limpieza de datos, pintar semaforos de calidad,
o decidir que columnas descartar antes de modelar. Separa los **defectos de
calidad reales** (`issues`: nulos, vacios, valores que no parsean a su tipo) de
las **observaciones analiticas** (`observations`: outliers, columnas constantes,
ids), que se reportan pero no penalizan. Es la capa de scoring sobre el
ColumnProfile crudo: lee el perfil, no toca los datos.
`summarize_table_duckdb`) y necesites un numero 0-100 por columna para
ordenar/priorizar limpieza de datos, pintar semaforos de calidad en un
dashboard, o decidir que columnas descartar antes de modelar. Es la capa de
scoring sobre el ColumnProfile crudo: lee el perfil, no toca los datos.
## Gotchas
## Notas
Funcion pura, sin I/O, no muta `col`. Aun asi conviene saber:
Funcion pura, sin I/O ni dependencias externas, no muta `col`. Lee todas las
claves con `.get(...)` y tolera que vengan en `None` (un ColumnProfile recien
salido de `summarize_table_duckdb` trae muchas claves a `None`), por lo que
nunca falla por claves ausentes — un `{}` produce un resultado bien definido.
- **Los outliers NO bajan el score.** Un valor extremo puede ser real y correcto
(un cliente que compra mucho); detectar atipicos es analisis de la
distribucion, no un juicio de correccion. Salen en `observations`, no en
`issues`. Mismo trato para columnas constantes e identificadores de alta
cardinalidad: son observaciones, no defectos.
- **`validity` puede ser `None`** (no aplicable): texto libre sin `semantic_type`
ni `validity_rate`, o columna 100% nula. En ese caso el score se renormaliza a
solo `completeness` (la columna no se premia ni castiga por algo no medible).
- **`outlier_pct` se interpreta en escala 0-100** (la que emite
`describe_numeric`, z-score>3). Pasar una fraccion 0-1 produce un texto de
observacion con el % equivocado, pero NUNCA afecta al score.
- **`validity_rate` lo puebla `profile_table`** al promocionar una columna de
texto a numero/fecha (fraccion que parsea). Si no esta presente y el tipo es
nativo numerico/fecha/bool, `validity = 1.0`.
- Sin doble conteo: la falta de datos cuenta solo en `completeness` (el antiguo
castigo de `mostly_null` sobre `validity` se elimino).
Pesos del score: completeness 0.5, validity 0.3, consistency 0.2.
## Capability growth log
- v2.0.0 (2026-06-30) — nueva formula de calidad (report 2046): pesos 60/40
(completeness/validity) con renormalizacion por aplicabilidad; se elimina la
dimension `consistency`-como-informatividad y el doble castigo de
`mostly_null`; los outliers/constantes/ids salen del score a `observations`;
validity mide conformidad real (parse rate / match rate / tipo nativo). Salida
ampliada con `dimensions`, `applicable` y `observations`.
- v1.0.0 — version inicial: pesos 50/30/20 (completeness/validity/consistency),
los outliers penalizaban validity (con bug de escala) y consistency penalizaba
informatividad.
- **completeness** = `1 - null_pct` (None -> 0 nulls -> 1.0).
- **validity**: parte de 1.0 y penaliza `min(outlier_pct, 0.3)` en columnas
numericas, `0.5 * (1 - match_rate)` si hay `semantic_type` declarado con
`match_rate` bajo disponible, y multiplica por 0.5 si el flag `mostly_null`
esta presente.
- **consistency**: 1.0 salvo flag `constant` (-> 0.3, columna poco informativa)
o texto con `unique_pct > 0.9` (-> 0.6, posible id de alta cardinalidad).
@@ -1,78 +1,34 @@
"""Score de calidad de datos (0-100) para un ColumnProfile del grupo eda.
Funcion pura: dado el perfil de una columna producido por el grupo de
capacidad `eda` (p.ej. summarize_table_duckdb / profile_table), calcula un
score agregado de calidad junto a su desglose por dimension y dos listas
legibles separadas: `issues` (defectos de calidad reales que SI bajan el
score) y `observations` (señales analiticas que NO bajan el score). No
realiza I/O ni muta el input.
Modelo (DAMA-DMBOK / ISO 8000), ver report 2046:
- Solo entran en el score las dimensiones medibles automaticamente desde el
perfil, sin fuente externa de verdad: completeness y validity por columna.
- Renormalizacion por aplicabilidad: si una dimension no es medible en la
columna (texto libre sin semantica -> validity no aplica; columna 100% nula
-> validity no medible), se excluye y los pesos se renormalizan sobre las
aplicables. Una columna ni se premia ni se castiga por algo no medible.
- Sin doble conteo: la falta de datos cuenta solo en completeness (se elimino
el antiguo castigo extra de `mostly_null` sobre validity).
- Los OUTLIERS NO bajan la calidad. Un valor extremo puede ser real y
correcto; detectar atipicos es analisis de la distribucion, no un juicio de
coreccion. Outliers, columnas constantes e identificadores de alta
cardinalidad pasan a `observations`, nunca a `issues`.
capacidad `eda` (p.ej. summarize_table_duckdb), calcula un score agregado
de calidad junto a su desglose en completeness / validity / consistency y
una lista de issues legibles. No realiza I/O ni muta el input.
"""
# Pesos base de las dimensiones de columna (se renormalizan por aplicabilidad).
_W_COMPLETENESS = 0.6
_W_VALIDITY = 0.4
# Tipos inferidos cuyo almacen garantiza la conformidad de tipo (validity=1.0)
# cuando NO vienen de una promocion de texto (en cuyo caso manda validity_rate).
_NATIVE_TYPED = ("numeric", "integer", "float", "datetime", "date", "boolean", "bool")
def column_quality_score(col: dict) -> dict:
"""Calcula un score de calidad de datos 0-100 para un ColumnProfile.
El score combina solo dimensiones de calidad medibles desde el perfil, con
renormalizacion por aplicabilidad:
- completeness (peso base 0.6, siempre aplica): proporcion de valores
presentes = 1 - null_pct. En texto, las celdas vacias (`empty_count`)
tambien cuentan como faltantes.
- validity (peso base 0.4, cuando hay un criterio de validacion real):
fraccion de valores no nulos conformes a su tipo/semantica. Tipo nativo
numerico/fecha/bool = 1.0; texto promovido a numero/fecha = parse rate
(`validity_rate`); texto con `semantic_type` regexable = `match_rate`;
texto libre o columna 100% nula = NO aplicable (renormaliza a solo
completeness).
Los outliers, columnas constantes, identificadores y asimetria fuerte NO
bajan el score: se devuelven en `observations`.
El score pondera tres dimensiones:
- completeness (0.5): proporcion de valores no nulos.
- validity (0.3): ausencia de outliers / heuristicas de validez.
- consistency (0.2): la columna aporta informacion (no constante, no ruido).
Args:
col: ColumnProfile dict del grupo eda. Se leen las claves de forma
defensiva con .get(...) y se tolera que muchas vengan en None.
Claves relevantes: null_pct (0-1), n_rows, empty_count,
inferred_type, semantic_type, validity_rate (0-1, lo expone
profile_table al promocionar texto a numero/fecha), match_rate
(0-1), unique_pct (0-1), flags (list[str], reconoce
"constant"/"possible_id"/"high_cardinality"), numeric
({outlier_pct: 0-100, skew, ...}|None).
Claves relevantes: null_pct, inferred_type, semantic_type,
unique_pct, flags (list[str]), numeric ({outlier_pct, ...}|None),
match_rate (opcional).
Returns:
dict con:
score (float 0-100, redondeado a 1 decimal),
completeness (float 0-1),
validity (float 0-1 | None si no aplicable),
dimensions ({completeness, validity}),
applicable (list[str] de dimensiones que entraron en el score),
issues (list[str]) SOLO defectos de calidad (nulos, vacios,
valores no conformes a su tipo/semantica),
observations (list[str]) señales analiticas que NO bajan el score
(outliers, columna constante, posible id, asimetria).
score (float, 0-100, redondeado a 1 decimal),
completeness (float, 0-1),
validity (float, 0-1),
consistency (float, 0-1),
issues (list[str]) descripciones legibles de los problemas.
"""
if not isinstance(col, dict):
col = {}
@@ -83,153 +39,103 @@ def column_quality_score(col: dict) -> dict:
flags = set(flags)
issues: list[str] = []
observations: list[str] = []
inferred_type = col.get("inferred_type") or ""
semantic_type = col.get("semantic_type") or ""
# --- completeness -------------------------------------------------
# Falta de datos = nulos + (en texto) celdas vacias. Es el unico sitio
# donde la falta de datos cuenta: nunca se duplica en validity.
null_pct = _clamp(_num(col.get("null_pct"), 0.0), 0.0, 1.0)
null_pct = col.get("null_pct")
if null_pct is None:
null_pct = 0.0
try:
null_pct = float(null_pct)
except (TypeError, ValueError):
null_pct = 0.0
null_pct = _clamp(null_pct, 0.0, 1.0)
completeness = 1.0 - null_pct
if null_pct > 0:
issues.append(f"{_pct(null_pct)} nulos")
issues.append(f"{round(null_pct * 100)}% nulos")
empty_frac = 0.0
n_rows = col.get("n_rows")
empty_count = col.get("empty_count")
if (
isinstance(n_rows, (int, float)) and not isinstance(n_rows, bool) and n_rows > 0
and isinstance(empty_count, (int, float)) and not isinstance(empty_count, bool)
and empty_count > 0
):
empty_frac = _clamp(float(empty_count) / float(n_rows), 0.0, 1.0)
completeness = _clamp(completeness - empty_frac, 0.0, 1.0)
issues.append(f"{_pct(empty_frac)} vacíos")
# --- validity -----------------------------------------------------
validity = 1.0
inferred_type = col.get("inferred_type") or ""
# --- validity (con renormalizacion por aplicabilidad) -------------
# None = no medible -> se excluye del score (no penaliza ni premia).
validity = None
if completeness <= 0.0:
# Columna 100% faltante: no hay valores no nulos sobre los que medir
# conformidad. validity no aplica -> el score sale solo de completeness
# (= 0). Es el peor defecto de calidad posible.
validity = None
else:
validity_rate = col.get("validity_rate")
match_rate = col.get("match_rate")
if validity_rate is not None:
# Texto promovido a numero/fecha: parse rate real de la muestra.
v = _num(validity_rate, None)
if v is not None:
validity = _clamp(v, 0.0, 1.0)
if validity < 1.0:
kind = (
"número" if inferred_type == "numeric"
else "fecha" if inferred_type == "datetime"
else inferred_type or "su tipo"
)
issues.append(
f"{_pct(1.0 - validity)} no parsea al tipo {kind}"
)
elif inferred_type in _NATIVE_TYPED:
# Tipo nativo garantizado por el almacen: no hay valores que no
# parseen. validity = 1.0 (no se confunde con tener outliers).
validity = 1.0
elif semantic_type and match_rate is not None:
v = _num(match_rate, None)
if v is not None:
validity = _clamp(v, 0.0, 1.0)
if validity < 1.0:
issues.append(
f"{_pct(1.0 - validity)} no casa con el "
f"formato «{semantic_type}»"
)
else:
# Texto libre / categorica sin semantica: no hay criterio honesto
# de validez. No aplica.
validity = None
# --- observations (NO bajan el score) -----------------------------
numeric = col.get("numeric")
is_numeric = inferred_type in ("integer", "float", "numeric") or isinstance(numeric, dict)
if isinstance(numeric, dict):
# outlier_pct viene en escala 0-100 desde describe_numeric (z-score>3).
outlier_pct = _num(numeric.get("outlier_pct"), None)
if outlier_pct is not None and outlier_pct >= 0.05:
observations.append(
f"{_pct(outlier_pct / 100.0)} de valores atípicos (z-score>3): "
"revisar si son errores u observaciones legítimas"
)
skew = _num(numeric.get("skew"), None)
if skew is not None and abs(skew) >= 1.0:
observations.append(
f"asimetría fuerte (skew={round(skew, 2)}): considerar "
"re-expresión antes de modelar"
)
outlier_pct = numeric.get("outlier_pct")
if outlier_pct is not None:
try:
outlier_pct = float(outlier_pct)
except (TypeError, ValueError):
outlier_pct = 0.0
outlier_pct = _clamp(outlier_pct, 0.0, 1.0)
if outlier_pct > 0:
penalty = min(outlier_pct, 0.3)
validity -= penalty
issues.append(f"{round(outlier_pct * 100)}% outliers")
# semantic_type declarado pero con baja tasa de match (si la conocemos).
semantic_type = col.get("semantic_type") or ""
match_rate = col.get("match_rate")
if semantic_type and match_rate is not None:
try:
match_rate = float(match_rate)
except (TypeError, ValueError):
match_rate = None
if match_rate is not None:
match_rate = _clamp(match_rate, 0.0, 1.0)
if match_rate < 1.0:
shortfall = 1.0 - match_rate
validity -= 0.5 * shortfall
issues.append(
f"semantic_type '{semantic_type}' con baja coincidencia "
f"({round(match_rate * 100)}%)"
)
if "mostly_null" in flags:
validity *= 0.5
issues.append("mayoritariamente nula")
validity = _clamp(validity, 0.0, 1.0)
# --- consistency --------------------------------------------------
consistency = 1.0
if "constant" in flags:
observations.append(
"columna constante: aporta poca información para el análisis"
)
unique_pct = _num(col.get("unique_pct"), None)
is_id = (
"possible_id" in flags
or "high_cardinality" in flags
or (
inferred_type in ("text", "categorical")
consistency = 0.3
issues.append("columna constante")
else:
unique_pct = col.get("unique_pct")
if unique_pct is not None:
try:
unique_pct = float(unique_pct)
except (TypeError, ValueError):
unique_pct = None
if (
inferred_type == "text"
and unique_pct is not None
and _clamp(unique_pct, 0.0, 1.0) > 0.9
)
)
if is_id:
observations.append(
"valores casi únicos: posible identificador (no es un defecto de calidad)"
)
):
consistency = 0.6
issues.append("posible id de alta cardinalidad")
# --- score agregado con renormalizacion ---------------------------
applicable = ["completeness"]
num = _W_COMPLETENESS * completeness
den = _W_COMPLETENESS
if validity is not None:
applicable.append("validity")
num += _W_VALIDITY * validity
den += _W_VALIDITY
score = round(100.0 * num / den, 1) if den > 0 else 0.0
consistency = _clamp(consistency, 0.0, 1.0)
# --- score agregado ----------------------------------------------
score = round(
100.0 * (0.5 * completeness + 0.3 * validity + 0.2 * consistency),
1,
)
# Silencia warnings sobre la variable de tipo no usada.
_ = is_numeric
return {
"score": score,
"completeness": completeness,
"validity": validity,
"dimensions": {"completeness": completeness, "validity": validity},
"applicable": applicable,
"consistency": consistency,
"issues": issues,
"observations": observations,
}
def _pct(frac: float) -> str:
"""Formatea una fraccion 0-1 como porcentaje honesto: «N%» si >=1%, «0.N%»
por debajo (para no mostrar «0%» cuando hay un defecto real pequeño)."""
p = frac * 100.0
if p >= 1.0:
return f"{round(p)}%"
return f"{p:.1f}%"
def _num(x, default):
"""Convierte x a float; devuelve `default` si es None o no parseable."""
if x is None:
return default
if isinstance(x, bool):
return default
try:
return float(x)
except (TypeError, ValueError):
return default
def _clamp(x: float, lo: float, hi: float) -> float:
"""Recorta x al rango [lo, hi]."""
if x < lo:
@@ -1,12 +1,4 @@
"""Tests para column_quality_score (nueva fórmula, report 2046).
Verifica las invariantes de la fórmula de calidad:
- completeness (0.6) + validity (0.4) con renormalización por aplicabilidad.
- Los OUTLIERS no bajan el score (van a observations, no a issues).
- Columnas constantes e ids no bajan el score (observations).
- Sin doble conteo de la falta de datos.
- all-null -> score 0; función pura (no muta el input).
"""
"""Tests para column_quality_score."""
import os
import sys
@@ -17,11 +9,11 @@ from column_quality_score import column_quality_score
def _clean_numeric_col() -> dict:
"""ColumnProfile de una columna numérica nativa sana, sin problemas."""
"""ColumnProfile de una columna numerica sana, sin problemas."""
return {
"name": "edad",
"physical_type": "INTEGER",
"inferred_type": "numeric",
"inferred_type": "integer",
"semantic_type": "",
"count": 1000,
"n_rows": 1000,
@@ -36,163 +28,85 @@ def _clean_numeric_col() -> dict:
}
# --------------------------------------------------------------------------- #
# Golden
# --------------------------------------------------------------------------- #
def test_clean_column_high_score():
out = column_quality_score(_clean_numeric_col())
assert out["score"] == 100.0
assert out["score"] > 90
assert out["completeness"] == 1.0
assert out["validity"] == 1.0
assert out["applicable"] == ["completeness", "validity"]
assert out["consistency"] == 1.0
assert out["issues"] == []
assert out["observations"] == []
def test_weights_60_40_native_type():
"""30% nulos en numérica nativa: score = 100*(0.6*0.7 + 0.4*1.0) = 82."""
def test_half_null_lowers_completeness_and_score():
col = _clean_numeric_col()
col["null_pct"] = 0.30
col["null_count"] = 300
col["null_count"] = 500
col["null_pct"] = 0.5
clean_score = column_quality_score(_clean_numeric_col())["score"]
out = column_quality_score(col)
assert out["completeness"] == 0.7
assert out["validity"] == 1.0
assert out["score"] == 82.0
assert any("nulos" in i for i in out["issues"])
assert out["completeness"] == 0.5
assert out["score"] < clean_score
assert any("nulos" in issue for issue in out["issues"])
# --------------------------------------------------------------------------- #
# Outliers FUERA del score
# --------------------------------------------------------------------------- #
def test_outliers_do_not_penalize_score():
"""Columna con outliers pero sin nulos -> score máximo; outliers en observations."""
col = _clean_numeric_col()
col["numeric"] = {"outlier_pct": 18.0, "skew": 0.2} # 18% atípicos (escala 0-100)
out = column_quality_score(col)
assert out["score"] == 100.0 # los outliers NO bajan la calidad
assert out["validity"] == 1.0
# No aparecen como problema de calidad...
assert not any("atípic" in i or "outlier" in i for i in out["issues"])
# ...sino como observación analítica.
assert any("atípic" in o for o in out["observations"])
def test_nulls_lower_score_more_than_outliers():
"""Vacíos sí penalizan; outliers no: comparar las dos columnas."""
con_nulos = _clean_numeric_col()
con_nulos["null_pct"] = 0.30
con_outliers = _clean_numeric_col()
con_outliers["numeric"] = {"outlier_pct": 30.0}
assert column_quality_score(con_nulos)["score"] < \
column_quality_score(con_outliers)["score"]
# --------------------------------------------------------------------------- #
# Validity: aplicabilidad y renormalización
# --------------------------------------------------------------------------- #
def test_validity_from_parse_rate_lowers_score():
"""Numérica como texto con 20% basura: validity=0.8 -> score=92."""
col = {
"name": "precio_txt", "inferred_type": "numeric", "semantic_type": "decimal",
"null_pct": 0.0, "validity_rate": 0.80, "flags": [], "numeric": None,
}
out = column_quality_score(col)
assert out["validity"] == 0.8
assert out["score"] == 92.0 # 100*(0.6 + 0.4*0.8)
assert any("no parsea" in i for i in out["issues"])
def test_validity_from_match_rate():
"""Texto con semantic_type y 5% no conforme: validity=0.95."""
col = {
"name": "email", "inferred_type": "text", "semantic_type": "email",
"null_pct": 0.0, "match_rate": 0.95, "unique_pct": 0.5, "flags": [],
}
out = column_quality_score(col)
assert out["validity"] == 0.95
assert out["score"] == 98.0 # 100*(0.6 + 0.4*0.95)
assert any("no casa" in i for i in out["issues"])
def test_free_text_renormalizes_to_completeness_only():
"""Texto libre sin semántica: validity no aplica -> score = 100*completeness."""
col = {
"name": "comentario", "inferred_type": "text", "semantic_type": "",
"null_pct": 0.30, "unique_pct": 0.5, "flags": [], "numeric": None,
}
out = column_quality_score(col)
assert out["validity"] is None
assert out["applicable"] == ["completeness"]
assert out["completeness"] == 0.7
assert out["score"] == 70.0 # renormalizado a solo completeness
# --------------------------------------------------------------------------- #
# Casos límite (report §4.6)
# --------------------------------------------------------------------------- #
def test_all_null_column_scores_zero():
col = _clean_numeric_col()
col["null_pct"] = 1.0
col["null_count"] = 1000
out = column_quality_score(col)
assert out["completeness"] == 0.0
assert out["validity"] is None # no medible sin valores no nulos
assert out["score"] == 0.0
def test_constant_column_scores_full_and_is_observation():
"""Columna constante: dato válido y completo -> score 100; baja info = observación."""
def test_constant_column_flags_issue():
col = _clean_numeric_col()
col["flags"] = ["constant"]
col["distinct_count"] = 1
col["unique_pct"] = 0.001
out = column_quality_score(col)
assert out["score"] == 100.0 # NO se castiga la baja informatividad
assert not any("constante" in i for i in out["issues"])
assert any("constante" in o for o in out["observations"])
assert out["consistency"] == 0.3
assert any("constante" in issue for issue in out["issues"])
def test_high_cardinality_id_scores_full_and_is_observation():
"""Id de alta cardinalidad: unicidad perfecta -> score 100; posible id = observación."""
col = {
"name": "uuid", "inferred_type": "text", "semantic_type": "",
"null_pct": 0.0, "unique_pct": 0.99, "flags": ["possible_id"],
"numeric": None,
}
out = column_quality_score(col)
assert out["score"] == 100.0
assert not any("identificador" in i for i in out["issues"])
assert any("identificador" in o for o in out["observations"])
def test_mostly_null_no_double_counts_validity():
"""85% nulos: solo completeness penaliza; validity nativa sigue 1.0 (sin doble castigo)."""
col = _clean_numeric_col()
col["null_pct"] = 0.85
col["flags"] = ["mostly_null"]
out = column_quality_score(col)
assert out["validity"] == 1.0 # ya no se multiplica por 0.5
# score = 100*(0.6*0.15 + 0.4*1.0) = 49
assert out["score"] == 49.0
assert not any("mayoritariamente" in o for o in out["observations"])
# --------------------------------------------------------------------------- #
# Robustez
# --------------------------------------------------------------------------- #
def test_empty_dict_does_not_crash():
out = column_quality_score({})
assert isinstance(out["score"], float)
assert out["completeness"] == 1.0
assert 0.0 <= out["score"] <= 100.0
assert isinstance(out["issues"], list)
assert isinstance(out["observations"], list)
def test_outliers_penalize_validity():
col = _clean_numeric_col()
col["numeric"] = {"outlier_pct": 0.2}
out = column_quality_score(col)
assert out["validity"] < 1.0
assert any("outliers" in issue for issue in out["issues"])
def test_mostly_null_flag_halves_validity():
col = _clean_numeric_col()
col["null_pct"] = 0.85
col["flags"] = ["mostly_null"]
out = column_quality_score(col)
assert out["validity"] == 0.5
assert any("mayoritariamente nula" in issue for issue in out["issues"])
def test_high_cardinality_text_flagged_as_id():
col = {
"name": "uuid",
"inferred_type": "text",
"semantic_type": "",
"null_pct": 0.0,
"unique_pct": 0.99,
"flags": [],
"numeric": None,
}
out = column_quality_score(col)
assert out["consistency"] < 1.0
assert any("alta cardinalidad" in issue for issue in out["issues"])
def test_none_values_treated_defensively():
col = {
"name": "x", "inferred_type": None, "semantic_type": None,
"null_pct": None, "unique_pct": None, "flags": None, "numeric": None,
"name": "x",
"inferred_type": None,
"semantic_type": None,
"null_pct": None,
"unique_pct": None,
"flags": None,
"numeric": None,
}
out = column_quality_score(col)
assert out["completeness"] == 1.0
@@ -3,7 +3,7 @@ name: summarize_table_duckdb
kind: function
lang: py
domain: datascience
version: "1.1.0"
version: "1.0.0"
purity: impure
signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict"
description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)."
@@ -64,7 +64,6 @@ else:
- **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima).
- **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75.
- **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`).
- **`duplicate_rows`/`duplicate_pct` se pueblan push-down** (desde v1.1.0) con `count(*)` sobre `SELECT DISTINCT *` (sin traer filas a RAM): `duplicate_rows = n_rows - filas_distintas`, `duplicate_pct` en fraccion 0-1. Habilitan la dimension de unicidad de registro del score de dataset (`profile_table` paso 6). Si la tabla tiene tipos no comparables con `DISTINCT` (BLOB/LIST/MAP) la query degrada y ambas vuelven a `None` (renormaliza el score a solo `cell_quality`).
- **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica).
- **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage).
- **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base.
@@ -196,21 +196,6 @@ def summarize_table_duckdb(
sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
)
# Unicidad de registro: filas duplicadas via COUNT de filas distintas
# push-down (DISTINCT *), sin traer filas a RAM. Habilita la dimension
# de uniqueness del score de dataset (1 - duplicate_pct). Degrada a None
# si la tabla tiene tipos no comparables con DISTINCT (BLOB/LIST/MAP).
duplicate_rows = None
duplicate_pct = None
if n_rows > 0:
dup_res = duckdb_query_readonly(
db_path, f"SELECT count(*) AS c FROM (SELECT DISTINCT * FROM {quoted})"
)
if dup_res["status"] == "ok" and dup_res["rows"]:
distinct_rows = int(dup_res["rows"][0]["c"])
duplicate_rows = max(0, n_rows - distinct_rows)
duplicate_pct = duplicate_rows / n_rows # fraccion 0-1
profile = {
"table": table,
"source": "duckdb",
@@ -218,8 +203,8 @@ def summarize_table_duckdb(
"n_rows": n_rows,
"n_cols": len(columns),
"size_bytes": None,
"duplicate_rows": duplicate_rows,
"duplicate_pct": duplicate_pct,
"duplicate_rows": None,
"duplicate_pct": None,
"constant_cols": constant_cols,
"all_null_cols": all_null_cols,
"null_cell_pct": null_cell_pct,
@@ -54,30 +54,6 @@ def test_shape_y_metadatos_tabla(db):
assert profile["correlations"] is None
def test_duplicate_pct_sin_duplicados(db):
"""Tabla con todas las filas distintas: duplicate_pct = 0, no None."""
profile = summarize_table_duckdb(db, "ventas")["profile"]
assert profile["duplicate_rows"] == 0
assert profile["duplicate_pct"] == 0.0
def test_duplicate_pct_con_duplicados(tmp_path):
"""Filas repetidas: duplicate_rows/duplicate_pct se pueblan push-down."""
path = str(tmp_path / "dups.duckdb")
con = duckdb.connect(path)
con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)")
# 5 filas, 2 de ellas idénticas a otras -> 2 duplicadas sobre 5 = 0.4.
con.execute(
"INSERT INTO t VALUES "
"(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')"
)
con.close()
profile = summarize_table_duckdb(path, "t")["profile"]
assert profile["n_rows"] == 5
assert profile["duplicate_rows"] == 2
assert profile["duplicate_pct"] == 0.4
def test_column_profile_shape(db):
profile = summarize_table_duckdb(db, "ventas")["profile"]
by_name = {c["name"]: c for c in profile["columns"]}
+1 -10
View File
@@ -4,7 +4,7 @@ kind: pipeline
lang: py
domain: pipelines
purity: impure
version: "1.1.0"
version: "1.0.0"
signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict"
description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla."
tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries]
@@ -114,12 +114,3 @@ para auditar la calidad de una tabla ya productiva. Reemplaza orquestar a mano
Formatos exoticos pueden descartarse silenciosamente del calculo numerico.
- `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el
sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red).
- **Score de calidad (report 2046, desde v1.1.0).** Paso 5: cada columna recibe
`quality_score` de `column_quality_score` con la formula 60/40
(completeness/validity); al promocionar texto a numero/fecha se expone
`col["validity_rate"]` (parse rate de la muestra) para alimentar la dimension
validity. Paso 6: el score de dataset NO es la media simple — es
`100 * (0.85*cell_quality + 0.15*row_uniqueness)`, donde
`cell_quality = media(score_col/100)` y `row_uniqueness = 1 - duplicate_pct`.
Si `duplicate_pct` es `None` (backend sin calcularlo) el score se renormaliza
a solo `cell_quality`. Los outliers NO bajan el score (van a `observations`).
+2 -36
View File
@@ -477,18 +477,9 @@ def profile_table(
if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
col["inferred_type"] = "numeric"
inferred = "numeric"
# Tasa de parseo real de la muestra: alimenta la
# dimension validity de column_quality_score (fraccion
# de valores conformes al tipo numerico promovido).
col["validity_rate"] = len(ok) / len(vals)
elif semantic in _DATETIME_SEMANTIC:
col["inferred_type"] = "datetime"
inferred = "datetime"
# Tasa de parseo de la muestra a fecha (mismo papel que el
# parse rate numerico) para la dimension validity.
parsed_dt = [_to_ordinal_days(v) for v in vals]
ok_dt = [d for d in parsed_dt if d is not None]
col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None
# 4) Enriquecer segun el inferred_type final.
if inferred == "numeric":
@@ -515,36 +506,11 @@ def profile_table(
# 5) Score de calidad por columna.
col["quality_score"] = column_quality_score(col).get("score")
# 6) Score agregado de la tabla (report 2046): NO media simple.
# cell_quality = media de los scores de columna, en [0,1].
# row_uniqueness = 1 - duplicate_pct (unicidad de registro).
# score = 100 * (0.85*cell_quality + 0.15*row_uniqueness).
# Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular.
# 6) Score agregado de la tabla (media de columnas).
scores = [
c["quality_score"] for c in cols if c.get("quality_score") is not None
]
if scores:
cell_quality = (sum(scores) / len(scores)) / 100.0
dup_pct = prof.get("duplicate_pct")
if dup_pct is not None:
try:
d = float(dup_pct)
except (TypeError, ValueError):
d = None
else:
d = None
if d is not None:
# Tolerar escala 0-100 por si algun backend la entrega asi.
if d > 1.0:
d = d / 100.0
row_uniqueness = max(0.0, min(1.0, 1.0 - d))
prof["quality_score"] = round(
100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1
)
else:
prof["quality_score"] = round(100.0 * cell_quality, 1)
else:
prof["quality_score"] = None
prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None
# 7) Candidatos a clave.
key_candidates = []