Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a2074a0167 |
@@ -89,35 +89,6 @@ _DEF_MAX_CARD = 20
|
||||
_DEF_MAX_MEASURES = 4
|
||||
_DEF_TOP_N = 12
|
||||
|
||||
# Glossary terms this chapter explains. Both appear in the always-rendered intro,
|
||||
# so they are registered and marked clickable whenever a collector is in ctx —
|
||||
# the canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
|
||||
# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
|
||||
# block. Mapping key -> (label, definition).
|
||||
_TERM_DEFS = {
|
||||
"groupby": (
|
||||
"Agrupación (split-apply-combine)",
|
||||
"Operación de agrupación (group by): parte la tabla en grupos según los "
|
||||
"valores de una columna categórica, aplica un cálculo (conteo, media, "
|
||||
"mediana…) dentro de cada grupo y combina los resultados en una tabla "
|
||||
"resumen. Es el patrón split-apply-combine."),
|
||||
"pivot_table": (
|
||||
"Tabla dinámica (pivot)",
|
||||
"Tabla dinámica que cruza dos variables categóricas — una en las filas y "
|
||||
"otra en las columnas — y rellena cada celda con un agregado (media, "
|
||||
"suma…) de una medida numérica. Resume de un vistazo cómo interactúan las "
|
||||
"dos categóricas sobre esa medida."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
it), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the other chapters' defensive style).
|
||||
@@ -554,18 +525,13 @@ def _sections_live(profile: dict, ctx: dict, candidates: dict) -> list:
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _intro_blocks(gloss=None, mark_term: bool = False) -> list:
|
||||
if gloss is not None:
|
||||
for key, (label, definition) in _TERM_DEFS.items():
|
||||
gloss.add(key, label, definition)
|
||||
t_groupby = _term(mark_term, "groupby", "**por grupos** (split-apply-combine)")
|
||||
t_pivot = _term(mark_term, "pivot_table", "**tablas dinámicas** (pivot)")
|
||||
def _intro_blocks() -> list:
|
||||
text = (
|
||||
f"Este capítulo analiza la tabla {t_groupby}: "
|
||||
"Este capítulo analiza la tabla **por grupos** (split-apply-combine): "
|
||||
"elige las columnas categóricas más informativas — por su cardinalidad "
|
||||
"y relevancia, no todas contra todas, para no inflar comparaciones "
|
||||
"espurias — y resume las variables numéricas dentro de cada grupo "
|
||||
f"(conteo, media, mediana, desviación). Las {t_pivot} "
|
||||
"(conteo, media, mediana, desviación). Las **tablas dinámicas** (pivot) "
|
||||
"cruzan dos categóricas sobre una medida, y los **gráficos de barras** "
|
||||
"(siempre desde cero) comparan los grupos de un vistazo."
|
||||
)
|
||||
@@ -590,21 +556,13 @@ def build_agregacion(profile: dict, ctx: dict):
|
||||
if not isinstance(profile, dict):
|
||||
return None
|
||||
|
||||
# Shared glossary collector: groupby + pivot_table live in the always-present
|
||||
# intro, so they are registered + marked there. Degrades silently (mark_term
|
||||
# False) when no collector is in ctx (standalone render).
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
|
||||
# Pre-computed results take precedence (offline / tests / forward-compat).
|
||||
pre = ctx.get("aggregations")
|
||||
if _is_dict(pre) and (pre.get("groupby") or pre.get("pivots")):
|
||||
sections = _sections_from_precomputed(pre)
|
||||
if not sections:
|
||||
return None
|
||||
blocks = (_intro_blocks(gloss, mark_term) + sections
|
||||
+ _insights_section(ctx))
|
||||
blocks = _intro_blocks() + sections + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -625,11 +583,10 @@ def build_agregacion(profile: dict, ctx: dict):
|
||||
"crudos. Pasa ctx['db_path'] + ctx['table'] (para el cálculo "
|
||||
"push-down en DuckDB) o ctx['aggregations'] ya precalculado. "
|
||||
f"Columnas categóricas candidatas: {keys or '—'}.")
|
||||
blocks = (_intro_blocks(gloss, mark_term) + [note]
|
||||
+ _insights_section(ctx))
|
||||
blocks = _intro_blocks() + [note] + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
blocks = _intro_blocks(gloss, mark_term) + sections + _insights_section(ctx)
|
||||
blocks = _intro_blocks() + sections + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -254,25 +254,3 @@ def test_anti_corte_muchos_grupos_y_texto_largo():
|
||||
# First, middle and last words of the long paragraph all present.
|
||||
for i in (0, 60, 119):
|
||||
assert f"palabra{i}" in txt
|
||||
|
||||
|
||||
def test_glosario_engancha_groupby_y_pivot():
|
||||
"""Mejora 4b: la agrupación (split-apply-combine) y la tabla dinámica (pivot)
|
||||
se registran en el colector compartido y se marcan clicables en el cuerpo.
|
||||
Sin colector en ctx, el capítulo degrada y no marca nada."""
|
||||
from datascience.automatic_eda.model import GlossaryCollector
|
||||
|
||||
g = GlossaryCollector()
|
||||
ctx = dict(_ctx_precomputed())
|
||||
ctx["glossary"] = g
|
||||
ch = build_agregacion(_profile(), ctx)
|
||||
assert ch is not None
|
||||
keys = {t["key"] for t in g.terms()}
|
||||
assert {"groupby", "pivot_table"} <= keys
|
||||
body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
|
||||
assert "[[term:groupby]]" in body and "[[term:pivot_table]]" in body
|
||||
|
||||
# Sin colector: degrada limpio (ningún marcador en el cuerpo).
|
||||
ch2 = build_agregacion(_profile(), _ctx_precomputed())
|
||||
body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
|
||||
assert "[[term:" not in body2
|
||||
|
||||
@@ -1,22 +1,26 @@
|
||||
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
||||
|
||||
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
||||
chapter answers, in Spanish and as tables, the three things the user asked for:
|
||||
chapter implements the quality model of report 2046:
|
||||
|
||||
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
|
||||
their weights (completeness, validity, consistency) before any number, plus a
|
||||
table-level summary (global score and aggregates).
|
||||
1. **En qué se basa la calidad** — an intro paragraph explaining the two scored
|
||||
dimensions and their weights (completitud 60%, validez 40%) plus the
|
||||
table-level row uniqueness, BEFORE any number, and stating explicitly that
|
||||
outliers are reported as observations and do **not** lower the score. The
|
||||
criteria terms (calidad de datos, completitud, validez, unicidad de registro)
|
||||
are hooked into the shared glossary as clickable jumps.
|
||||
2. **Scores por columna** — a table with, per column, the total quality score and
|
||||
its breakdown into completeness / validity / consistency.
|
||||
3. **Problemas en español** — a second table listing, per column, the readable
|
||||
issues in Spanish (kept separate from the type ``flags``).
|
||||
its breakdown into completeness / validity (no consistency dimension).
|
||||
3. **Problemas de calidad** — a table listing ONLY real quality defects
|
||||
(nulls, empty cells, values not conforming to their type/semantics).
|
||||
4. **Observaciones analíticas** — a SEPARATE table for outliers, constant
|
||||
columns, high-cardinality ids and strong skew, with an explicit note that
|
||||
these do not affect the score.
|
||||
|
||||
The breakdown and the issues are NOT recomputed here: they come from the registry
|
||||
function ``column_quality_score`` (group ``eda``), which already derives
|
||||
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
|
||||
This chapter is render-only — it consumes that function and lays the result out
|
||||
as model blocks; the renderers paginate tables (splitting by rows, repeating the
|
||||
header) and wrap long cells so nothing is ever cut.
|
||||
The breakdown, issues and observations are NOT recomputed here: they come from
|
||||
the registry function ``column_quality_score`` (group ``eda``), which derives
|
||||
``{score, completeness, validity, dimensions, applicable, issues,
|
||||
observations}`` from the ColumnProfile. This chapter is render-only.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
@@ -33,28 +37,47 @@ try: # pragma: no cover - import wiring
|
||||
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
||||
_column_quality_score = None
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "2.0.0"
|
||||
CHAPTER_ID = "calidad"
|
||||
CHAPTER_TITLE = "Calidad"
|
||||
|
||||
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
|
||||
# consistency 0.2. Kept here only to render the human explanation; the actual
|
||||
# numbers always come from the function so the two never drift in computation.
|
||||
_CRITERIA_INTRO = (
|
||||
"La calidad de cada columna es un score de 0 a 100 que combina tres "
|
||||
"criterios, cada uno con un peso:\n\n"
|
||||
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
|
||||
"ni vacíos). Una columna con muchos nulos baja de score.\n"
|
||||
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
|
||||
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
|
||||
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
|
||||
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
|
||||
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
|
||||
"Los problemas detectados por columna se listan en español más abajo."
|
||||
)
|
||||
# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
|
||||
# collector and marked clickable on their first appearance (contract §11.1).
|
||||
_TERMS = {
|
||||
"calidad_datos": (
|
||||
"Calidad de datos (score 0-100)",
|
||||
"Mide hasta qué punto los datos están presentes y son utilizables tal "
|
||||
"cual, no si son «buenos para el análisis». Se compone solo de "
|
||||
"dimensiones medibles automáticamente desde el perfil de la tabla, sin "
|
||||
"fuente externa de verdad: completitud (60%), validez (40%, cuando es "
|
||||
"medible) y, a nivel de tabla, unicidad de registro. Los valores "
|
||||
"atípicos NO bajan la calidad: se listan aparte como observaciones.",
|
||||
),
|
||||
"completitud": (
|
||||
"Completitud",
|
||||
"Proporción de valores realmente presentes en una columna (1 − % de "
|
||||
"nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
|
||||
"nulos y vacíos bajan el score porque falta información que debería "
|
||||
"estar. Pesa el 60% del score de columna.",
|
||||
),
|
||||
"validez": (
|
||||
"Validez",
|
||||
"Proporción de valores que encajan con su tipo o formato esperado: un "
|
||||
"número que parsea, una fecha legible, un email con forma de email. Los "
|
||||
"valores que no parsean a su tipo bajan el score. Si la columna es texto "
|
||||
"libre sin formato esperado, la validez no se puede medir y el score se "
|
||||
"basa solo en la completitud. Pesa el 40% del score cuando es medible.",
|
||||
),
|
||||
"unicidad_registro": (
|
||||
"Unicidad de registro",
|
||||
"A nivel de tabla, las filas duplicadas restan calidad al conjunto "
|
||||
"(1 − % de filas duplicadas). Es distinta de que una columna no-clave "
|
||||
"repita valores, que no es un defecto de calidad.",
|
||||
),
|
||||
}
|
||||
|
||||
# Cap for the joined issues cell so a single row never grows taller than a page;
|
||||
# the remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||
# Cap for the joined cell so a single row never grows taller than a page; the
|
||||
# remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||
_ISSUES_MAXLEN = 160
|
||||
|
||||
|
||||
@@ -82,12 +105,19 @@ def _fmt_unit_pct(value) -> str:
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_validity(value) -> str:
|
||||
"""Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
|
||||
if value is None:
|
||||
return "n/a"
|
||||
return _fmt_unit_pct(value)
|
||||
|
||||
|
||||
def _quality_of(col: dict) -> dict:
|
||||
"""Return ``{score, completeness, validity, consistency, issues}`` for a column.
|
||||
"""Return the quality dict for a column.
|
||||
|
||||
Uses the registry ``column_quality_score`` when available; otherwise falls
|
||||
back to the per-column ``quality_score`` already in the profile (number only,
|
||||
empty breakdown/issues). Never raises.
|
||||
empty breakdown/issues/observations). Never raises.
|
||||
"""
|
||||
if not isinstance(col, dict):
|
||||
col = {}
|
||||
@@ -98,26 +128,25 @@ def _quality_of(col: dict) -> dict:
|
||||
return res
|
||||
except Exception: # noqa: BLE001 - degrade instead of aborting.
|
||||
pass
|
||||
# Fallback: only the final score is available pre-computed in the profile.
|
||||
return {
|
||||
"score": col.get("quality_score"),
|
||||
"completeness": None,
|
||||
"validity": None,
|
||||
"consistency": None,
|
||||
"issues": [],
|
||||
"observations": [],
|
||||
}
|
||||
|
||||
|
||||
def _join_issues(issues) -> str:
|
||||
"""Join Spanish issue strings into one cell, truncating overly long lists.
|
||||
def _join_cells(items) -> str:
|
||||
"""Join Spanish strings into one cell, truncating overly long lists.
|
||||
|
||||
The renderer wraps cell text, but a column with many long issues could make a
|
||||
single row taller than a whole page; cap the length and append ``(+N más)``
|
||||
so the count of hidden issues is honest rather than silently lost.
|
||||
The renderer wraps cell text, but a column with many long entries could make
|
||||
a single row taller than a whole page; cap the length and append ``(+N más)``
|
||||
so the count of hidden entries is honest rather than silently lost.
|
||||
"""
|
||||
if not isinstance(issues, (list, tuple)) or not issues:
|
||||
if not isinstance(items, (list, tuple)) or not items:
|
||||
return ""
|
||||
parts = [model._safe_str(i).strip() for i in issues]
|
||||
parts = [model._safe_str(i).strip() for i in items]
|
||||
parts = [p for p in parts if p]
|
||||
if not parts:
|
||||
return ""
|
||||
@@ -142,6 +171,33 @@ def _columns_with_quality(profile: dict):
|
||||
yield c, _quality_of(c)
|
||||
|
||||
|
||||
def _fmt_unit_pct_or_pct(value) -> str:
|
||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
pct = num * 100 if num <= 1.0 else num
|
||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text}%"
|
||||
|
||||
|
||||
def _row_uniqueness(profile: dict):
|
||||
"""Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown."""
|
||||
dup = profile.get("duplicate_pct")
|
||||
if dup is None:
|
||||
return None
|
||||
try:
|
||||
d = float(dup)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if d > 1.0: # tolerate a 0-100 scale
|
||||
d = d / 100.0
|
||||
return max(0.0, min(1.0, 1.0 - d))
|
||||
|
||||
|
||||
def _summary_block(profile: dict, evaluated: list):
|
||||
"""Table-level KVTable: global score and quality aggregates."""
|
||||
rows = []
|
||||
@@ -153,14 +209,15 @@ def _summary_block(profile: dict, evaluated: list):
|
||||
if isinstance(q.get("completeness"), (int, float))]
|
||||
vals = [q.get("validity") for _, q in evaluated
|
||||
if isinstance(q.get("validity"), (int, float))]
|
||||
cons = [q.get("consistency") for _, q in evaluated
|
||||
if isinstance(q.get("consistency"), (int, float))]
|
||||
if comps:
|
||||
rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
|
||||
if vals:
|
||||
rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals))))
|
||||
if cons:
|
||||
rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons))))
|
||||
rows.append(("Validez media (donde aplica)",
|
||||
_fmt_unit_pct(sum(vals) / len(vals))))
|
||||
|
||||
ru = _row_uniqueness(profile)
|
||||
if ru is not None:
|
||||
rows.append(("Unicidad de registro", _fmt_unit_pct(ru)))
|
||||
|
||||
n_problem = sum(1 for _, q in evaluated if q.get("issues"))
|
||||
rows.append(("Columnas con problemas", str(n_problem)))
|
||||
@@ -182,22 +239,9 @@ def _summary_block(profile: dict, evaluated: list):
|
||||
return model.KVTable(rows=rows, title="Resumen de calidad")
|
||||
|
||||
|
||||
def _fmt_unit_pct_or_pct(value) -> str:
|
||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
pct = num * 100 if num <= 1.0 else num
|
||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text}%"
|
||||
|
||||
|
||||
def _scores_block(evaluated: list):
|
||||
"""DataTable with per-column score and its three-criteria breakdown."""
|
||||
header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
|
||||
"""DataTable with per-column score and its completeness/validity breakdown."""
|
||||
header = ["Columna", "Calidad", "Completitud", "Validez"]
|
||||
rows = []
|
||||
# Worst columns first so the reader sees the problems at the top.
|
||||
ordered = sorted(
|
||||
@@ -210,22 +254,22 @@ def _scores_block(evaluated: list):
|
||||
col.get("name") or "(col)",
|
||||
_fmt_score(q.get("score")),
|
||||
_fmt_unit_pct(q.get("completeness")),
|
||||
_fmt_unit_pct(q.get("validity")),
|
||||
_fmt_unit_pct(q.get("consistency")),
|
||||
_fmt_validity(q.get("validity")),
|
||||
])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(header=header, rows=rows,
|
||||
title="Scores de calidad por columna",
|
||||
note="0 = peor, 100 = mejor; ordenado de peor a mejor")
|
||||
note="0 = peor, 100 = mejor; «n/a» = dimensión no "
|
||||
"medible; ordenado de peor a mejor")
|
||||
|
||||
|
||||
def _issues_block(evaluated: list):
|
||||
"""DataTable listing Spanish issues per column, or a Note when there are none."""
|
||||
header = ["Columna", "Problemas detectados (español)"]
|
||||
"""DataTable listing ONLY real quality defects per column, or a Note."""
|
||||
header = ["Columna", "Problemas de calidad (español)"]
|
||||
rows = []
|
||||
for col, q in evaluated:
|
||||
joined = _join_issues(q.get("issues"))
|
||||
joined = _join_cells(q.get("issues"))
|
||||
if joined:
|
||||
rows.append([col.get("name") or "(col)", joined])
|
||||
if not rows:
|
||||
@@ -235,6 +279,63 @@ def _issues_block(evaluated: list):
|
||||
title="Problemas de calidad por columna")
|
||||
|
||||
|
||||
def _observations_block(evaluated: list):
|
||||
"""DataTable listing analytical observations per column, or None.
|
||||
|
||||
Observations (outliers, constant columns, ids, strong skew) are NOT quality
|
||||
defects: they do not affect the score. Returned as a separate table from the
|
||||
issues so the report never presents a legitimate outlier as a problem.
|
||||
"""
|
||||
header = ["Columna", "Observaciones analíticas"]
|
||||
rows = []
|
||||
for col, q in evaluated:
|
||||
joined = _join_cells(q.get("observations"))
|
||||
if joined:
|
||||
rows.append([col.get("name") or "(col)", joined])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(
|
||||
header=header, rows=rows,
|
||||
title="Observaciones analíticas por columna",
|
||||
note="No son defectos de calidad y NO afectan al score; orientan el "
|
||||
"análisis (atípicos, columnas constantes, identificadores).")
|
||||
|
||||
|
||||
def _term(key: str, label: str, mark: bool) -> str:
|
||||
"""Render a term as a clickable glossary span when marking is enabled."""
|
||||
if mark:
|
||||
return f"[[term:{key}]]**{label}**[[/term]]"
|
||||
return f"**{label}**"
|
||||
|
||||
|
||||
def _criteria_intro(mark: bool) -> str:
|
||||
"""Intro paragraph explaining the two scored dimensions and the principle."""
|
||||
calidad = _term("calidad_datos", "calidad de datos", mark)
|
||||
completitud = _term("completitud", "Completitud (peso 60%)", mark)
|
||||
validez = _term("validez", "Validez (peso 40%, cuando es medible)", mark)
|
||||
unicidad = _term("unicidad_registro", "unicidad de registro", mark)
|
||||
return (
|
||||
f"La {calidad} de cada columna es un score de 0 a 100 que combina solo "
|
||||
"dimensiones medibles desde el perfil de la tabla, sin fuente externa "
|
||||
"de verdad:\n\n"
|
||||
f"- {completitud}: proporción de valores presentes (1 − % de nulos; en "
|
||||
"texto, las celdas vacías cuentan como faltantes). Los nulos y vacíos "
|
||||
"bajan el score.\n"
|
||||
f"- {validez}: proporción de valores que encajan con su tipo o formato "
|
||||
"(un número que parsea, una fecha legible, un email con forma de email). "
|
||||
"Si una columna es texto libre sin formato esperado, la validez no se "
|
||||
"mide y el score se basa solo en la completitud.\n\n"
|
||||
f"Score de columna = 100 × (0,6·completitud + 0,4·validez), "
|
||||
"renormalizado cuando la validez no aplica. A nivel de tabla se añade "
|
||||
f"la {unicidad} (1 − % de filas duplicadas).\n\n"
|
||||
"**Los valores atípicos (outliers) NO bajan la calidad.** Un valor "
|
||||
"extremo puede ser real y correcto; detectar atípicos es parte del "
|
||||
"análisis de la distribución, no un juicio de corrección. Por eso, junto "
|
||||
"con las columnas constantes y los identificadores, se listan aparte "
|
||||
"como **observaciones analíticas** que no afectan al score."
|
||||
)
|
||||
|
||||
|
||||
def build_calidad(profile: dict, ctx: dict):
|
||||
"""Build the data-quality Chapter, or None if the profile has no columns.
|
||||
|
||||
@@ -250,17 +351,35 @@ def build_calidad(profile: dict, ctx: dict):
|
||||
if not evaluated:
|
||||
return None # no columns to score -> chapter does not apply.
|
||||
|
||||
# Register the criteria terms in the shared glossary (if present) and mark
|
||||
# their first appearance clickable. Contract §11.1.
|
||||
glossary = ctx.get("glossary")
|
||||
mark = False
|
||||
if isinstance(glossary, model.GlossaryCollector):
|
||||
for key, (label, definition) in _TERMS.items():
|
||||
glossary.add(key, label, definition)
|
||||
mark = True
|
||||
|
||||
blocks = [
|
||||
model.Heading(text="Cómo se calcula la calidad", level=2),
|
||||
model.Markdown(text=_CRITERIA_INTRO),
|
||||
model.Markdown(text=_criteria_intro(mark)),
|
||||
_summary_block(profile, evaluated),
|
||||
model.Heading(text="Scores por columna", level=2),
|
||||
]
|
||||
scores = _scores_block(evaluated)
|
||||
if scores is not None:
|
||||
blocks.append(scores)
|
||||
blocks.append(model.Heading(text="Problemas detectados", level=2))
|
||||
|
||||
blocks.append(model.Heading(text="Problemas de calidad", level=2))
|
||||
blocks.append(_issues_block(evaluated))
|
||||
|
||||
observations = _observations_block(evaluated)
|
||||
if observations is not None:
|
||||
blocks.append(model.Heading(text="Observaciones analíticas", level=2))
|
||||
blocks.append(model.Note(
|
||||
"Las observaciones siguientes NO son defectos de calidad y no "
|
||||
"afectan al score: son señales para orientar el análisis."))
|
||||
blocks.append(observations)
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
|
||||
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut + glossary.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that the chapter explains the quality criteria, shows
|
||||
per-column scores with the completeness/validity/consistency breakdown, lists the
|
||||
issues in Spanish (separate from the type flags), returns None when it does not
|
||||
apply, and that a wide profile with long names renders to PDF and PPTX without
|
||||
cutting any cell text (long content wraps, it is never truncated).
|
||||
and deterministic. Verifies the report-2046 quality model: the chapter explains
|
||||
the two scored dimensions (completitud 60% / validez 40%), shows per-column
|
||||
scores without a consistency column, keeps quality DEFECTS (issues) separate
|
||||
from analytical OBSERVATIONS (outliers, constant, ids), hooks the criteria terms
|
||||
into the glossary, returns None when it does not apply, and renders a wide
|
||||
profile to PDF and PPTX without cutting any cell text.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -20,28 +21,30 @@ from datascience.automatic_eda.chapters.calidad import (
|
||||
CHAPTER_VERSION,
|
||||
)
|
||||
from datascience.automatic_eda import build_document, render_pdf, render_pptx
|
||||
from datascience.automatic_eda import model
|
||||
|
||||
|
||||
def _profile() -> dict:
|
||||
"""A small profile with one column per quality problem (nulls, outliers,
|
||||
constant, high-cardinality id) plus one clean column."""
|
||||
constant, high-cardinality id) plus one clean column. ``outlier_pct`` is in
|
||||
the 0-100 scale that describe_numeric actually emits."""
|
||||
return {
|
||||
"table": "demo",
|
||||
"quality_score": 72.5,
|
||||
"quality_score": 82.0,
|
||||
"duplicate_pct": 0.04,
|
||||
"null_cell_pct": 0.11,
|
||||
"constant_cols": ["flag_const"],
|
||||
"all_null_cols": [],
|
||||
"columns": [
|
||||
{"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
|
||||
"numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
|
||||
"quality_score": 60},
|
||||
{"name": "edad", "inferred_type": "numeric", "null_pct": 0.2,
|
||||
"n_rows": 100, "unique_pct": 0.5,
|
||||
"numeric": {"outlier_pct": 15.0, "min": 0, "max": 99}},
|
||||
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
|
||||
"unique_pct": 0.98, "quality_score": 80},
|
||||
"unique_pct": 0.98, "flags": ["possible_id"]},
|
||||
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
|
||||
"flags": ["constant"], "quality_score": 50},
|
||||
{"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}, "quality_score": 100},
|
||||
"unique_pct": 0.01, "flags": ["constant"]},
|
||||
{"name": "limpia", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||
],
|
||||
}
|
||||
|
||||
@@ -50,16 +53,9 @@ def _tables(chapter):
|
||||
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
|
||||
|
||||
|
||||
def _scores_table(chapter):
|
||||
def _table_by_title(chapter, needle):
|
||||
for t in _tables(chapter):
|
||||
if "Scores" in (t.title or ""):
|
||||
return t
|
||||
return None
|
||||
|
||||
|
||||
def _issues_table(chapter):
|
||||
for t in _tables(chapter):
|
||||
if "Problemas" in (t.title or ""):
|
||||
if needle in (t.title or ""):
|
||||
return t
|
||||
return None
|
||||
|
||||
@@ -73,41 +69,84 @@ def test_golden_chapter_estructura_y_version():
|
||||
assert ch.id == "calidad"
|
||||
assert ch.version == CHAPTER_VERSION
|
||||
kinds = [b.kind for b in ch.blocks]
|
||||
# intro heading + markdown criteria + summary kv + scores table + issues table
|
||||
assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds
|
||||
|
||||
|
||||
def test_golden_intro_explica_criterios_y_pesos():
|
||||
def test_golden_intro_explica_dos_dimensiones_y_pesos():
|
||||
ch = build_calidad(_profile(), {})
|
||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||
for needle in ("Completitud", "Validez", "Consistencia",
|
||||
"50%", "30%", "20%"):
|
||||
for needle in ("Completitud", "Validez", "60%", "40%",
|
||||
"unicidad de registro"):
|
||||
assert needle in intro, f"falta {needle!r} en la intro de criterios"
|
||||
# El principio: los outliers NO bajan la calidad.
|
||||
assert "atípicos" in intro and "NO bajan" in intro
|
||||
# Ya no se menciona la dimensión consistencia eliminada.
|
||||
assert "20%" not in intro
|
||||
|
||||
|
||||
def test_golden_scores_incluyen_desglose_por_criterio():
|
||||
def test_golden_scores_sin_columna_consistencia():
|
||||
ch = build_calidad(_profile(), {})
|
||||
scores = _scores_table(ch)
|
||||
scores = _table_by_title(ch, "Scores")
|
||||
assert scores is not None
|
||||
assert scores.header == ["Columna", "Calidad", "Completitud",
|
||||
"Validez", "Consistencia"]
|
||||
# 4 columns scored, none dropped.
|
||||
assert scores.header == ["Columna", "Calidad", "Completitud", "Validez"]
|
||||
assert "Consistencia" not in scores.header
|
||||
assert len(scores.rows) == 4
|
||||
names = {r[0] for r in scores.rows}
|
||||
assert names == {"edad", "nombre", "flag_const", "limpia"}
|
||||
|
||||
|
||||
def test_golden_issues_en_espanol_separados_de_flags():
|
||||
def test_golden_outliers_en_observaciones_no_en_problemas():
|
||||
ch = build_calidad(_profile(), {})
|
||||
issues = _issues_table(ch)
|
||||
assert issues is not None
|
||||
flat = " | ".join(" ".join(r) for r in issues.rows)
|
||||
assert "nulos" in flat # completeness issue (ES)
|
||||
assert "outliers" in flat # validity issue (ES)
|
||||
assert "columna constante" in flat
|
||||
assert "posible id de alta cardinalidad" in flat
|
||||
# The raw type flag string must NOT leak as a "problem".
|
||||
assert "constant" not in flat or "columna constante" in flat
|
||||
problemas = _table_by_title(ch, "Problemas de calidad")
|
||||
observaciones = _table_by_title(ch, "Observaciones")
|
||||
assert problemas is not None
|
||||
assert observaciones is not None
|
||||
|
||||
problemas_txt = " | ".join(" ".join(r) for r in problemas.rows)
|
||||
observaciones_txt = " | ".join(" ".join(r) for r in observaciones.rows)
|
||||
|
||||
# Los nulos SÍ son problema de calidad.
|
||||
assert "nulos" in problemas_txt
|
||||
# Los outliers NO aparecen como problema...
|
||||
assert "atípic" not in problemas_txt and "outlier" not in problemas_txt
|
||||
# ...sino como observación analítica.
|
||||
assert "atípic" in observaciones_txt
|
||||
# Constante e id: observaciones, no problemas.
|
||||
assert "constante" in observaciones_txt
|
||||
assert "identificador" in observaciones_txt
|
||||
assert "constante" not in problemas_txt
|
||||
|
||||
|
||||
def test_golden_score_columna_limpia_es_100():
|
||||
"""Columna sin nulos, numérica nativa: score 100 aunque tenga (o no) outliers."""
|
||||
ch = build_calidad(_profile(), {})
|
||||
scores = _table_by_title(ch, "Scores")
|
||||
by_name = {r[0]: r for r in scores.rows}
|
||||
assert by_name["limpia"][1] == "100 / 100"
|
||||
# edad: 20% nulos -> 100*(0.6*0.8 + 0.4*1.0) = 88; los outliers no bajan nada.
|
||||
assert by_name["edad"][1] == "88 / 100"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Glosario (contrato §11.1)
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_glosario_registra_los_cuatro_terminos_y_marca_clicable():
|
||||
glossary = model.GlossaryCollector()
|
||||
ch = build_calidad(_profile(), {"glossary": glossary})
|
||||
for key in ("calidad_datos", "completitud", "validez", "unicidad_registro"):
|
||||
assert glossary.has(key), f"término {key!r} no registrado en el glosario"
|
||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||
# Con colector presente, la primera aparición se marca clicable.
|
||||
assert "[[term:completitud]]" in intro
|
||||
assert "[[term:validez]]" in intro
|
||||
assert "[[term:calidad_datos]]" in intro
|
||||
assert "[[term:unicidad_registro]]" in intro
|
||||
|
||||
|
||||
def test_sin_glosario_no_marca_terminos():
|
||||
ch = build_calidad(_profile(), {}) # ctx sin glossary
|
||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||
assert "[[term:" not in intro
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
@@ -124,17 +163,17 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
|
||||
prof = {
|
||||
"quality_score": 100,
|
||||
"columns": [
|
||||
{"name": "a", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}},
|
||||
{"name": "b", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}},
|
||||
{"name": "a", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||
{"name": "b", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||
],
|
||||
}
|
||||
ch = build_calidad(prof, {})
|
||||
assert ch is not None
|
||||
assert _issues_table(ch) is None # no issues table
|
||||
assert _table_by_title(ch, "Problemas de calidad") is None # no issues table
|
||||
notes = [b for b in ch.blocks if b.kind == "note"]
|
||||
assert notes and "No se detectaron problemas" in notes[0].text
|
||||
assert any("No se detectaron problemas" in n.text for n in notes)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
@@ -143,44 +182,42 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
|
||||
def _wide_profile(ncols: int = 22) -> dict:
|
||||
cols = [
|
||||
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99,
|
||||
"flags": ["possible_id"]},
|
||||
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
|
||||
"inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.01,
|
||||
"flags": ["constant"]},
|
||||
]
|
||||
for k in range(ncols - 2):
|
||||
cols.append({
|
||||
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
|
||||
"inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
|
||||
"numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
|
||||
"inferred_type": "numeric", "null_pct": 0.1 + (k % 3) * 0.05,
|
||||
"unique_pct": 0.5,
|
||||
"numeric": {"outlier_pct": 8.0, "min": 0, "max": 1000},
|
||||
})
|
||||
return {"table": "ancha", "quality_score": 70.0, "columns": cols}
|
||||
return {"table": "ancha", "quality_score": 70.0, "duplicate_pct": 0.0,
|
||||
"columns": cols}
|
||||
|
||||
|
||||
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
|
||||
prof = _wide_profile(22)
|
||||
full = build_document(prof, {"dataset_name": "ancha"})
|
||||
assert any(c.id == "calidad" for c in full)
|
||||
# Render ONLY the calidad chapter so the anti-cut assertions are scoped to
|
||||
# this chapter (other chapters, e.g. portada, legitimately contain '…').
|
||||
chapters = [c for c in full if c.id == "calidad"]
|
||||
long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
pdf = os.path.join(d, "q.pdf")
|
||||
pptx = os.path.join(d, "q.pptx")
|
||||
rp = render_pdf(chapters, pdf, {"title": "EDA"})
|
||||
rx = render_pptx(chapters, pptx, {"title": "EDA"})
|
||||
render_pptx(chapters, pptx, {"title": "EDA"})
|
||||
assert os.path.exists(pdf) and os.path.exists(pptx)
|
||||
# The wide table forces pagination across several pages/slides.
|
||||
assert (rp or {}).get("n_pages", 0) >= 2
|
||||
|
||||
# PDF: the long name survives whole once wraps (spaces/newlines) removed,
|
||||
# and there is no truncation marker.
|
||||
pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
|
||||
assert "…" not in pdf_txt and "..." not in pdf_txt
|
||||
norm = re.sub(r"\s+", "", pdf_txt)
|
||||
assert long_name in norm, "el nombre largo se cortó en el PDF"
|
||||
|
||||
# PPTX: long name present in some cell, untruncated.
|
||||
allt = []
|
||||
for s in Presentation(pptx).slides:
|
||||
for sh in s.shapes:
|
||||
|
||||
@@ -47,53 +47,6 @@ _MAX_MATRIX_LABELS = 16
|
||||
# How many pairs to show in each of the top-positive / top-negative tables.
|
||||
_TOP_N = 10
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance in the
|
||||
# body — the canonical two-step pattern (see ``cat_distr`` for the reference
|
||||
# implementation): ``glossary.add(key, label, definition)`` + the inline span
|
||||
# ``[[term:KEY]]texto visible[[/term]]`` in a Markdown block. Mapping key ->
|
||||
# (label, definition). ``fdr`` is only registered when the FDR summary is present.
|
||||
_TERM_DEFS = {
|
||||
"pearson": (
|
||||
"Pearson (coeficiente r)",
|
||||
"Coeficiente de correlación lineal de Pearson (r) entre dos variables "
|
||||
"numéricas. Va de −1 (relación lineal inversa perfecta) a +1 (directa "
|
||||
"perfecta); 0 indica ausencia de relación lineal. Sólo capta relaciones "
|
||||
"lineales, por eso lleva signo."),
|
||||
"spearman": (
|
||||
"Spearman (correlación de rangos)",
|
||||
"Correlación de rangos de Spearman: el coeficiente de Pearson calculado "
|
||||
"sobre los puestos (rangos) de los valores en vez de sus magnitudes. Mide "
|
||||
"relaciones monótonas (no necesariamente lineales), va de −1 a +1 y es "
|
||||
"robusta frente a valores atípicos."),
|
||||
"cramers_v": (
|
||||
"Cramér's V",
|
||||
"Medida de asociación entre dos variables categóricas, derivada del "
|
||||
"estadístico chi-cuadrado y normalizada al rango 0–1 (0 = independientes, "
|
||||
"1 = asociación total). No tiene signo: sólo mide la intensidad."),
|
||||
"correlation_ratio": (
|
||||
"Razón de correlación (η)",
|
||||
"Razón de correlación (eta) entre una variable numérica y una "
|
||||
"categórica: la fracción de la varianza de la numérica explicada por los "
|
||||
"grupos de la categórica. Va de 0 (los grupos no explican nada) a 1 (la "
|
||||
"explican toda); no tiene signo."),
|
||||
"fdr": (
|
||||
"Comparaciones múltiples (FDR)",
|
||||
"Al evaluar muchos pares a la vez, algunos parecen significativos por "
|
||||
"puro azar. La corrección por tasa de falsos descubrimientos (FDR, "
|
||||
"Benjamini-Hochberg) ajusta los p-valores para controlar la proporción "
|
||||
"esperada de falsos positivos entre los pares declarados significativos."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
the marker), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _is_num(v) -> bool:
|
||||
"""True for a real, finite int/float (not bool, not NaN/inf)."""
|
||||
@@ -292,7 +245,7 @@ def _methods_block(corr: dict):
|
||||
return model.KVTable(rows=rows, title="Métodos de asociación")
|
||||
|
||||
|
||||
def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
|
||||
def _fdr_text(corr: dict) -> str | None:
|
||||
"""One-line summary of the multiple-testing (FDR) correction, or None."""
|
||||
mt = corr.get("multiple_testing")
|
||||
if not isinstance(mt, dict) or not mt:
|
||||
@@ -301,8 +254,7 @@ def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
|
||||
alpha = mt.get("alpha")
|
||||
n_tests = mt.get("n_tests")
|
||||
n_rej = mt.get("n_rejected")
|
||||
multi = _term(mark_term, "fdr", "comparaciones múltiples")
|
||||
parts = [f"Corrección por {multi} ({method}"]
|
||||
parts = [f"Corrección por comparaciones múltiples ({method}"]
|
||||
if _is_num(alpha):
|
||||
parts[0] += f", α={float(alpha):g}"
|
||||
parts[0] += ")."
|
||||
@@ -337,31 +289,13 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
|
||||
blocks: list = []
|
||||
|
||||
# Register the always-present method terms in the shared glossary and mark
|
||||
# their first appearance clickable (the FDR term is registered lazily below,
|
||||
# only when the FDR summary is actually emitted). Degrades silently when no
|
||||
# collector is in ctx (standalone render) — mark_term stays False.
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
if gloss is not None:
|
||||
for key in ("pearson", "spearman", "cramers_v", "correlation_ratio"):
|
||||
label, definition = _TERM_DEFS[key]
|
||||
gloss.add(key, label, definition)
|
||||
|
||||
# Intro: what this chapter shows and how to read the sign. Build the marked
|
||||
# method names as locals first (avoids backslash-in-f-string for "Cramér's V").
|
||||
t_pearson = _term(mark_term, "pearson", "Pearson")
|
||||
t_spearman = _term(mark_term, "spearman", "Spearman")
|
||||
t_cramers = _term(mark_term, "cramers_v", "Cramér's V")
|
||||
t_corr_ratio = _term(mark_term, "correlation_ratio", "razón de correlación")
|
||||
# Intro: what this chapter shows and how to read the sign.
|
||||
blocks.append(model.Markdown(text=(
|
||||
"Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
|
||||
f"sus tipos ({t_pearson}/{t_spearman} entre numéricas — con **signo**; "
|
||||
f"{t_cramers} entre categóricas; {t_corr_ratio} num-categórica; "
|
||||
"información mutua como medida común no lineal). Sólo las correlaciones "
|
||||
"**num-num** tienen dirección: por eso los pares **negativos** son siempre "
|
||||
"num-num.")))
|
||||
"sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
|
||||
"entre categóricas; razón de correlación num-categórica; información mutua "
|
||||
"como medida común no lineal). Sólo las correlaciones **num-num** tienen "
|
||||
"dirección: por eso los pares **negativos** son siempre num-num.")))
|
||||
|
||||
# 1) Association matrix (heatmap).
|
||||
labels, trimmed = _ordered_labels(pairs)
|
||||
@@ -403,13 +337,9 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
"no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
|
||||
"sobre los retornos/diferencias antes de interpretarlas.")))
|
||||
|
||||
# 4) FDR summary + methods legend. Register the FDR term only when its
|
||||
# summary is emitted, so the glossary never lists an unreferenced entry.
|
||||
fdr_text = _fdr_text(corr, mark_term=mark_term)
|
||||
# 4) FDR summary + methods legend.
|
||||
fdr_text = _fdr_text(corr)
|
||||
if fdr_text:
|
||||
if gloss is not None:
|
||||
label, definition = _TERM_DEFS["fdr"]
|
||||
gloss.add("fdr", label, definition)
|
||||
blocks.append(model.Markdown(text=fdr_text))
|
||||
methods = _methods_block(corr)
|
||||
if methods is not None:
|
||||
|
||||
@@ -173,25 +173,3 @@ def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
|
||||
assert rx["path"] == pptx and os.path.exists(pptx) and rx["n_slides"] >= 1
|
||||
# A short, unbreakable fragment of the long label survives the wrap.
|
||||
assert "azufre" in _pdf_text(pdf)
|
||||
|
||||
|
||||
def test_glosario_engancha_metodos_y_fdr():
|
||||
"""Mejora 4b: los métodos de correlación (Pearson, Spearman, Cramér's V,
|
||||
razón de correlación) y la corrección por comparaciones múltiples (FDR) se
|
||||
registran en el colector compartido y se marcan clicables en el cuerpo. Sin
|
||||
colector en ctx, el capítulo degrada y no marca nada."""
|
||||
from datascience.automatic_eda.model import GlossaryCollector
|
||||
|
||||
g = GlossaryCollector()
|
||||
ch = build_correlacion(_profile(), {"glossary": g})
|
||||
assert ch is not None
|
||||
keys = {t["key"] for t in g.terms()}
|
||||
assert {"pearson", "spearman", "cramers_v", "correlation_ratio", "fdr"} <= keys
|
||||
body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
|
||||
for k in ("pearson", "spearman", "cramers_v", "correlation_ratio", "fdr"):
|
||||
assert f"[[term:{k}]]" in body, k
|
||||
|
||||
# Sin colector: degrada limpio (ningún marcador en el cuerpo).
|
||||
ch2 = build_correlacion(_profile(), {})
|
||||
body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
|
||||
assert "[[term:" not in body2
|
||||
|
||||
@@ -55,62 +55,6 @@ _CLUSTER_COLORS = [
|
||||
"#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac",
|
||||
]
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance — the
|
||||
# canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
|
||||
# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
|
||||
# block. A term is registered only when its section is actually rendered, so the
|
||||
# glossary never lists an entry no in-text appearance points to.
|
||||
_TERM_DEFS = {
|
||||
"zscore": (
|
||||
"Estandarización z-score",
|
||||
"Transformación que lleva cada columna numérica a media 0 y desviación "
|
||||
"típica 1: a cada valor le resta la media de su columna y lo divide por "
|
||||
"la desviación típica. Así variables con escalas muy distintas (euros "
|
||||
"frente a un ratio 0–1) pesan por igual en las distancias y la varianza."),
|
||||
"pca": (
|
||||
"PCA (componentes principales)",
|
||||
"El análisis de componentes principales resume muchas variables "
|
||||
"numéricas correlacionadas en pocos ejes nuevos (componentes), "
|
||||
"ortogonales entre sí y ordenados por la cantidad de varianza que "
|
||||
"capturan. Permite ver la estructura de los datos en 2D y saber cuántas "
|
||||
"dimensiones bastan para explicarlos."),
|
||||
"kmeans": (
|
||||
"KMeans (segmentación)",
|
||||
"Algoritmo de agrupamiento no supervisado que reparte las filas en k "
|
||||
"segmentos: asigna cada fila al centro (centroide) más cercano y recoloca "
|
||||
"los centroides de forma iterativa hasta minimizar la distancia interna "
|
||||
"de cada grupo. Aquí k se elige automáticamente."),
|
||||
"silhouette": (
|
||||
"Coeficiente de silueta (silhouette)",
|
||||
"Métrica de calidad de un agrupamiento, en el rango −1 a 1: para cada "
|
||||
"fila compara cómo de cerca está de su propio segmento frente al segmento "
|
||||
"vecino más próximo. Cuanto más alto el promedio, más compactos y "
|
||||
"separados están los segmentos."),
|
||||
"isolation_forest": (
|
||||
"Isolation Forest (anomalías)",
|
||||
"Algoritmo de detección de anomalías multivariante: construye árboles que "
|
||||
"parten el espacio con cortes aleatorios y mide cuántos cortes hacen "
|
||||
"falta para aislar cada fila. Las filas raras se aíslan con muy pocos "
|
||||
"cortes y se marcan como outliers según un umbral de contaminación."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
it), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _register(gloss, key: str) -> None:
|
||||
"""Register term ``key`` in the collector (idempotent); no-op if gloss None."""
|
||||
if gloss is not None:
|
||||
label, definition = _TERM_DEFS[key]
|
||||
gloss.add(key, label, definition)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the overview chapter's defensive style).
|
||||
@@ -308,37 +252,34 @@ def _make_cluster_scatter(projection: dict):
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Section builders. Each returns a list of blocks (possibly empty).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _normalization_intro(gloss=None, mark_term: bool = False) -> list:
|
||||
_register(gloss, "zscore")
|
||||
zscore = _term(mark_term, "zscore", "**estandarizan con z-score**")
|
||||
def _normalization_intro() -> list:
|
||||
text = (
|
||||
"Estos modelos son **no supervisados**: buscan estructura latente sin "
|
||||
"una variable objetivo. Antes de aplicarlos, todas las columnas "
|
||||
f"numéricas se {zscore} (cada valor menos la media, dividido por la "
|
||||
"desviación típica). Sin esta normalización, una variable con escala "
|
||||
"grande (p.ej. ingresos en euros) dominaría las distancias y la varianza "
|
||||
"frente a otra de escala pequeña (p.ej. un ratio entre 0 y 1), sesgando "
|
||||
"tanto el PCA como el KMeans. Tras la estandarización todas las variables "
|
||||
"pesan por igual."
|
||||
"numéricas se **estandarizan con z-score** (cada valor menos la media, "
|
||||
"dividido por la desviación típica). Sin esta normalización, una "
|
||||
"variable con escala grande (p.ej. ingresos en euros) dominaría las "
|
||||
"distancias y la varianza frente a otra de escala pequeña (p.ej. un "
|
||||
"ratio entre 0 y 1), sesgando tanto el PCA como el KMeans. Tras la "
|
||||
"estandarización todas las variables pesan por igual."
|
||||
)
|
||||
return [model.Heading(text="Modelos no supervisados", level=1),
|
||||
model.Markdown(text=text)]
|
||||
|
||||
|
||||
def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
def _pca_section(pca: dict) -> list:
|
||||
if not _is_dict(pca) or not pca.get("explained_variance_ratio"):
|
||||
return []
|
||||
_register(gloss, "pca")
|
||||
blocks = [model.Heading(text="PCA — varianza explicada", level=2)]
|
||||
|
||||
n_used = pca.get("n_rows_used")
|
||||
n_feat = pca.get("n_features")
|
||||
intro = (
|
||||
f"El {_term(mark_term, 'pca', 'PCA')} resume {_fmt_num(n_feat)} variables "
|
||||
"numéricas en componentes ortogonales ordenados por la varianza que "
|
||||
f"capturan ({_fmt_num(n_used)} filas usadas tras eliminar nulos). El "
|
||||
"gráfico de sedimentación (scree) muestra cuánta varianza aporta cada "
|
||||
"componente y su acumulado: un codo marca cuántos componentes bastan."
|
||||
f"El PCA resume {_fmt_num(n_feat)} variables numéricas en componentes "
|
||||
f"ortogonales ordenados por la varianza que capturan "
|
||||
f"({_fmt_num(n_used)} filas usadas tras eliminar nulos). El gráfico de "
|
||||
"sedimentación (scree) muestra cuánta varianza aporta cada componente y "
|
||||
"su acumulado: un codo marca cuántos componentes bastan."
|
||||
)
|
||||
blocks.append(model.Markdown(text=intro))
|
||||
|
||||
@@ -384,14 +325,11 @@ def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
return blocks
|
||||
|
||||
|
||||
def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
gloss=None, mark_term: bool = False) -> list:
|
||||
def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
|
||||
has_km = _is_dict(kmeans) and kmeans.get("best_k")
|
||||
has_proj = _is_dict(projection) and projection.get("points")
|
||||
if not has_km and not has_proj:
|
||||
return []
|
||||
_register(gloss, "kmeans")
|
||||
_register(gloss, "silhouette")
|
||||
|
||||
blocks = [model.Heading(text="Segmentación (KMeans)", level=2)]
|
||||
|
||||
@@ -399,11 +337,9 @@ def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
sil = (projection or {}).get("silhouette")
|
||||
if sil is None:
|
||||
sil = (kmeans or {}).get("silhouette")
|
||||
t_kmeans = _term(mark_term, "kmeans", "KMeans")
|
||||
t_sil = _term(mark_term, "silhouette", "*silhouette*")
|
||||
intro = (
|
||||
f"{t_kmeans} agrupa las filas en **{_fmt_num(best_k)} segmentos** "
|
||||
f"elegidos automáticamente maximizando el coeficiente de {t_sil} "
|
||||
f"KMeans agrupa las filas en **{_fmt_num(best_k)} segmentos** elegidos "
|
||||
"automáticamente maximizando el coeficiente de *silhouette* "
|
||||
f"(**{_fmt_num(sil)}**, rango −1 a 1: cuanto más alto, segmentos más "
|
||||
"compactos y separados). Los segmentos se proyectan sobre el plano de "
|
||||
"los dos primeros componentes principales para visualizarlos."
|
||||
@@ -458,18 +394,16 @@ def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
return blocks
|
||||
|
||||
|
||||
def _outliers_section(outliers: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
def _outliers_section(outliers: dict) -> list:
|
||||
if not _is_dict(outliers) or outliers.get("n_outliers") is None:
|
||||
return []
|
||||
if outliers.get("note") and not outliers.get("n_rows_used"):
|
||||
# insufficient data — nothing meaningful to show.
|
||||
return []
|
||||
_register(gloss, "isolation_forest")
|
||||
blocks = [model.Heading(text="Detección de anomalías (Isolation Forest)",
|
||||
level=2)]
|
||||
isof = _term(mark_term, "isolation_forest", "**Isolation Forest**")
|
||||
explain = (
|
||||
f"{isof} detecta filas anómalas de forma *multivariante*: "
|
||||
"**Isolation Forest** detecta filas anómalas de forma *multivariante*: "
|
||||
"construye árboles que parten el espacio con cortes aleatorios y mide "
|
||||
"cuántos cortes hacen falta para aislar cada fila. Las filas raras "
|
||||
"(combinaciones de valores poco frecuentes considerando **todas las "
|
||||
@@ -550,21 +484,15 @@ def build_modelos(profile: dict, ctx: dict):
|
||||
(kmeans and kmeans.get("best_k")) or (projection and projection.get("points"))
|
||||
) else None
|
||||
|
||||
# Shared glossary collector: terms are registered + marked clickable inside
|
||||
# each section, only when that section actually renders (no orphan entries).
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
|
||||
sections = []
|
||||
sections += _pca_section(pca, gloss, mark_term) if pca else []
|
||||
sections += _kmeans_section(kmeans, projection, titles, gloss, mark_term)
|
||||
sections += _outliers_section(outliers, gloss, mark_term) if outliers else []
|
||||
sections += _pca_section(pca) if pca else []
|
||||
sections += _kmeans_section(kmeans, projection, titles)
|
||||
sections += _outliers_section(outliers) if outliers else []
|
||||
sections += _normality_section(normality) if normality else []
|
||||
|
||||
if not sections:
|
||||
return None # models block present but nothing renderable.
|
||||
|
||||
blocks = _normalization_intro(gloss, mark_term) + sections
|
||||
blocks = _normalization_intro() + sections
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -257,26 +257,3 @@ def test_anticortes_tabla_normalidad_larga_no_corta():
|
||||
# Every column name survives (wrapped/split, never truncated).
|
||||
for i in (0, 19, 39):
|
||||
assert f"col_{i}" in txt
|
||||
|
||||
|
||||
def test_glosario_engancha_terminos_modelos():
|
||||
"""Mejora 4b: PCA, KMeans, silhouette, Isolation Forest y la estandarización
|
||||
z-score se registran en el colector compartido y se marcan clicables en el
|
||||
cuerpo. Sin colector en ctx, el capítulo degrada y no marca nada."""
|
||||
from datascience.automatic_eda.model import GlossaryCollector
|
||||
|
||||
g = GlossaryCollector()
|
||||
ctx = dict(_ctx_full())
|
||||
ctx["glossary"] = g
|
||||
ch = build_modelos(_profile(), ctx)
|
||||
assert ch is not None
|
||||
keys = {t["key"] for t in g.terms()}
|
||||
assert {"zscore", "pca", "kmeans", "silhouette", "isolation_forest"} <= keys
|
||||
body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
|
||||
for k in ("zscore", "pca", "kmeans", "silhouette", "isolation_forest"):
|
||||
assert f"[[term:{k}]]" in body, k
|
||||
|
||||
# Sin colector: degrada limpio (ningún marcador en el cuerpo).
|
||||
ch2 = build_modelos(_profile(), _ctx_full())
|
||||
body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
|
||||
assert "[[term:" not in body2
|
||||
|
||||
@@ -4,10 +4,10 @@ name: column_quality_score
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
version: "2.0.0"
|
||||
purity: pure
|
||||
signature: "def column_quality_score(col: dict) -> dict"
|
||||
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda, con desglose completeness/validity/consistency y lista de issues legibles. Funcion pura, no muta el input."
|
||||
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda. Combina completeness (0.6) y validity (0.4) con renormalizacion por aplicabilidad; los outliers, columnas constantes e ids NO bajan el score (van a observations). Devuelve desglose por dimension, issues (defectos) y observations (señales analiticas). Funcion pura, no muta el input."
|
||||
tags: [eda, data-quality, profiling, scoring, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
@@ -17,20 +17,26 @@ error_type: ""
|
||||
imports: []
|
||||
example: |
|
||||
from datascience import column_quality_score
|
||||
col = {"name": "precio", "inferred_type": "float", "null_pct": 0.2,
|
||||
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 0.08}}
|
||||
col = {"name": "precio", "inferred_type": "numeric", "null_pct": 0.2,
|
||||
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 8.0}}
|
||||
column_quality_score(col)
|
||||
# {"score": 86.8, "completeness": 0.8, "validity": 0.92,
|
||||
# "consistency": 1.0, "issues": ["20% nulos", "8% outliers"]}
|
||||
# {"score": 88.0, "completeness": 0.8, "validity": 1.0,
|
||||
# "applicable": ["completeness", "validity"], "issues": ["20% nulos"],
|
||||
# "observations": ["8% de valores atípicos (z-score>3): ..."]}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_clean_column_high_score"
|
||||
- "test_half_null_lowers_completeness_and_score"
|
||||
- "test_constant_column_flags_issue"
|
||||
- "test_weights_60_40_native_type"
|
||||
- "test_outliers_do_not_penalize_score"
|
||||
- "test_nulls_lower_score_more_than_outliers"
|
||||
- "test_validity_from_parse_rate_lowers_score"
|
||||
- "test_validity_from_match_rate"
|
||||
- "test_free_text_renormalizes_to_completeness_only"
|
||||
- "test_all_null_column_scores_zero"
|
||||
- "test_constant_column_scores_full_and_is_observation"
|
||||
- "test_high_cardinality_id_scores_full_and_is_observation"
|
||||
- "test_mostly_null_no_double_counts_validity"
|
||||
- "test_empty_dict_does_not_crash"
|
||||
- "test_outliers_penalize_validity"
|
||||
- "test_mostly_null_flag_halves_validity"
|
||||
- "test_high_cardinality_text_flagged_as_id"
|
||||
- "test_none_values_treated_defensively"
|
||||
- "test_does_not_mutate_input"
|
||||
test_file_path: "python/functions/datascience/column_quality_score_test.py"
|
||||
@@ -38,16 +44,22 @@ file_path: "python/functions/datascience/column_quality_score.py"
|
||||
params:
|
||||
- name: col
|
||||
desc: >
|
||||
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb).
|
||||
Se leen sus claves de forma defensiva con .get(...) y se toleran valores
|
||||
None. Claves usadas: null_pct (0-1), inferred_type, semantic_type,
|
||||
unique_pct (0-1), flags (list[str], reconoce "constant"/"mostly_null"),
|
||||
numeric ({outlier_pct: 0-1, ...}|None) y match_rate (opcional, 0-1).
|
||||
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb /
|
||||
profile_table). Se leen sus claves de forma defensiva con .get(...) y se
|
||||
toleran valores None. Claves usadas: null_pct (0-1), n_rows, empty_count
|
||||
(texto), inferred_type, semantic_type, validity_rate (0-1, lo expone
|
||||
profile_table al promocionar texto a numero/fecha), match_rate (0-1),
|
||||
unique_pct (0-1), flags (list[str], reconoce
|
||||
"constant"/"possible_id"/"high_cardinality") y numeric ({outlier_pct: 0-100,
|
||||
skew, ...}|None).
|
||||
output: >
|
||||
dict con score (float 0-100, redondeado a 1 decimal), completeness (0-1),
|
||||
validity (0-1), consistency (0-1) e issues (list[str] de descripciones
|
||||
legibles de los problemas detectados). score = round(100 * (0.5*completeness
|
||||
+ 0.3*validity + 0.2*consistency), 1).
|
||||
dict con score (float 0-100, 1 decimal), completeness (0-1), validity (0-1 o
|
||||
None si no aplicable), dimensions ({completeness, validity}), applicable
|
||||
(list[str] de dimensiones que entraron en el score), issues (list[str] SOLO de
|
||||
defectos de calidad: nulos, vacios, valores no conformes) y observations
|
||||
(list[str] de señales analiticas que NO bajan el score: outliers, columna
|
||||
constante, posible id, asimetria). score = round(100 * (0.6*completeness +
|
||||
0.4*validity) / pesos_aplicables, 1), renormalizado cuando validity no aplica.
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
@@ -59,51 +71,71 @@ from datascience import column_quality_score
|
||||
col = {
|
||||
"name": "precio",
|
||||
"physical_type": "DOUBLE",
|
||||
"inferred_type": "float",
|
||||
"inferred_type": "numeric",
|
||||
"semantic_type": "",
|
||||
"count": 800,
|
||||
"n_rows": 1000,
|
||||
"null_count": 200,
|
||||
"null_pct": 0.20,
|
||||
"distinct_count": 400,
|
||||
"unique_pct": 0.40,
|
||||
"flags": [],
|
||||
"numeric": {"outlier_pct": 0.08},
|
||||
"numeric": {"outlier_pct": 8.0, "skew": 0.3},
|
||||
"categorical": None,
|
||||
"datetime": None,
|
||||
}
|
||||
|
||||
column_quality_score(col)
|
||||
# {
|
||||
# "score": 86.8,
|
||||
# "completeness": 0.8, # 1 - 0.20
|
||||
# "validity": 0.92, # 1 - min(0.08, 0.3)
|
||||
# "consistency": 1.0,
|
||||
# "issues": ["20% nulos", "8% outliers"],
|
||||
# "score": 88.0, # 100 * (0.6*0.8 + 0.4*1.0)
|
||||
# "completeness": 0.8, # 1 - 0.20
|
||||
# "validity": 1.0, # numerica nativa: el tipo es conforme
|
||||
# "dimensions": {"completeness": 0.8, "validity": 1.0},
|
||||
# "applicable": ["completeness", "validity"],
|
||||
# "issues": ["20% nulos"], # SOLO defectos de calidad
|
||||
# "observations": ["8% de valores atípicos (z-score>3): ..."], # NO bajan score
|
||||
# }
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando hayas perfilado una tabla con el grupo `eda` (p.ej.
|
||||
`summarize_table_duckdb`) y necesites un numero 0-100 por columna para
|
||||
ordenar/priorizar limpieza de datos, pintar semaforos de calidad en un
|
||||
dashboard, o decidir que columnas descartar antes de modelar. Es la capa de
|
||||
scoring sobre el ColumnProfile crudo: lee el perfil, no toca los datos.
|
||||
`summarize_table_duckdb` / `profile_table`) y necesites un numero 0-100 por
|
||||
columna para ordenar/priorizar limpieza de datos, pintar semaforos de calidad,
|
||||
o decidir que columnas descartar antes de modelar. Separa los **defectos de
|
||||
calidad reales** (`issues`: nulos, vacios, valores que no parsean a su tipo) de
|
||||
las **observaciones analiticas** (`observations`: outliers, columnas constantes,
|
||||
ids), que se reportan pero no penalizan. Es la capa de scoring sobre el
|
||||
ColumnProfile crudo: lee el perfil, no toca los datos.
|
||||
|
||||
## Notas
|
||||
## Gotchas
|
||||
|
||||
Funcion pura, sin I/O ni dependencias externas, no muta `col`. Lee todas las
|
||||
claves con `.get(...)` y tolera que vengan en `None` (un ColumnProfile recien
|
||||
salido de `summarize_table_duckdb` trae muchas claves a `None`), por lo que
|
||||
nunca falla por claves ausentes — un `{}` produce un resultado bien definido.
|
||||
Funcion pura, sin I/O, no muta `col`. Aun asi conviene saber:
|
||||
|
||||
Pesos del score: completeness 0.5, validity 0.3, consistency 0.2.
|
||||
- **Los outliers NO bajan el score.** Un valor extremo puede ser real y correcto
|
||||
(un cliente que compra mucho); detectar atipicos es analisis de la
|
||||
distribucion, no un juicio de correccion. Salen en `observations`, no en
|
||||
`issues`. Mismo trato para columnas constantes e identificadores de alta
|
||||
cardinalidad: son observaciones, no defectos.
|
||||
- **`validity` puede ser `None`** (no aplicable): texto libre sin `semantic_type`
|
||||
ni `validity_rate`, o columna 100% nula. En ese caso el score se renormaliza a
|
||||
solo `completeness` (la columna no se premia ni castiga por algo no medible).
|
||||
- **`outlier_pct` se interpreta en escala 0-100** (la que emite
|
||||
`describe_numeric`, z-score>3). Pasar una fraccion 0-1 produce un texto de
|
||||
observacion con el % equivocado, pero NUNCA afecta al score.
|
||||
- **`validity_rate` lo puebla `profile_table`** al promocionar una columna de
|
||||
texto a numero/fecha (fraccion que parsea). Si no esta presente y el tipo es
|
||||
nativo numerico/fecha/bool, `validity = 1.0`.
|
||||
- Sin doble conteo: la falta de datos cuenta solo en `completeness` (el antiguo
|
||||
castigo de `mostly_null` sobre `validity` se elimino).
|
||||
|
||||
- **completeness** = `1 - null_pct` (None -> 0 nulls -> 1.0).
|
||||
- **validity**: parte de 1.0 y penaliza `min(outlier_pct, 0.3)` en columnas
|
||||
numericas, `0.5 * (1 - match_rate)` si hay `semantic_type` declarado con
|
||||
`match_rate` bajo disponible, y multiplica por 0.5 si el flag `mostly_null`
|
||||
esta presente.
|
||||
- **consistency**: 1.0 salvo flag `constant` (-> 0.3, columna poco informativa)
|
||||
o texto con `unique_pct > 0.9` (-> 0.6, posible id de alta cardinalidad).
|
||||
## Capability growth log
|
||||
|
||||
- v2.0.0 (2026-06-30) — nueva formula de calidad (report 2046): pesos 60/40
|
||||
(completeness/validity) con renormalizacion por aplicabilidad; se elimina la
|
||||
dimension `consistency`-como-informatividad y el doble castigo de
|
||||
`mostly_null`; los outliers/constantes/ids salen del score a `observations`;
|
||||
validity mide conformidad real (parse rate / match rate / tipo nativo). Salida
|
||||
ampliada con `dimensions`, `applicable` y `observations`.
|
||||
- v1.0.0 — version inicial: pesos 50/30/20 (completeness/validity/consistency),
|
||||
los outliers penalizaban validity (con bug de escala) y consistency penalizaba
|
||||
informatividad.
|
||||
|
||||
@@ -1,34 +1,78 @@
|
||||
"""Score de calidad de datos (0-100) para un ColumnProfile del grupo eda.
|
||||
|
||||
Funcion pura: dado el perfil de una columna producido por el grupo de
|
||||
capacidad `eda` (p.ej. summarize_table_duckdb), calcula un score agregado
|
||||
de calidad junto a su desglose en completeness / validity / consistency y
|
||||
una lista de issues legibles. No realiza I/O ni muta el input.
|
||||
capacidad `eda` (p.ej. summarize_table_duckdb / profile_table), calcula un
|
||||
score agregado de calidad junto a su desglose por dimension y dos listas
|
||||
legibles separadas: `issues` (defectos de calidad reales que SI bajan el
|
||||
score) y `observations` (señales analiticas que NO bajan el score). No
|
||||
realiza I/O ni muta el input.
|
||||
|
||||
Modelo (DAMA-DMBOK / ISO 8000), ver report 2046:
|
||||
|
||||
- Solo entran en el score las dimensiones medibles automaticamente desde el
|
||||
perfil, sin fuente externa de verdad: completeness y validity por columna.
|
||||
- Renormalizacion por aplicabilidad: si una dimension no es medible en la
|
||||
columna (texto libre sin semantica -> validity no aplica; columna 100% nula
|
||||
-> validity no medible), se excluye y los pesos se renormalizan sobre las
|
||||
aplicables. Una columna ni se premia ni se castiga por algo no medible.
|
||||
- Sin doble conteo: la falta de datos cuenta solo en completeness (se elimino
|
||||
el antiguo castigo extra de `mostly_null` sobre validity).
|
||||
- Los OUTLIERS NO bajan la calidad. Un valor extremo puede ser real y
|
||||
correcto; detectar atipicos es analisis de la distribucion, no un juicio de
|
||||
coreccion. Outliers, columnas constantes e identificadores de alta
|
||||
cardinalidad pasan a `observations`, nunca a `issues`.
|
||||
"""
|
||||
|
||||
|
||||
# Pesos base de las dimensiones de columna (se renormalizan por aplicabilidad).
|
||||
_W_COMPLETENESS = 0.6
|
||||
_W_VALIDITY = 0.4
|
||||
|
||||
# Tipos inferidos cuyo almacen garantiza la conformidad de tipo (validity=1.0)
|
||||
# cuando NO vienen de una promocion de texto (en cuyo caso manda validity_rate).
|
||||
_NATIVE_TYPED = ("numeric", "integer", "float", "datetime", "date", "boolean", "bool")
|
||||
|
||||
|
||||
def column_quality_score(col: dict) -> dict:
|
||||
"""Calcula un score de calidad de datos 0-100 para un ColumnProfile.
|
||||
|
||||
El score pondera tres dimensiones:
|
||||
- completeness (0.5): proporcion de valores no nulos.
|
||||
- validity (0.3): ausencia de outliers / heuristicas de validez.
|
||||
- consistency (0.2): la columna aporta informacion (no constante, no ruido).
|
||||
El score combina solo dimensiones de calidad medibles desde el perfil, con
|
||||
renormalizacion por aplicabilidad:
|
||||
|
||||
- completeness (peso base 0.6, siempre aplica): proporcion de valores
|
||||
presentes = 1 - null_pct. En texto, las celdas vacias (`empty_count`)
|
||||
tambien cuentan como faltantes.
|
||||
- validity (peso base 0.4, cuando hay un criterio de validacion real):
|
||||
fraccion de valores no nulos conformes a su tipo/semantica. Tipo nativo
|
||||
numerico/fecha/bool = 1.0; texto promovido a numero/fecha = parse rate
|
||||
(`validity_rate`); texto con `semantic_type` regexable = `match_rate`;
|
||||
texto libre o columna 100% nula = NO aplicable (renormaliza a solo
|
||||
completeness).
|
||||
|
||||
Los outliers, columnas constantes, identificadores y asimetria fuerte NO
|
||||
bajan el score: se devuelven en `observations`.
|
||||
|
||||
Args:
|
||||
col: ColumnProfile dict del grupo eda. Se leen las claves de forma
|
||||
defensiva con .get(...) y se tolera que muchas vengan en None.
|
||||
Claves relevantes: null_pct, inferred_type, semantic_type,
|
||||
unique_pct, flags (list[str]), numeric ({outlier_pct, ...}|None),
|
||||
match_rate (opcional).
|
||||
Claves relevantes: null_pct (0-1), n_rows, empty_count,
|
||||
inferred_type, semantic_type, validity_rate (0-1, lo expone
|
||||
profile_table al promocionar texto a numero/fecha), match_rate
|
||||
(0-1), unique_pct (0-1), flags (list[str], reconoce
|
||||
"constant"/"possible_id"/"high_cardinality"), numeric
|
||||
({outlier_pct: 0-100, skew, ...}|None).
|
||||
|
||||
Returns:
|
||||
dict con:
|
||||
score (float, 0-100, redondeado a 1 decimal),
|
||||
completeness (float, 0-1),
|
||||
validity (float, 0-1),
|
||||
consistency (float, 0-1),
|
||||
issues (list[str]) descripciones legibles de los problemas.
|
||||
score (float 0-100, redondeado a 1 decimal),
|
||||
completeness (float 0-1),
|
||||
validity (float 0-1 | None si no aplicable),
|
||||
dimensions ({completeness, validity}),
|
||||
applicable (list[str] de dimensiones que entraron en el score),
|
||||
issues (list[str]) SOLO defectos de calidad (nulos, vacios,
|
||||
valores no conformes a su tipo/semantica),
|
||||
observations (list[str]) señales analiticas que NO bajan el score
|
||||
(outliers, columna constante, posible id, asimetria).
|
||||
"""
|
||||
if not isinstance(col, dict):
|
||||
col = {}
|
||||
@@ -39,103 +83,153 @@ def column_quality_score(col: dict) -> dict:
|
||||
flags = set(flags)
|
||||
|
||||
issues: list[str] = []
|
||||
observations: list[str] = []
|
||||
|
||||
inferred_type = col.get("inferred_type") or ""
|
||||
semantic_type = col.get("semantic_type") or ""
|
||||
|
||||
# --- completeness -------------------------------------------------
|
||||
null_pct = col.get("null_pct")
|
||||
if null_pct is None:
|
||||
null_pct = 0.0
|
||||
try:
|
||||
null_pct = float(null_pct)
|
||||
except (TypeError, ValueError):
|
||||
null_pct = 0.0
|
||||
null_pct = _clamp(null_pct, 0.0, 1.0)
|
||||
# Falta de datos = nulos + (en texto) celdas vacias. Es el unico sitio
|
||||
# donde la falta de datos cuenta: nunca se duplica en validity.
|
||||
null_pct = _clamp(_num(col.get("null_pct"), 0.0), 0.0, 1.0)
|
||||
completeness = 1.0 - null_pct
|
||||
if null_pct > 0:
|
||||
issues.append(f"{round(null_pct * 100)}% nulos")
|
||||
issues.append(f"{_pct(null_pct)} nulos")
|
||||
|
||||
# --- validity -----------------------------------------------------
|
||||
validity = 1.0
|
||||
inferred_type = col.get("inferred_type") or ""
|
||||
empty_frac = 0.0
|
||||
n_rows = col.get("n_rows")
|
||||
empty_count = col.get("empty_count")
|
||||
if (
|
||||
isinstance(n_rows, (int, float)) and not isinstance(n_rows, bool) and n_rows > 0
|
||||
and isinstance(empty_count, (int, float)) and not isinstance(empty_count, bool)
|
||||
and empty_count > 0
|
||||
):
|
||||
empty_frac = _clamp(float(empty_count) / float(n_rows), 0.0, 1.0)
|
||||
completeness = _clamp(completeness - empty_frac, 0.0, 1.0)
|
||||
issues.append(f"{_pct(empty_frac)} vacíos")
|
||||
|
||||
numeric = col.get("numeric")
|
||||
is_numeric = inferred_type in ("integer", "float", "numeric") or isinstance(numeric, dict)
|
||||
if isinstance(numeric, dict):
|
||||
outlier_pct = numeric.get("outlier_pct")
|
||||
if outlier_pct is not None:
|
||||
try:
|
||||
outlier_pct = float(outlier_pct)
|
||||
except (TypeError, ValueError):
|
||||
outlier_pct = 0.0
|
||||
outlier_pct = _clamp(outlier_pct, 0.0, 1.0)
|
||||
if outlier_pct > 0:
|
||||
penalty = min(outlier_pct, 0.3)
|
||||
validity -= penalty
|
||||
issues.append(f"{round(outlier_pct * 100)}% outliers")
|
||||
|
||||
# semantic_type declarado pero con baja tasa de match (si la conocemos).
|
||||
semantic_type = col.get("semantic_type") or ""
|
||||
match_rate = col.get("match_rate")
|
||||
if semantic_type and match_rate is not None:
|
||||
try:
|
||||
match_rate = float(match_rate)
|
||||
except (TypeError, ValueError):
|
||||
match_rate = None
|
||||
if match_rate is not None:
|
||||
match_rate = _clamp(match_rate, 0.0, 1.0)
|
||||
if match_rate < 1.0:
|
||||
shortfall = 1.0 - match_rate
|
||||
validity -= 0.5 * shortfall
|
||||
issues.append(
|
||||
f"semantic_type '{semantic_type}' con baja coincidencia "
|
||||
f"({round(match_rate * 100)}%)"
|
||||
)
|
||||
|
||||
if "mostly_null" in flags:
|
||||
validity *= 0.5
|
||||
issues.append("mayoritariamente nula")
|
||||
|
||||
validity = _clamp(validity, 0.0, 1.0)
|
||||
|
||||
# --- consistency --------------------------------------------------
|
||||
consistency = 1.0
|
||||
if "constant" in flags:
|
||||
consistency = 0.3
|
||||
issues.append("columna constante")
|
||||
# --- validity (con renormalizacion por aplicabilidad) -------------
|
||||
# None = no medible -> se excluye del score (no penaliza ni premia).
|
||||
validity = None
|
||||
if completeness <= 0.0:
|
||||
# Columna 100% faltante: no hay valores no nulos sobre los que medir
|
||||
# conformidad. validity no aplica -> el score sale solo de completeness
|
||||
# (= 0). Es el peor defecto de calidad posible.
|
||||
validity = None
|
||||
else:
|
||||
unique_pct = col.get("unique_pct")
|
||||
if unique_pct is not None:
|
||||
try:
|
||||
unique_pct = float(unique_pct)
|
||||
except (TypeError, ValueError):
|
||||
unique_pct = None
|
||||
if (
|
||||
inferred_type == "text"
|
||||
validity_rate = col.get("validity_rate")
|
||||
match_rate = col.get("match_rate")
|
||||
if validity_rate is not None:
|
||||
# Texto promovido a numero/fecha: parse rate real de la muestra.
|
||||
v = _num(validity_rate, None)
|
||||
if v is not None:
|
||||
validity = _clamp(v, 0.0, 1.0)
|
||||
if validity < 1.0:
|
||||
kind = (
|
||||
"número" if inferred_type == "numeric"
|
||||
else "fecha" if inferred_type == "datetime"
|
||||
else inferred_type or "su tipo"
|
||||
)
|
||||
issues.append(
|
||||
f"{_pct(1.0 - validity)} no parsea al tipo {kind}"
|
||||
)
|
||||
elif inferred_type in _NATIVE_TYPED:
|
||||
# Tipo nativo garantizado por el almacen: no hay valores que no
|
||||
# parseen. validity = 1.0 (no se confunde con tener outliers).
|
||||
validity = 1.0
|
||||
elif semantic_type and match_rate is not None:
|
||||
v = _num(match_rate, None)
|
||||
if v is not None:
|
||||
validity = _clamp(v, 0.0, 1.0)
|
||||
if validity < 1.0:
|
||||
issues.append(
|
||||
f"{_pct(1.0 - validity)} no casa con el "
|
||||
f"formato «{semantic_type}»"
|
||||
)
|
||||
else:
|
||||
# Texto libre / categorica sin semantica: no hay criterio honesto
|
||||
# de validez. No aplica.
|
||||
validity = None
|
||||
|
||||
# --- observations (NO bajan el score) -----------------------------
|
||||
numeric = col.get("numeric")
|
||||
if isinstance(numeric, dict):
|
||||
# outlier_pct viene en escala 0-100 desde describe_numeric (z-score>3).
|
||||
outlier_pct = _num(numeric.get("outlier_pct"), None)
|
||||
if outlier_pct is not None and outlier_pct >= 0.05:
|
||||
observations.append(
|
||||
f"{_pct(outlier_pct / 100.0)} de valores atípicos (z-score>3): "
|
||||
"revisar si son errores u observaciones legítimas"
|
||||
)
|
||||
skew = _num(numeric.get("skew"), None)
|
||||
if skew is not None and abs(skew) >= 1.0:
|
||||
observations.append(
|
||||
f"asimetría fuerte (skew={round(skew, 2)}): considerar "
|
||||
"re-expresión antes de modelar"
|
||||
)
|
||||
|
||||
if "constant" in flags:
|
||||
observations.append(
|
||||
"columna constante: aporta poca información para el análisis"
|
||||
)
|
||||
|
||||
unique_pct = _num(col.get("unique_pct"), None)
|
||||
is_id = (
|
||||
"possible_id" in flags
|
||||
or "high_cardinality" in flags
|
||||
or (
|
||||
inferred_type in ("text", "categorical")
|
||||
and unique_pct is not None
|
||||
and _clamp(unique_pct, 0.0, 1.0) > 0.9
|
||||
):
|
||||
consistency = 0.6
|
||||
issues.append("posible id de alta cardinalidad")
|
||||
|
||||
consistency = _clamp(consistency, 0.0, 1.0)
|
||||
|
||||
# --- score agregado ----------------------------------------------
|
||||
score = round(
|
||||
100.0 * (0.5 * completeness + 0.3 * validity + 0.2 * consistency),
|
||||
1,
|
||||
)
|
||||
)
|
||||
if is_id:
|
||||
observations.append(
|
||||
"valores casi únicos: posible identificador (no es un defecto de calidad)"
|
||||
)
|
||||
|
||||
# Silencia warnings sobre la variable de tipo no usada.
|
||||
_ = is_numeric
|
||||
# --- score agregado con renormalizacion ---------------------------
|
||||
applicable = ["completeness"]
|
||||
num = _W_COMPLETENESS * completeness
|
||||
den = _W_COMPLETENESS
|
||||
if validity is not None:
|
||||
applicable.append("validity")
|
||||
num += _W_VALIDITY * validity
|
||||
den += _W_VALIDITY
|
||||
score = round(100.0 * num / den, 1) if den > 0 else 0.0
|
||||
|
||||
return {
|
||||
"score": score,
|
||||
"completeness": completeness,
|
||||
"validity": validity,
|
||||
"consistency": consistency,
|
||||
"dimensions": {"completeness": completeness, "validity": validity},
|
||||
"applicable": applicable,
|
||||
"issues": issues,
|
||||
"observations": observations,
|
||||
}
|
||||
|
||||
|
||||
def _pct(frac: float) -> str:
|
||||
"""Formatea una fraccion 0-1 como porcentaje honesto: «N%» si >=1%, «0.N%»
|
||||
por debajo (para no mostrar «0%» cuando hay un defecto real pequeño)."""
|
||||
p = frac * 100.0
|
||||
if p >= 1.0:
|
||||
return f"{round(p)}%"
|
||||
return f"{p:.1f}%"
|
||||
|
||||
|
||||
def _num(x, default):
|
||||
"""Convierte x a float; devuelve `default` si es None o no parseable."""
|
||||
if x is None:
|
||||
return default
|
||||
if isinstance(x, bool):
|
||||
return default
|
||||
try:
|
||||
return float(x)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _clamp(x: float, lo: float, hi: float) -> float:
|
||||
"""Recorta x al rango [lo, hi]."""
|
||||
if x < lo:
|
||||
|
||||
@@ -1,4 +1,12 @@
|
||||
"""Tests para column_quality_score."""
|
||||
"""Tests para column_quality_score (nueva fórmula, report 2046).
|
||||
|
||||
Verifica las invariantes de la fórmula de calidad:
|
||||
- completeness (0.6) + validity (0.4) con renormalización por aplicabilidad.
|
||||
- Los OUTLIERS no bajan el score (van a observations, no a issues).
|
||||
- Columnas constantes e ids no bajan el score (observations).
|
||||
- Sin doble conteo de la falta de datos.
|
||||
- all-null -> score 0; función pura (no muta el input).
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -9,11 +17,11 @@ from column_quality_score import column_quality_score
|
||||
|
||||
|
||||
def _clean_numeric_col() -> dict:
|
||||
"""ColumnProfile de una columna numerica sana, sin problemas."""
|
||||
"""ColumnProfile de una columna numérica nativa sana, sin problemas."""
|
||||
return {
|
||||
"name": "edad",
|
||||
"physical_type": "INTEGER",
|
||||
"inferred_type": "integer",
|
||||
"inferred_type": "numeric",
|
||||
"semantic_type": "",
|
||||
"count": 1000,
|
||||
"n_rows": 1000,
|
||||
@@ -28,85 +36,163 @@ def _clean_numeric_col() -> dict:
|
||||
}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_clean_column_high_score():
|
||||
out = column_quality_score(_clean_numeric_col())
|
||||
assert out["score"] > 90
|
||||
assert out["score"] == 100.0
|
||||
assert out["completeness"] == 1.0
|
||||
assert out["validity"] == 1.0
|
||||
assert out["consistency"] == 1.0
|
||||
assert out["applicable"] == ["completeness", "validity"]
|
||||
assert out["issues"] == []
|
||||
assert out["observations"] == []
|
||||
|
||||
|
||||
def test_half_null_lowers_completeness_and_score():
|
||||
def test_weights_60_40_native_type():
|
||||
"""30% nulos en numérica nativa: score = 100*(0.6*0.7 + 0.4*1.0) = 82."""
|
||||
col = _clean_numeric_col()
|
||||
col["null_count"] = 500
|
||||
col["null_pct"] = 0.5
|
||||
clean_score = column_quality_score(_clean_numeric_col())["score"]
|
||||
col["null_pct"] = 0.30
|
||||
col["null_count"] = 300
|
||||
out = column_quality_score(col)
|
||||
assert out["completeness"] == 0.5
|
||||
assert out["score"] < clean_score
|
||||
assert any("nulos" in issue for issue in out["issues"])
|
||||
assert out["completeness"] == 0.7
|
||||
assert out["validity"] == 1.0
|
||||
assert out["score"] == 82.0
|
||||
assert any("nulos" in i for i in out["issues"])
|
||||
|
||||
|
||||
def test_constant_column_flags_issue():
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Outliers FUERA del score
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_outliers_do_not_penalize_score():
|
||||
"""Columna con outliers pero sin nulos -> score máximo; outliers en observations."""
|
||||
col = _clean_numeric_col()
|
||||
col["numeric"] = {"outlier_pct": 18.0, "skew": 0.2} # 18% atípicos (escala 0-100)
|
||||
out = column_quality_score(col)
|
||||
assert out["score"] == 100.0 # los outliers NO bajan la calidad
|
||||
assert out["validity"] == 1.0
|
||||
# No aparecen como problema de calidad...
|
||||
assert not any("atípic" in i or "outlier" in i for i in out["issues"])
|
||||
# ...sino como observación analítica.
|
||||
assert any("atípic" in o for o in out["observations"])
|
||||
|
||||
|
||||
def test_nulls_lower_score_more_than_outliers():
|
||||
"""Vacíos sí penalizan; outliers no: comparar las dos columnas."""
|
||||
con_nulos = _clean_numeric_col()
|
||||
con_nulos["null_pct"] = 0.30
|
||||
con_outliers = _clean_numeric_col()
|
||||
con_outliers["numeric"] = {"outlier_pct": 30.0}
|
||||
assert column_quality_score(con_nulos)["score"] < \
|
||||
column_quality_score(con_outliers)["score"]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Validity: aplicabilidad y renormalización
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_validity_from_parse_rate_lowers_score():
|
||||
"""Numérica como texto con 20% basura: validity=0.8 -> score=92."""
|
||||
col = {
|
||||
"name": "precio_txt", "inferred_type": "numeric", "semantic_type": "decimal",
|
||||
"null_pct": 0.0, "validity_rate": 0.80, "flags": [], "numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 0.8
|
||||
assert out["score"] == 92.0 # 100*(0.6 + 0.4*0.8)
|
||||
assert any("no parsea" in i for i in out["issues"])
|
||||
|
||||
|
||||
def test_validity_from_match_rate():
|
||||
"""Texto con semantic_type y 5% no conforme: validity=0.95."""
|
||||
col = {
|
||||
"name": "email", "inferred_type": "text", "semantic_type": "email",
|
||||
"null_pct": 0.0, "match_rate": 0.95, "unique_pct": 0.5, "flags": [],
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 0.95
|
||||
assert out["score"] == 98.0 # 100*(0.6 + 0.4*0.95)
|
||||
assert any("no casa" in i for i in out["issues"])
|
||||
|
||||
|
||||
def test_free_text_renormalizes_to_completeness_only():
|
||||
"""Texto libre sin semántica: validity no aplica -> score = 100*completeness."""
|
||||
col = {
|
||||
"name": "comentario", "inferred_type": "text", "semantic_type": "",
|
||||
"null_pct": 0.30, "unique_pct": 0.5, "flags": [], "numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] is None
|
||||
assert out["applicable"] == ["completeness"]
|
||||
assert out["completeness"] == 0.7
|
||||
assert out["score"] == 70.0 # renormalizado a solo completeness
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Casos límite (report §4.6)
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_all_null_column_scores_zero():
|
||||
col = _clean_numeric_col()
|
||||
col["null_pct"] = 1.0
|
||||
col["null_count"] = 1000
|
||||
out = column_quality_score(col)
|
||||
assert out["completeness"] == 0.0
|
||||
assert out["validity"] is None # no medible sin valores no nulos
|
||||
assert out["score"] == 0.0
|
||||
|
||||
|
||||
def test_constant_column_scores_full_and_is_observation():
|
||||
"""Columna constante: dato válido y completo -> score 100; baja info = observación."""
|
||||
col = _clean_numeric_col()
|
||||
col["flags"] = ["constant"]
|
||||
col["distinct_count"] = 1
|
||||
col["unique_pct"] = 0.001
|
||||
out = column_quality_score(col)
|
||||
assert out["consistency"] == 0.3
|
||||
assert any("constante" in issue for issue in out["issues"])
|
||||
assert out["score"] == 100.0 # NO se castiga la baja informatividad
|
||||
assert not any("constante" in i for i in out["issues"])
|
||||
assert any("constante" in o for o in out["observations"])
|
||||
|
||||
|
||||
def test_high_cardinality_id_scores_full_and_is_observation():
|
||||
"""Id de alta cardinalidad: unicidad perfecta -> score 100; posible id = observación."""
|
||||
col = {
|
||||
"name": "uuid", "inferred_type": "text", "semantic_type": "",
|
||||
"null_pct": 0.0, "unique_pct": 0.99, "flags": ["possible_id"],
|
||||
"numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["score"] == 100.0
|
||||
assert not any("identificador" in i for i in out["issues"])
|
||||
assert any("identificador" in o for o in out["observations"])
|
||||
|
||||
|
||||
def test_mostly_null_no_double_counts_validity():
|
||||
"""85% nulos: solo completeness penaliza; validity nativa sigue 1.0 (sin doble castigo)."""
|
||||
col = _clean_numeric_col()
|
||||
col["null_pct"] = 0.85
|
||||
col["flags"] = ["mostly_null"]
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 1.0 # ya no se multiplica por 0.5
|
||||
# score = 100*(0.6*0.15 + 0.4*1.0) = 49
|
||||
assert out["score"] == 49.0
|
||||
assert not any("mayoritariamente" in o for o in out["observations"])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Robustez
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_empty_dict_does_not_crash():
|
||||
out = column_quality_score({})
|
||||
assert isinstance(out["score"], float)
|
||||
assert out["completeness"] == 1.0
|
||||
assert 0.0 <= out["score"] <= 100.0
|
||||
assert isinstance(out["issues"], list)
|
||||
|
||||
|
||||
def test_outliers_penalize_validity():
|
||||
col = _clean_numeric_col()
|
||||
col["numeric"] = {"outlier_pct": 0.2}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] < 1.0
|
||||
assert any("outliers" in issue for issue in out["issues"])
|
||||
|
||||
|
||||
def test_mostly_null_flag_halves_validity():
|
||||
col = _clean_numeric_col()
|
||||
col["null_pct"] = 0.85
|
||||
col["flags"] = ["mostly_null"]
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 0.5
|
||||
assert any("mayoritariamente nula" in issue for issue in out["issues"])
|
||||
|
||||
|
||||
def test_high_cardinality_text_flagged_as_id():
|
||||
col = {
|
||||
"name": "uuid",
|
||||
"inferred_type": "text",
|
||||
"semantic_type": "",
|
||||
"null_pct": 0.0,
|
||||
"unique_pct": 0.99,
|
||||
"flags": [],
|
||||
"numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["consistency"] < 1.0
|
||||
assert any("alta cardinalidad" in issue for issue in out["issues"])
|
||||
assert isinstance(out["observations"], list)
|
||||
|
||||
|
||||
def test_none_values_treated_defensively():
|
||||
col = {
|
||||
"name": "x",
|
||||
"inferred_type": None,
|
||||
"semantic_type": None,
|
||||
"null_pct": None,
|
||||
"unique_pct": None,
|
||||
"flags": None,
|
||||
"numeric": None,
|
||||
"name": "x", "inferred_type": None, "semantic_type": None,
|
||||
"null_pct": None, "unique_pct": None, "flags": None, "numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["completeness"] == 1.0
|
||||
|
||||
@@ -3,7 +3,7 @@ name: summarize_table_duckdb
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
version: "1.1.0"
|
||||
purity: impure
|
||||
signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict"
|
||||
description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)."
|
||||
@@ -64,6 +64,7 @@ else:
|
||||
- **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima).
|
||||
- **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75.
|
||||
- **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`).
|
||||
- **`duplicate_rows`/`duplicate_pct` se pueblan push-down** (desde v1.1.0) con `count(*)` sobre `SELECT DISTINCT *` (sin traer filas a RAM): `duplicate_rows = n_rows - filas_distintas`, `duplicate_pct` en fraccion 0-1. Habilitan la dimension de unicidad de registro del score de dataset (`profile_table` paso 6). Si la tabla tiene tipos no comparables con `DISTINCT` (BLOB/LIST/MAP) la query degrada y ambas vuelven a `None` (renormaliza el score a solo `cell_quality`).
|
||||
- **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica).
|
||||
- **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage).
|
||||
- **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base.
|
||||
|
||||
@@ -196,6 +196,21 @@ def summarize_table_duckdb(
|
||||
sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
|
||||
)
|
||||
|
||||
# Unicidad de registro: filas duplicadas via COUNT de filas distintas
|
||||
# push-down (DISTINCT *), sin traer filas a RAM. Habilita la dimension
|
||||
# de uniqueness del score de dataset (1 - duplicate_pct). Degrada a None
|
||||
# si la tabla tiene tipos no comparables con DISTINCT (BLOB/LIST/MAP).
|
||||
duplicate_rows = None
|
||||
duplicate_pct = None
|
||||
if n_rows > 0:
|
||||
dup_res = duckdb_query_readonly(
|
||||
db_path, f"SELECT count(*) AS c FROM (SELECT DISTINCT * FROM {quoted})"
|
||||
)
|
||||
if dup_res["status"] == "ok" and dup_res["rows"]:
|
||||
distinct_rows = int(dup_res["rows"][0]["c"])
|
||||
duplicate_rows = max(0, n_rows - distinct_rows)
|
||||
duplicate_pct = duplicate_rows / n_rows # fraccion 0-1
|
||||
|
||||
profile = {
|
||||
"table": table,
|
||||
"source": "duckdb",
|
||||
@@ -203,8 +218,8 @@ def summarize_table_duckdb(
|
||||
"n_rows": n_rows,
|
||||
"n_cols": len(columns),
|
||||
"size_bytes": None,
|
||||
"duplicate_rows": None,
|
||||
"duplicate_pct": None,
|
||||
"duplicate_rows": duplicate_rows,
|
||||
"duplicate_pct": duplicate_pct,
|
||||
"constant_cols": constant_cols,
|
||||
"all_null_cols": all_null_cols,
|
||||
"null_cell_pct": null_cell_pct,
|
||||
|
||||
@@ -54,6 +54,30 @@ def test_shape_y_metadatos_tabla(db):
|
||||
assert profile["correlations"] is None
|
||||
|
||||
|
||||
def test_duplicate_pct_sin_duplicados(db):
|
||||
"""Tabla con todas las filas distintas: duplicate_pct = 0, no None."""
|
||||
profile = summarize_table_duckdb(db, "ventas")["profile"]
|
||||
assert profile["duplicate_rows"] == 0
|
||||
assert profile["duplicate_pct"] == 0.0
|
||||
|
||||
|
||||
def test_duplicate_pct_con_duplicados(tmp_path):
|
||||
"""Filas repetidas: duplicate_rows/duplicate_pct se pueblan push-down."""
|
||||
path = str(tmp_path / "dups.duckdb")
|
||||
con = duckdb.connect(path)
|
||||
con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)")
|
||||
# 5 filas, 2 de ellas idénticas a otras -> 2 duplicadas sobre 5 = 0.4.
|
||||
con.execute(
|
||||
"INSERT INTO t VALUES "
|
||||
"(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')"
|
||||
)
|
||||
con.close()
|
||||
profile = summarize_table_duckdb(path, "t")["profile"]
|
||||
assert profile["n_rows"] == 5
|
||||
assert profile["duplicate_rows"] == 2
|
||||
assert profile["duplicate_pct"] == 0.4
|
||||
|
||||
|
||||
def test_column_profile_shape(db):
|
||||
profile = summarize_table_duckdb(db, "ventas")["profile"]
|
||||
by_name = {c["name"]: c for c in profile["columns"]}
|
||||
|
||||
@@ -4,7 +4,7 @@ kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
purity: impure
|
||||
version: "1.0.0"
|
||||
version: "1.1.0"
|
||||
signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict"
|
||||
description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla."
|
||||
tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries]
|
||||
@@ -114,3 +114,12 @@ para auditar la calidad de una tabla ya productiva. Reemplaza orquestar a mano
|
||||
Formatos exoticos pueden descartarse silenciosamente del calculo numerico.
|
||||
- `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el
|
||||
sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red).
|
||||
- **Score de calidad (report 2046, desde v1.1.0).** Paso 5: cada columna recibe
|
||||
`quality_score` de `column_quality_score` con la formula 60/40
|
||||
(completeness/validity); al promocionar texto a numero/fecha se expone
|
||||
`col["validity_rate"]` (parse rate de la muestra) para alimentar la dimension
|
||||
validity. Paso 6: el score de dataset NO es la media simple — es
|
||||
`100 * (0.85*cell_quality + 0.15*row_uniqueness)`, donde
|
||||
`cell_quality = media(score_col/100)` y `row_uniqueness = 1 - duplicate_pct`.
|
||||
Si `duplicate_pct` es `None` (backend sin calcularlo) el score se renormaliza
|
||||
a solo `cell_quality`. Los outliers NO bajan el score (van a `observations`).
|
||||
|
||||
@@ -477,9 +477,18 @@ def profile_table(
|
||||
if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
|
||||
col["inferred_type"] = "numeric"
|
||||
inferred = "numeric"
|
||||
# Tasa de parseo real de la muestra: alimenta la
|
||||
# dimension validity de column_quality_score (fraccion
|
||||
# de valores conformes al tipo numerico promovido).
|
||||
col["validity_rate"] = len(ok) / len(vals)
|
||||
elif semantic in _DATETIME_SEMANTIC:
|
||||
col["inferred_type"] = "datetime"
|
||||
inferred = "datetime"
|
||||
# Tasa de parseo de la muestra a fecha (mismo papel que el
|
||||
# parse rate numerico) para la dimension validity.
|
||||
parsed_dt = [_to_ordinal_days(v) for v in vals]
|
||||
ok_dt = [d for d in parsed_dt if d is not None]
|
||||
col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None
|
||||
|
||||
# 4) Enriquecer segun el inferred_type final.
|
||||
if inferred == "numeric":
|
||||
@@ -506,11 +515,36 @@ def profile_table(
|
||||
# 5) Score de calidad por columna.
|
||||
col["quality_score"] = column_quality_score(col).get("score")
|
||||
|
||||
# 6) Score agregado de la tabla (media de columnas).
|
||||
# 6) Score agregado de la tabla (report 2046): NO media simple.
|
||||
# cell_quality = media de los scores de columna, en [0,1].
|
||||
# row_uniqueness = 1 - duplicate_pct (unicidad de registro).
|
||||
# score = 100 * (0.85*cell_quality + 0.15*row_uniqueness).
|
||||
# Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular.
|
||||
scores = [
|
||||
c["quality_score"] for c in cols if c.get("quality_score") is not None
|
||||
]
|
||||
prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None
|
||||
if scores:
|
||||
cell_quality = (sum(scores) / len(scores)) / 100.0
|
||||
dup_pct = prof.get("duplicate_pct")
|
||||
if dup_pct is not None:
|
||||
try:
|
||||
d = float(dup_pct)
|
||||
except (TypeError, ValueError):
|
||||
d = None
|
||||
else:
|
||||
d = None
|
||||
if d is not None:
|
||||
# Tolerar escala 0-100 por si algun backend la entrega asi.
|
||||
if d > 1.0:
|
||||
d = d / 100.0
|
||||
row_uniqueness = max(0.0, min(1.0, 1.0 - d))
|
||||
prof["quality_score"] = round(
|
||||
100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1
|
||||
)
|
||||
else:
|
||||
prof["quality_score"] = round(100.0 * cell_quality, 1)
|
||||
else:
|
||||
prof["quality_score"] = None
|
||||
|
||||
# 7) Candidatos a clave.
|
||||
key_candidates = []
|
||||
|
||||
Reference in New Issue
Block a user