merge: 4b calidad — nueva formula (completeness 0.6+validity 0.4, dataset row_uniqueness, outliers fuera a Observaciones, sin doble conteo) report 2046 (verificado met)
This commit is contained in:
@@ -1,22 +1,26 @@
|
|||||||
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
||||||
|
|
||||||
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
||||||
chapter answers, in Spanish and as tables, the three things the user asked for:
|
chapter implements the quality model of report 2046:
|
||||||
|
|
||||||
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
|
1. **En qué se basa la calidad** — an intro paragraph explaining the two scored
|
||||||
their weights (completeness, validity, consistency) before any number, plus a
|
dimensions and their weights (completitud 60%, validez 40%) plus the
|
||||||
table-level summary (global score and aggregates).
|
table-level row uniqueness, BEFORE any number, and stating explicitly that
|
||||||
|
outliers are reported as observations and do **not** lower the score. The
|
||||||
|
criteria terms (calidad de datos, completitud, validez, unicidad de registro)
|
||||||
|
are hooked into the shared glossary as clickable jumps.
|
||||||
2. **Scores por columna** — a table with, per column, the total quality score and
|
2. **Scores por columna** — a table with, per column, the total quality score and
|
||||||
its breakdown into completeness / validity / consistency.
|
its breakdown into completeness / validity (no consistency dimension).
|
||||||
3. **Problemas en español** — a second table listing, per column, the readable
|
3. **Problemas de calidad** — a table listing ONLY real quality defects
|
||||||
issues in Spanish (kept separate from the type ``flags``).
|
(nulls, empty cells, values not conforming to their type/semantics).
|
||||||
|
4. **Observaciones analíticas** — a SEPARATE table for outliers, constant
|
||||||
|
columns, high-cardinality ids and strong skew, with an explicit note that
|
||||||
|
these do not affect the score.
|
||||||
|
|
||||||
The breakdown and the issues are NOT recomputed here: they come from the registry
|
The breakdown, issues and observations are NOT recomputed here: they come from
|
||||||
function ``column_quality_score`` (group ``eda``), which already derives
|
the registry function ``column_quality_score`` (group ``eda``), which derives
|
||||||
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
|
``{score, completeness, validity, dimensions, applicable, issues,
|
||||||
This chapter is render-only — it consumes that function and lays the result out
|
observations}`` from the ColumnProfile. This chapter is render-only.
|
||||||
as model blocks; the renderers paginate tables (splitting by rows, repeating the
|
|
||||||
header) and wrap long cells so nothing is ever cut.
|
|
||||||
|
|
||||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||||
"""
|
"""
|
||||||
@@ -33,28 +37,47 @@ try: # pragma: no cover - import wiring
|
|||||||
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
||||||
_column_quality_score = None
|
_column_quality_score = None
|
||||||
|
|
||||||
CHAPTER_VERSION = "1.0.0"
|
CHAPTER_VERSION = "2.0.0"
|
||||||
CHAPTER_ID = "calidad"
|
CHAPTER_ID = "calidad"
|
||||||
CHAPTER_TITLE = "Calidad"
|
CHAPTER_TITLE = "Calidad"
|
||||||
|
|
||||||
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
|
# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
|
||||||
# consistency 0.2. Kept here only to render the human explanation; the actual
|
# collector and marked clickable on their first appearance (contract §11.1).
|
||||||
# numbers always come from the function so the two never drift in computation.
|
_TERMS = {
|
||||||
_CRITERIA_INTRO = (
|
"calidad_datos": (
|
||||||
"La calidad de cada columna es un score de 0 a 100 que combina tres "
|
"Calidad de datos (score 0-100)",
|
||||||
"criterios, cada uno con un peso:\n\n"
|
"Mide hasta qué punto los datos están presentes y son utilizables tal "
|
||||||
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
|
"cual, no si son «buenos para el análisis». Se compone solo de "
|
||||||
"ni vacíos). Una columna con muchos nulos baja de score.\n"
|
"dimensiones medibles automáticamente desde el perfil de la tabla, sin "
|
||||||
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
|
"fuente externa de verdad: completitud (60%), validez (40%, cuando es "
|
||||||
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
|
"medible) y, a nivel de tabla, unicidad de registro. Los valores "
|
||||||
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
|
"atípicos NO bajan la calidad: se listan aparte como observaciones.",
|
||||||
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
|
),
|
||||||
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
|
"completitud": (
|
||||||
"Los problemas detectados por columna se listan en español más abajo."
|
"Completitud",
|
||||||
)
|
"Proporción de valores realmente presentes en una columna (1 − % de "
|
||||||
|
"nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
|
||||||
|
"nulos y vacíos bajan el score porque falta información que debería "
|
||||||
|
"estar. Pesa el 60% del score de columna.",
|
||||||
|
),
|
||||||
|
"validez": (
|
||||||
|
"Validez",
|
||||||
|
"Proporción de valores que encajan con su tipo o formato esperado: un "
|
||||||
|
"número que parsea, una fecha legible, un email con forma de email. Los "
|
||||||
|
"valores que no parsean a su tipo bajan el score. Si la columna es texto "
|
||||||
|
"libre sin formato esperado, la validez no se puede medir y el score se "
|
||||||
|
"basa solo en la completitud. Pesa el 40% del score cuando es medible.",
|
||||||
|
),
|
||||||
|
"unicidad_registro": (
|
||||||
|
"Unicidad de registro",
|
||||||
|
"A nivel de tabla, las filas duplicadas restan calidad al conjunto "
|
||||||
|
"(1 − % de filas duplicadas). Es distinta de que una columna no-clave "
|
||||||
|
"repita valores, que no es un defecto de calidad.",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
# Cap for the joined issues cell so a single row never grows taller than a page;
|
# Cap for the joined cell so a single row never grows taller than a page; the
|
||||||
# the remainder is summarized as "(+N más)" instead of being silently dropped.
|
# remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||||
_ISSUES_MAXLEN = 160
|
_ISSUES_MAXLEN = 160
|
||||||
|
|
||||||
|
|
||||||
@@ -82,12 +105,19 @@ def _fmt_unit_pct(value) -> str:
|
|||||||
return str(value)
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_validity(value) -> str:
|
||||||
|
"""Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
|
||||||
|
if value is None:
|
||||||
|
return "n/a"
|
||||||
|
return _fmt_unit_pct(value)
|
||||||
|
|
||||||
|
|
||||||
def _quality_of(col: dict) -> dict:
|
def _quality_of(col: dict) -> dict:
|
||||||
"""Return ``{score, completeness, validity, consistency, issues}`` for a column.
|
"""Return the quality dict for a column.
|
||||||
|
|
||||||
Uses the registry ``column_quality_score`` when available; otherwise falls
|
Uses the registry ``column_quality_score`` when available; otherwise falls
|
||||||
back to the per-column ``quality_score`` already in the profile (number only,
|
back to the per-column ``quality_score`` already in the profile (number only,
|
||||||
empty breakdown/issues). Never raises.
|
empty breakdown/issues/observations). Never raises.
|
||||||
"""
|
"""
|
||||||
if not isinstance(col, dict):
|
if not isinstance(col, dict):
|
||||||
col = {}
|
col = {}
|
||||||
@@ -98,26 +128,25 @@ def _quality_of(col: dict) -> dict:
|
|||||||
return res
|
return res
|
||||||
except Exception: # noqa: BLE001 - degrade instead of aborting.
|
except Exception: # noqa: BLE001 - degrade instead of aborting.
|
||||||
pass
|
pass
|
||||||
# Fallback: only the final score is available pre-computed in the profile.
|
|
||||||
return {
|
return {
|
||||||
"score": col.get("quality_score"),
|
"score": col.get("quality_score"),
|
||||||
"completeness": None,
|
"completeness": None,
|
||||||
"validity": None,
|
"validity": None,
|
||||||
"consistency": None,
|
|
||||||
"issues": [],
|
"issues": [],
|
||||||
|
"observations": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _join_issues(issues) -> str:
|
def _join_cells(items) -> str:
|
||||||
"""Join Spanish issue strings into one cell, truncating overly long lists.
|
"""Join Spanish strings into one cell, truncating overly long lists.
|
||||||
|
|
||||||
The renderer wraps cell text, but a column with many long issues could make a
|
The renderer wraps cell text, but a column with many long entries could make
|
||||||
single row taller than a whole page; cap the length and append ``(+N más)``
|
a single row taller than a whole page; cap the length and append ``(+N más)``
|
||||||
so the count of hidden issues is honest rather than silently lost.
|
so the count of hidden entries is honest rather than silently lost.
|
||||||
"""
|
"""
|
||||||
if not isinstance(issues, (list, tuple)) or not issues:
|
if not isinstance(items, (list, tuple)) or not items:
|
||||||
return ""
|
return ""
|
||||||
parts = [model._safe_str(i).strip() for i in issues]
|
parts = [model._safe_str(i).strip() for i in items]
|
||||||
parts = [p for p in parts if p]
|
parts = [p for p in parts if p]
|
||||||
if not parts:
|
if not parts:
|
||||||
return ""
|
return ""
|
||||||
@@ -142,6 +171,33 @@ def _columns_with_quality(profile: dict):
|
|||||||
yield c, _quality_of(c)
|
yield c, _quality_of(c)
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_unit_pct_or_pct(value) -> str:
|
||||||
|
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||||
|
try:
|
||||||
|
num = float(value)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return model._safe_str(value)
|
||||||
|
if num != num: # NaN
|
||||||
|
return "—"
|
||||||
|
pct = num * 100 if num <= 1.0 else num
|
||||||
|
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||||
|
return f"{text}%"
|
||||||
|
|
||||||
|
|
||||||
|
def _row_uniqueness(profile: dict):
|
||||||
|
"""Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown."""
|
||||||
|
dup = profile.get("duplicate_pct")
|
||||||
|
if dup is None:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
d = float(dup)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
if d > 1.0: # tolerate a 0-100 scale
|
||||||
|
d = d / 100.0
|
||||||
|
return max(0.0, min(1.0, 1.0 - d))
|
||||||
|
|
||||||
|
|
||||||
def _summary_block(profile: dict, evaluated: list):
|
def _summary_block(profile: dict, evaluated: list):
|
||||||
"""Table-level KVTable: global score and quality aggregates."""
|
"""Table-level KVTable: global score and quality aggregates."""
|
||||||
rows = []
|
rows = []
|
||||||
@@ -153,14 +209,15 @@ def _summary_block(profile: dict, evaluated: list):
|
|||||||
if isinstance(q.get("completeness"), (int, float))]
|
if isinstance(q.get("completeness"), (int, float))]
|
||||||
vals = [q.get("validity") for _, q in evaluated
|
vals = [q.get("validity") for _, q in evaluated
|
||||||
if isinstance(q.get("validity"), (int, float))]
|
if isinstance(q.get("validity"), (int, float))]
|
||||||
cons = [q.get("consistency") for _, q in evaluated
|
|
||||||
if isinstance(q.get("consistency"), (int, float))]
|
|
||||||
if comps:
|
if comps:
|
||||||
rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
|
rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
|
||||||
if vals:
|
if vals:
|
||||||
rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals))))
|
rows.append(("Validez media (donde aplica)",
|
||||||
if cons:
|
_fmt_unit_pct(sum(vals) / len(vals))))
|
||||||
rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons))))
|
|
||||||
|
ru = _row_uniqueness(profile)
|
||||||
|
if ru is not None:
|
||||||
|
rows.append(("Unicidad de registro", _fmt_unit_pct(ru)))
|
||||||
|
|
||||||
n_problem = sum(1 for _, q in evaluated if q.get("issues"))
|
n_problem = sum(1 for _, q in evaluated if q.get("issues"))
|
||||||
rows.append(("Columnas con problemas", str(n_problem)))
|
rows.append(("Columnas con problemas", str(n_problem)))
|
||||||
@@ -182,22 +239,9 @@ def _summary_block(profile: dict, evaluated: list):
|
|||||||
return model.KVTable(rows=rows, title="Resumen de calidad")
|
return model.KVTable(rows=rows, title="Resumen de calidad")
|
||||||
|
|
||||||
|
|
||||||
def _fmt_unit_pct_or_pct(value) -> str:
|
|
||||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
|
||||||
try:
|
|
||||||
num = float(value)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
return model._safe_str(value)
|
|
||||||
if num != num: # NaN
|
|
||||||
return "—"
|
|
||||||
pct = num * 100 if num <= 1.0 else num
|
|
||||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
|
||||||
return f"{text}%"
|
|
||||||
|
|
||||||
|
|
||||||
def _scores_block(evaluated: list):
|
def _scores_block(evaluated: list):
|
||||||
"""DataTable with per-column score and its three-criteria breakdown."""
|
"""DataTable with per-column score and its completeness/validity breakdown."""
|
||||||
header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
|
header = ["Columna", "Calidad", "Completitud", "Validez"]
|
||||||
rows = []
|
rows = []
|
||||||
# Worst columns first so the reader sees the problems at the top.
|
# Worst columns first so the reader sees the problems at the top.
|
||||||
ordered = sorted(
|
ordered = sorted(
|
||||||
@@ -210,22 +254,22 @@ def _scores_block(evaluated: list):
|
|||||||
col.get("name") or "(col)",
|
col.get("name") or "(col)",
|
||||||
_fmt_score(q.get("score")),
|
_fmt_score(q.get("score")),
|
||||||
_fmt_unit_pct(q.get("completeness")),
|
_fmt_unit_pct(q.get("completeness")),
|
||||||
_fmt_unit_pct(q.get("validity")),
|
_fmt_validity(q.get("validity")),
|
||||||
_fmt_unit_pct(q.get("consistency")),
|
|
||||||
])
|
])
|
||||||
if not rows:
|
if not rows:
|
||||||
return None
|
return None
|
||||||
return model.DataTable(header=header, rows=rows,
|
return model.DataTable(header=header, rows=rows,
|
||||||
title="Scores de calidad por columna",
|
title="Scores de calidad por columna",
|
||||||
note="0 = peor, 100 = mejor; ordenado de peor a mejor")
|
note="0 = peor, 100 = mejor; «n/a» = dimensión no "
|
||||||
|
"medible; ordenado de peor a mejor")
|
||||||
|
|
||||||
|
|
||||||
def _issues_block(evaluated: list):
|
def _issues_block(evaluated: list):
|
||||||
"""DataTable listing Spanish issues per column, or a Note when there are none."""
|
"""DataTable listing ONLY real quality defects per column, or a Note."""
|
||||||
header = ["Columna", "Problemas detectados (español)"]
|
header = ["Columna", "Problemas de calidad (español)"]
|
||||||
rows = []
|
rows = []
|
||||||
for col, q in evaluated:
|
for col, q in evaluated:
|
||||||
joined = _join_issues(q.get("issues"))
|
joined = _join_cells(q.get("issues"))
|
||||||
if joined:
|
if joined:
|
||||||
rows.append([col.get("name") or "(col)", joined])
|
rows.append([col.get("name") or "(col)", joined])
|
||||||
if not rows:
|
if not rows:
|
||||||
@@ -235,6 +279,63 @@ def _issues_block(evaluated: list):
|
|||||||
title="Problemas de calidad por columna")
|
title="Problemas de calidad por columna")
|
||||||
|
|
||||||
|
|
||||||
|
def _observations_block(evaluated: list):
|
||||||
|
"""DataTable listing analytical observations per column, or None.
|
||||||
|
|
||||||
|
Observations (outliers, constant columns, ids, strong skew) are NOT quality
|
||||||
|
defects: they do not affect the score. Returned as a separate table from the
|
||||||
|
issues so the report never presents a legitimate outlier as a problem.
|
||||||
|
"""
|
||||||
|
header = ["Columna", "Observaciones analíticas"]
|
||||||
|
rows = []
|
||||||
|
for col, q in evaluated:
|
||||||
|
joined = _join_cells(q.get("observations"))
|
||||||
|
if joined:
|
||||||
|
rows.append([col.get("name") or "(col)", joined])
|
||||||
|
if not rows:
|
||||||
|
return None
|
||||||
|
return model.DataTable(
|
||||||
|
header=header, rows=rows,
|
||||||
|
title="Observaciones analíticas por columna",
|
||||||
|
note="No son defectos de calidad y NO afectan al score; orientan el "
|
||||||
|
"análisis (atípicos, columnas constantes, identificadores).")
|
||||||
|
|
||||||
|
|
||||||
|
def _term(key: str, label: str, mark: bool) -> str:
|
||||||
|
"""Render a term as a clickable glossary span when marking is enabled."""
|
||||||
|
if mark:
|
||||||
|
return f"[[term:{key}]]**{label}**[[/term]]"
|
||||||
|
return f"**{label}**"
|
||||||
|
|
||||||
|
|
||||||
|
def _criteria_intro(mark: bool) -> str:
|
||||||
|
"""Intro paragraph explaining the two scored dimensions and the principle."""
|
||||||
|
calidad = _term("calidad_datos", "calidad de datos", mark)
|
||||||
|
completitud = _term("completitud", "Completitud (peso 60%)", mark)
|
||||||
|
validez = _term("validez", "Validez (peso 40%, cuando es medible)", mark)
|
||||||
|
unicidad = _term("unicidad_registro", "unicidad de registro", mark)
|
||||||
|
return (
|
||||||
|
f"La {calidad} de cada columna es un score de 0 a 100 que combina solo "
|
||||||
|
"dimensiones medibles desde el perfil de la tabla, sin fuente externa "
|
||||||
|
"de verdad:\n\n"
|
||||||
|
f"- {completitud}: proporción de valores presentes (1 − % de nulos; en "
|
||||||
|
"texto, las celdas vacías cuentan como faltantes). Los nulos y vacíos "
|
||||||
|
"bajan el score.\n"
|
||||||
|
f"- {validez}: proporción de valores que encajan con su tipo o formato "
|
||||||
|
"(un número que parsea, una fecha legible, un email con forma de email). "
|
||||||
|
"Si una columna es texto libre sin formato esperado, la validez no se "
|
||||||
|
"mide y el score se basa solo en la completitud.\n\n"
|
||||||
|
f"Score de columna = 100 × (0,6·completitud + 0,4·validez), "
|
||||||
|
"renormalizado cuando la validez no aplica. A nivel de tabla se añade "
|
||||||
|
f"la {unicidad} (1 − % de filas duplicadas).\n\n"
|
||||||
|
"**Los valores atípicos (outliers) NO bajan la calidad.** Un valor "
|
||||||
|
"extremo puede ser real y correcto; detectar atípicos es parte del "
|
||||||
|
"análisis de la distribución, no un juicio de corrección. Por eso, junto "
|
||||||
|
"con las columnas constantes y los identificadores, se listan aparte "
|
||||||
|
"como **observaciones analíticas** que no afectan al score."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def build_calidad(profile: dict, ctx: dict):
|
def build_calidad(profile: dict, ctx: dict):
|
||||||
"""Build the data-quality Chapter, or None if the profile has no columns.
|
"""Build the data-quality Chapter, or None if the profile has no columns.
|
||||||
|
|
||||||
@@ -250,17 +351,35 @@ def build_calidad(profile: dict, ctx: dict):
|
|||||||
if not evaluated:
|
if not evaluated:
|
||||||
return None # no columns to score -> chapter does not apply.
|
return None # no columns to score -> chapter does not apply.
|
||||||
|
|
||||||
|
# Register the criteria terms in the shared glossary (if present) and mark
|
||||||
|
# their first appearance clickable. Contract §11.1.
|
||||||
|
glossary = ctx.get("glossary")
|
||||||
|
mark = False
|
||||||
|
if isinstance(glossary, model.GlossaryCollector):
|
||||||
|
for key, (label, definition) in _TERMS.items():
|
||||||
|
glossary.add(key, label, definition)
|
||||||
|
mark = True
|
||||||
|
|
||||||
blocks = [
|
blocks = [
|
||||||
model.Heading(text="Cómo se calcula la calidad", level=2),
|
model.Heading(text="Cómo se calcula la calidad", level=2),
|
||||||
model.Markdown(text=_CRITERIA_INTRO),
|
model.Markdown(text=_criteria_intro(mark)),
|
||||||
_summary_block(profile, evaluated),
|
_summary_block(profile, evaluated),
|
||||||
model.Heading(text="Scores por columna", level=2),
|
model.Heading(text="Scores por columna", level=2),
|
||||||
]
|
]
|
||||||
scores = _scores_block(evaluated)
|
scores = _scores_block(evaluated)
|
||||||
if scores is not None:
|
if scores is not None:
|
||||||
blocks.append(scores)
|
blocks.append(scores)
|
||||||
blocks.append(model.Heading(text="Problemas detectados", level=2))
|
|
||||||
|
blocks.append(model.Heading(text="Problemas de calidad", level=2))
|
||||||
blocks.append(_issues_block(evaluated))
|
blocks.append(_issues_block(evaluated))
|
||||||
|
|
||||||
|
observations = _observations_block(evaluated)
|
||||||
|
if observations is not None:
|
||||||
|
blocks.append(model.Heading(text="Observaciones analíticas", level=2))
|
||||||
|
blocks.append(model.Note(
|
||||||
|
"Las observaciones siguientes NO son defectos de calidad y no "
|
||||||
|
"afectan al score: son señales para orientar el análisis."))
|
||||||
|
blocks.append(observations)
|
||||||
|
|
||||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||||
version=CHAPTER_VERSION, blocks=blocks)
|
version=CHAPTER_VERSION, blocks=blocks)
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
|
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut + glossary.
|
||||||
|
|
||||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||||
and deterministic. Verifies that the chapter explains the quality criteria, shows
|
and deterministic. Verifies the report-2046 quality model: the chapter explains
|
||||||
per-column scores with the completeness/validity/consistency breakdown, lists the
|
the two scored dimensions (completitud 60% / validez 40%), shows per-column
|
||||||
issues in Spanish (separate from the type flags), returns None when it does not
|
scores without a consistency column, keeps quality DEFECTS (issues) separate
|
||||||
apply, and that a wide profile with long names renders to PDF and PPTX without
|
from analytical OBSERVATIONS (outliers, constant, ids), hooks the criteria terms
|
||||||
cutting any cell text (long content wraps, it is never truncated).
|
into the glossary, returns None when it does not apply, and renders a wide
|
||||||
|
profile to PDF and PPTX without cutting any cell text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -20,28 +21,30 @@ from datascience.automatic_eda.chapters.calidad import (
|
|||||||
CHAPTER_VERSION,
|
CHAPTER_VERSION,
|
||||||
)
|
)
|
||||||
from datascience.automatic_eda import build_document, render_pdf, render_pptx
|
from datascience.automatic_eda import build_document, render_pdf, render_pptx
|
||||||
|
from datascience.automatic_eda import model
|
||||||
|
|
||||||
|
|
||||||
def _profile() -> dict:
|
def _profile() -> dict:
|
||||||
"""A small profile with one column per quality problem (nulls, outliers,
|
"""A small profile with one column per quality problem (nulls, outliers,
|
||||||
constant, high-cardinality id) plus one clean column."""
|
constant, high-cardinality id) plus one clean column. ``outlier_pct`` is in
|
||||||
|
the 0-100 scale that describe_numeric actually emits."""
|
||||||
return {
|
return {
|
||||||
"table": "demo",
|
"table": "demo",
|
||||||
"quality_score": 72.5,
|
"quality_score": 82.0,
|
||||||
"duplicate_pct": 0.04,
|
"duplicate_pct": 0.04,
|
||||||
"null_cell_pct": 0.11,
|
"null_cell_pct": 0.11,
|
||||||
"constant_cols": ["flag_const"],
|
"constant_cols": ["flag_const"],
|
||||||
"all_null_cols": [],
|
"all_null_cols": [],
|
||||||
"columns": [
|
"columns": [
|
||||||
{"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
|
{"name": "edad", "inferred_type": "numeric", "null_pct": 0.2,
|
||||||
"numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
|
"n_rows": 100, "unique_pct": 0.5,
|
||||||
"quality_score": 60},
|
"numeric": {"outlier_pct": 15.0, "min": 0, "max": 99}},
|
||||||
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
|
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
|
||||||
"unique_pct": 0.98, "quality_score": 80},
|
"unique_pct": 0.98, "flags": ["possible_id"]},
|
||||||
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
|
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
|
||||||
"flags": ["constant"], "quality_score": 50},
|
"unique_pct": 0.01, "flags": ["constant"]},
|
||||||
{"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
|
{"name": "limpia", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
"numeric": {"outlier_pct": 0.0}, "quality_score": 100},
|
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,16 +53,9 @@ def _tables(chapter):
|
|||||||
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
|
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
|
||||||
|
|
||||||
|
|
||||||
def _scores_table(chapter):
|
def _table_by_title(chapter, needle):
|
||||||
for t in _tables(chapter):
|
for t in _tables(chapter):
|
||||||
if "Scores" in (t.title or ""):
|
if needle in (t.title or ""):
|
||||||
return t
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _issues_table(chapter):
|
|
||||||
for t in _tables(chapter):
|
|
||||||
if "Problemas" in (t.title or ""):
|
|
||||||
return t
|
return t
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -73,41 +69,84 @@ def test_golden_chapter_estructura_y_version():
|
|||||||
assert ch.id == "calidad"
|
assert ch.id == "calidad"
|
||||||
assert ch.version == CHAPTER_VERSION
|
assert ch.version == CHAPTER_VERSION
|
||||||
kinds = [b.kind for b in ch.blocks]
|
kinds = [b.kind for b in ch.blocks]
|
||||||
# intro heading + markdown criteria + summary kv + scores table + issues table
|
|
||||||
assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds
|
assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds
|
||||||
|
|
||||||
|
|
||||||
def test_golden_intro_explica_criterios_y_pesos():
|
def test_golden_intro_explica_dos_dimensiones_y_pesos():
|
||||||
ch = build_calidad(_profile(), {})
|
ch = build_calidad(_profile(), {})
|
||||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||||
for needle in ("Completitud", "Validez", "Consistencia",
|
for needle in ("Completitud", "Validez", "60%", "40%",
|
||||||
"50%", "30%", "20%"):
|
"unicidad de registro"):
|
||||||
assert needle in intro, f"falta {needle!r} en la intro de criterios"
|
assert needle in intro, f"falta {needle!r} en la intro de criterios"
|
||||||
|
# El principio: los outliers NO bajan la calidad.
|
||||||
|
assert "atípicos" in intro and "NO bajan" in intro
|
||||||
|
# Ya no se menciona la dimensión consistencia eliminada.
|
||||||
|
assert "20%" not in intro
|
||||||
|
|
||||||
|
|
||||||
def test_golden_scores_incluyen_desglose_por_criterio():
|
def test_golden_scores_sin_columna_consistencia():
|
||||||
ch = build_calidad(_profile(), {})
|
ch = build_calidad(_profile(), {})
|
||||||
scores = _scores_table(ch)
|
scores = _table_by_title(ch, "Scores")
|
||||||
assert scores is not None
|
assert scores is not None
|
||||||
assert scores.header == ["Columna", "Calidad", "Completitud",
|
assert scores.header == ["Columna", "Calidad", "Completitud", "Validez"]
|
||||||
"Validez", "Consistencia"]
|
assert "Consistencia" not in scores.header
|
||||||
# 4 columns scored, none dropped.
|
|
||||||
assert len(scores.rows) == 4
|
assert len(scores.rows) == 4
|
||||||
names = {r[0] for r in scores.rows}
|
names = {r[0] for r in scores.rows}
|
||||||
assert names == {"edad", "nombre", "flag_const", "limpia"}
|
assert names == {"edad", "nombre", "flag_const", "limpia"}
|
||||||
|
|
||||||
|
|
||||||
def test_golden_issues_en_espanol_separados_de_flags():
|
def test_golden_outliers_en_observaciones_no_en_problemas():
|
||||||
ch = build_calidad(_profile(), {})
|
ch = build_calidad(_profile(), {})
|
||||||
issues = _issues_table(ch)
|
problemas = _table_by_title(ch, "Problemas de calidad")
|
||||||
assert issues is not None
|
observaciones = _table_by_title(ch, "Observaciones")
|
||||||
flat = " | ".join(" ".join(r) for r in issues.rows)
|
assert problemas is not None
|
||||||
assert "nulos" in flat # completeness issue (ES)
|
assert observaciones is not None
|
||||||
assert "outliers" in flat # validity issue (ES)
|
|
||||||
assert "columna constante" in flat
|
problemas_txt = " | ".join(" ".join(r) for r in problemas.rows)
|
||||||
assert "posible id de alta cardinalidad" in flat
|
observaciones_txt = " | ".join(" ".join(r) for r in observaciones.rows)
|
||||||
# The raw type flag string must NOT leak as a "problem".
|
|
||||||
assert "constant" not in flat or "columna constante" in flat
|
# Los nulos SÍ son problema de calidad.
|
||||||
|
assert "nulos" in problemas_txt
|
||||||
|
# Los outliers NO aparecen como problema...
|
||||||
|
assert "atípic" not in problemas_txt and "outlier" not in problemas_txt
|
||||||
|
# ...sino como observación analítica.
|
||||||
|
assert "atípic" in observaciones_txt
|
||||||
|
# Constante e id: observaciones, no problemas.
|
||||||
|
assert "constante" in observaciones_txt
|
||||||
|
assert "identificador" in observaciones_txt
|
||||||
|
assert "constante" not in problemas_txt
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_score_columna_limpia_es_100():
|
||||||
|
"""Columna sin nulos, numérica nativa: score 100 aunque tenga (o no) outliers."""
|
||||||
|
ch = build_calidad(_profile(), {})
|
||||||
|
scores = _table_by_title(ch, "Scores")
|
||||||
|
by_name = {r[0]: r for r in scores.rows}
|
||||||
|
assert by_name["limpia"][1] == "100 / 100"
|
||||||
|
# edad: 20% nulos -> 100*(0.6*0.8 + 0.4*1.0) = 88; los outliers no bajan nada.
|
||||||
|
assert by_name["edad"][1] == "88 / 100"
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Glosario (contrato §11.1)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_glosario_registra_los_cuatro_terminos_y_marca_clicable():
|
||||||
|
glossary = model.GlossaryCollector()
|
||||||
|
ch = build_calidad(_profile(), {"glossary": glossary})
|
||||||
|
for key in ("calidad_datos", "completitud", "validez", "unicidad_registro"):
|
||||||
|
assert glossary.has(key), f"término {key!r} no registrado en el glosario"
|
||||||
|
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||||
|
# Con colector presente, la primera aparición se marca clicable.
|
||||||
|
assert "[[term:completitud]]" in intro
|
||||||
|
assert "[[term:validez]]" in intro
|
||||||
|
assert "[[term:calidad_datos]]" in intro
|
||||||
|
assert "[[term:unicidad_registro]]" in intro
|
||||||
|
|
||||||
|
|
||||||
|
def test_sin_glosario_no_marca_terminos():
|
||||||
|
ch = build_calidad(_profile(), {}) # ctx sin glossary
|
||||||
|
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||||
|
assert "[[term:" not in intro
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
@@ -124,17 +163,17 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
|
|||||||
prof = {
|
prof = {
|
||||||
"quality_score": 100,
|
"quality_score": 100,
|
||||||
"columns": [
|
"columns": [
|
||||||
{"name": "a", "inferred_type": "float", "null_pct": 0.0,
|
{"name": "a", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
"numeric": {"outlier_pct": 0.0}},
|
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||||
{"name": "b", "inferred_type": "float", "null_pct": 0.0,
|
{"name": "b", "inferred_type": "numeric", "null_pct": 0.0,
|
||||||
"numeric": {"outlier_pct": 0.0}},
|
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
ch = build_calidad(prof, {})
|
ch = build_calidad(prof, {})
|
||||||
assert ch is not None
|
assert ch is not None
|
||||||
assert _issues_table(ch) is None # no issues table
|
assert _table_by_title(ch, "Problemas de calidad") is None # no issues table
|
||||||
notes = [b for b in ch.blocks if b.kind == "note"]
|
notes = [b for b in ch.blocks if b.kind == "note"]
|
||||||
assert notes and "No se detectaron problemas" in notes[0].text
|
assert any("No se detectaron problemas" in n.text for n in notes)
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
@@ -143,44 +182,42 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
|
|||||||
def _wide_profile(ncols: int = 22) -> dict:
|
def _wide_profile(ncols: int = 22) -> dict:
|
||||||
cols = [
|
cols = [
|
||||||
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
|
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
|
||||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
|
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99,
|
||||||
|
"flags": ["possible_id"]},
|
||||||
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
|
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
|
||||||
"inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
|
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.01,
|
||||||
|
"flags": ["constant"]},
|
||||||
]
|
]
|
||||||
for k in range(ncols - 2):
|
for k in range(ncols - 2):
|
||||||
cols.append({
|
cols.append({
|
||||||
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
|
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
|
||||||
"inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
|
"inferred_type": "numeric", "null_pct": 0.1 + (k % 3) * 0.05,
|
||||||
"numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
|
"unique_pct": 0.5,
|
||||||
|
"numeric": {"outlier_pct": 8.0, "min": 0, "max": 1000},
|
||||||
})
|
})
|
||||||
return {"table": "ancha", "quality_score": 70.0, "columns": cols}
|
return {"table": "ancha", "quality_score": 70.0, "duplicate_pct": 0.0,
|
||||||
|
"columns": cols}
|
||||||
|
|
||||||
|
|
||||||
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
|
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
|
||||||
prof = _wide_profile(22)
|
prof = _wide_profile(22)
|
||||||
full = build_document(prof, {"dataset_name": "ancha"})
|
full = build_document(prof, {"dataset_name": "ancha"})
|
||||||
assert any(c.id == "calidad" for c in full)
|
assert any(c.id == "calidad" for c in full)
|
||||||
# Render ONLY the calidad chapter so the anti-cut assertions are scoped to
|
|
||||||
# this chapter (other chapters, e.g. portada, legitimately contain '…').
|
|
||||||
chapters = [c for c in full if c.id == "calidad"]
|
chapters = [c for c in full if c.id == "calidad"]
|
||||||
long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
|
long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
|
||||||
with tempfile.TemporaryDirectory() as d:
|
with tempfile.TemporaryDirectory() as d:
|
||||||
pdf = os.path.join(d, "q.pdf")
|
pdf = os.path.join(d, "q.pdf")
|
||||||
pptx = os.path.join(d, "q.pptx")
|
pptx = os.path.join(d, "q.pptx")
|
||||||
rp = render_pdf(chapters, pdf, {"title": "EDA"})
|
rp = render_pdf(chapters, pdf, {"title": "EDA"})
|
||||||
rx = render_pptx(chapters, pptx, {"title": "EDA"})
|
render_pptx(chapters, pptx, {"title": "EDA"})
|
||||||
assert os.path.exists(pdf) and os.path.exists(pptx)
|
assert os.path.exists(pdf) and os.path.exists(pptx)
|
||||||
# The wide table forces pagination across several pages/slides.
|
|
||||||
assert (rp or {}).get("n_pages", 0) >= 2
|
assert (rp or {}).get("n_pages", 0) >= 2
|
||||||
|
|
||||||
# PDF: the long name survives whole once wraps (spaces/newlines) removed,
|
|
||||||
# and there is no truncation marker.
|
|
||||||
pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
|
pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
|
||||||
assert "…" not in pdf_txt and "..." not in pdf_txt
|
assert "…" not in pdf_txt and "..." not in pdf_txt
|
||||||
norm = re.sub(r"\s+", "", pdf_txt)
|
norm = re.sub(r"\s+", "", pdf_txt)
|
||||||
assert long_name in norm, "el nombre largo se cortó en el PDF"
|
assert long_name in norm, "el nombre largo se cortó en el PDF"
|
||||||
|
|
||||||
# PPTX: long name present in some cell, untruncated.
|
|
||||||
allt = []
|
allt = []
|
||||||
for s in Presentation(pptx).slides:
|
for s in Presentation(pptx).slides:
|
||||||
for sh in s.shapes:
|
for sh in s.shapes:
|
||||||
|
|||||||
@@ -4,10 +4,10 @@ name: column_quality_score
|
|||||||
kind: function
|
kind: function
|
||||||
lang: py
|
lang: py
|
||||||
domain: datascience
|
domain: datascience
|
||||||
version: "1.0.0"
|
version: "2.0.0"
|
||||||
purity: pure
|
purity: pure
|
||||||
signature: "def column_quality_score(col: dict) -> dict"
|
signature: "def column_quality_score(col: dict) -> dict"
|
||||||
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda, con desglose completeness/validity/consistency y lista de issues legibles. Funcion pura, no muta el input."
|
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda. Combina completeness (0.6) y validity (0.4) con renormalizacion por aplicabilidad; los outliers, columnas constantes e ids NO bajan el score (van a observations). Devuelve desglose por dimension, issues (defectos) y observations (señales analiticas). Funcion pura, no muta el input."
|
||||||
tags: [eda, data-quality, profiling, scoring, datascience]
|
tags: [eda, data-quality, profiling, scoring, datascience]
|
||||||
uses_functions: []
|
uses_functions: []
|
||||||
uses_types: []
|
uses_types: []
|
||||||
@@ -17,20 +17,26 @@ error_type: ""
|
|||||||
imports: []
|
imports: []
|
||||||
example: |
|
example: |
|
||||||
from datascience import column_quality_score
|
from datascience import column_quality_score
|
||||||
col = {"name": "precio", "inferred_type": "float", "null_pct": 0.2,
|
col = {"name": "precio", "inferred_type": "numeric", "null_pct": 0.2,
|
||||||
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 0.08}}
|
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 8.0}}
|
||||||
column_quality_score(col)
|
column_quality_score(col)
|
||||||
# {"score": 86.8, "completeness": 0.8, "validity": 0.92,
|
# {"score": 88.0, "completeness": 0.8, "validity": 1.0,
|
||||||
# "consistency": 1.0, "issues": ["20% nulos", "8% outliers"]}
|
# "applicable": ["completeness", "validity"], "issues": ["20% nulos"],
|
||||||
|
# "observations": ["8% de valores atípicos (z-score>3): ..."]}
|
||||||
tested: true
|
tested: true
|
||||||
tests:
|
tests:
|
||||||
- "test_clean_column_high_score"
|
- "test_clean_column_high_score"
|
||||||
- "test_half_null_lowers_completeness_and_score"
|
- "test_weights_60_40_native_type"
|
||||||
- "test_constant_column_flags_issue"
|
- "test_outliers_do_not_penalize_score"
|
||||||
|
- "test_nulls_lower_score_more_than_outliers"
|
||||||
|
- "test_validity_from_parse_rate_lowers_score"
|
||||||
|
- "test_validity_from_match_rate"
|
||||||
|
- "test_free_text_renormalizes_to_completeness_only"
|
||||||
|
- "test_all_null_column_scores_zero"
|
||||||
|
- "test_constant_column_scores_full_and_is_observation"
|
||||||
|
- "test_high_cardinality_id_scores_full_and_is_observation"
|
||||||
|
- "test_mostly_null_no_double_counts_validity"
|
||||||
- "test_empty_dict_does_not_crash"
|
- "test_empty_dict_does_not_crash"
|
||||||
- "test_outliers_penalize_validity"
|
|
||||||
- "test_mostly_null_flag_halves_validity"
|
|
||||||
- "test_high_cardinality_text_flagged_as_id"
|
|
||||||
- "test_none_values_treated_defensively"
|
- "test_none_values_treated_defensively"
|
||||||
- "test_does_not_mutate_input"
|
- "test_does_not_mutate_input"
|
||||||
test_file_path: "python/functions/datascience/column_quality_score_test.py"
|
test_file_path: "python/functions/datascience/column_quality_score_test.py"
|
||||||
@@ -38,16 +44,22 @@ file_path: "python/functions/datascience/column_quality_score.py"
|
|||||||
params:
|
params:
|
||||||
- name: col
|
- name: col
|
||||||
desc: >
|
desc: >
|
||||||
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb).
|
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb /
|
||||||
Se leen sus claves de forma defensiva con .get(...) y se toleran valores
|
profile_table). Se leen sus claves de forma defensiva con .get(...) y se
|
||||||
None. Claves usadas: null_pct (0-1), inferred_type, semantic_type,
|
toleran valores None. Claves usadas: null_pct (0-1), n_rows, empty_count
|
||||||
unique_pct (0-1), flags (list[str], reconoce "constant"/"mostly_null"),
|
(texto), inferred_type, semantic_type, validity_rate (0-1, lo expone
|
||||||
numeric ({outlier_pct: 0-1, ...}|None) y match_rate (opcional, 0-1).
|
profile_table al promocionar texto a numero/fecha), match_rate (0-1),
|
||||||
|
unique_pct (0-1), flags (list[str], reconoce
|
||||||
|
"constant"/"possible_id"/"high_cardinality") y numeric ({outlier_pct: 0-100,
|
||||||
|
skew, ...}|None).
|
||||||
output: >
|
output: >
|
||||||
dict con score (float 0-100, redondeado a 1 decimal), completeness (0-1),
|
dict con score (float 0-100, 1 decimal), completeness (0-1), validity (0-1 o
|
||||||
validity (0-1), consistency (0-1) e issues (list[str] de descripciones
|
None si no aplicable), dimensions ({completeness, validity}), applicable
|
||||||
legibles de los problemas detectados). score = round(100 * (0.5*completeness
|
(list[str] de dimensiones que entraron en el score), issues (list[str] SOLO de
|
||||||
+ 0.3*validity + 0.2*consistency), 1).
|
defectos de calidad: nulos, vacios, valores no conformes) y observations
|
||||||
|
(list[str] de señales analiticas que NO bajan el score: outliers, columna
|
||||||
|
constante, posible id, asimetria). score = round(100 * (0.6*completeness +
|
||||||
|
0.4*validity) / pesos_aplicables, 1), renormalizado cuando validity no aplica.
|
||||||
---
|
---
|
||||||
|
|
||||||
## Ejemplo
|
## Ejemplo
|
||||||
@@ -59,51 +71,71 @@ from datascience import column_quality_score
|
|||||||
col = {
|
col = {
|
||||||
"name": "precio",
|
"name": "precio",
|
||||||
"physical_type": "DOUBLE",
|
"physical_type": "DOUBLE",
|
||||||
"inferred_type": "float",
|
"inferred_type": "numeric",
|
||||||
"semantic_type": "",
|
"semantic_type": "",
|
||||||
"count": 800,
|
|
||||||
"n_rows": 1000,
|
"n_rows": 1000,
|
||||||
"null_count": 200,
|
"null_count": 200,
|
||||||
"null_pct": 0.20,
|
"null_pct": 0.20,
|
||||||
"distinct_count": 400,
|
"distinct_count": 400,
|
||||||
"unique_pct": 0.40,
|
"unique_pct": 0.40,
|
||||||
"flags": [],
|
"flags": [],
|
||||||
"numeric": {"outlier_pct": 0.08},
|
"numeric": {"outlier_pct": 8.0, "skew": 0.3},
|
||||||
"categorical": None,
|
"categorical": None,
|
||||||
"datetime": None,
|
"datetime": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
column_quality_score(col)
|
column_quality_score(col)
|
||||||
# {
|
# {
|
||||||
# "score": 86.8,
|
# "score": 88.0, # 100 * (0.6*0.8 + 0.4*1.0)
|
||||||
# "completeness": 0.8, # 1 - 0.20
|
# "completeness": 0.8, # 1 - 0.20
|
||||||
# "validity": 0.92, # 1 - min(0.08, 0.3)
|
# "validity": 1.0, # numerica nativa: el tipo es conforme
|
||||||
# "consistency": 1.0,
|
# "dimensions": {"completeness": 0.8, "validity": 1.0},
|
||||||
# "issues": ["20% nulos", "8% outliers"],
|
# "applicable": ["completeness", "validity"],
|
||||||
|
# "issues": ["20% nulos"], # SOLO defectos de calidad
|
||||||
|
# "observations": ["8% de valores atípicos (z-score>3): ..."], # NO bajan score
|
||||||
# }
|
# }
|
||||||
```
|
```
|
||||||
|
|
||||||
## Cuando usarla
|
## Cuando usarla
|
||||||
|
|
||||||
Cuando hayas perfilado una tabla con el grupo `eda` (p.ej.
|
Cuando hayas perfilado una tabla con el grupo `eda` (p.ej.
|
||||||
`summarize_table_duckdb`) y necesites un numero 0-100 por columna para
|
`summarize_table_duckdb` / `profile_table`) y necesites un numero 0-100 por
|
||||||
ordenar/priorizar limpieza de datos, pintar semaforos de calidad en un
|
columna para ordenar/priorizar limpieza de datos, pintar semaforos de calidad,
|
||||||
dashboard, o decidir que columnas descartar antes de modelar. Es la capa de
|
o decidir que columnas descartar antes de modelar. Separa los **defectos de
|
||||||
scoring sobre el ColumnProfile crudo: lee el perfil, no toca los datos.
|
calidad reales** (`issues`: nulos, vacios, valores que no parsean a su tipo) de
|
||||||
|
las **observaciones analiticas** (`observations`: outliers, columnas constantes,
|
||||||
|
ids), que se reportan pero no penalizan. Es la capa de scoring sobre el
|
||||||
|
ColumnProfile crudo: lee el perfil, no toca los datos.
|
||||||
|
|
||||||
## Notas
|
## Gotchas
|
||||||
|
|
||||||
Funcion pura, sin I/O ni dependencias externas, no muta `col`. Lee todas las
|
Funcion pura, sin I/O, no muta `col`. Aun asi conviene saber:
|
||||||
claves con `.get(...)` y tolera que vengan en `None` (un ColumnProfile recien
|
|
||||||
salido de `summarize_table_duckdb` trae muchas claves a `None`), por lo que
|
|
||||||
nunca falla por claves ausentes — un `{}` produce un resultado bien definido.
|
|
||||||
|
|
||||||
Pesos del score: completeness 0.5, validity 0.3, consistency 0.2.
|
- **Los outliers NO bajan el score.** Un valor extremo puede ser real y correcto
|
||||||
|
(un cliente que compra mucho); detectar atipicos es analisis de la
|
||||||
|
distribucion, no un juicio de correccion. Salen en `observations`, no en
|
||||||
|
`issues`. Mismo trato para columnas constantes e identificadores de alta
|
||||||
|
cardinalidad: son observaciones, no defectos.
|
||||||
|
- **`validity` puede ser `None`** (no aplicable): texto libre sin `semantic_type`
|
||||||
|
ni `validity_rate`, o columna 100% nula. En ese caso el score se renormaliza a
|
||||||
|
solo `completeness` (la columna no se premia ni castiga por algo no medible).
|
||||||
|
- **`outlier_pct` se interpreta en escala 0-100** (la que emite
|
||||||
|
`describe_numeric`, z-score>3). Pasar una fraccion 0-1 produce un texto de
|
||||||
|
observacion con el % equivocado, pero NUNCA afecta al score.
|
||||||
|
- **`validity_rate` lo puebla `profile_table`** al promocionar una columna de
|
||||||
|
texto a numero/fecha (fraccion que parsea). Si no esta presente y el tipo es
|
||||||
|
nativo numerico/fecha/bool, `validity = 1.0`.
|
||||||
|
- Sin doble conteo: la falta de datos cuenta solo en `completeness` (el antiguo
|
||||||
|
castigo de `mostly_null` sobre `validity` se elimino).
|
||||||
|
|
||||||
- **completeness** = `1 - null_pct` (None -> 0 nulls -> 1.0).
|
## Capability growth log
|
||||||
- **validity**: parte de 1.0 y penaliza `min(outlier_pct, 0.3)` en columnas
|
|
||||||
numericas, `0.5 * (1 - match_rate)` si hay `semantic_type` declarado con
|
- v2.0.0 (2026-06-30) — nueva formula de calidad (report 2046): pesos 60/40
|
||||||
`match_rate` bajo disponible, y multiplica por 0.5 si el flag `mostly_null`
|
(completeness/validity) con renormalizacion por aplicabilidad; se elimina la
|
||||||
esta presente.
|
dimension `consistency`-como-informatividad y el doble castigo de
|
||||||
- **consistency**: 1.0 salvo flag `constant` (-> 0.3, columna poco informativa)
|
`mostly_null`; los outliers/constantes/ids salen del score a `observations`;
|
||||||
o texto con `unique_pct > 0.9` (-> 0.6, posible id de alta cardinalidad).
|
validity mide conformidad real (parse rate / match rate / tipo nativo). Salida
|
||||||
|
ampliada con `dimensions`, `applicable` y `observations`.
|
||||||
|
- v1.0.0 — version inicial: pesos 50/30/20 (completeness/validity/consistency),
|
||||||
|
los outliers penalizaban validity (con bug de escala) y consistency penalizaba
|
||||||
|
informatividad.
|
||||||
|
|||||||
@@ -1,34 +1,78 @@
|
|||||||
"""Score de calidad de datos (0-100) para un ColumnProfile del grupo eda.
|
"""Score de calidad de datos (0-100) para un ColumnProfile del grupo eda.
|
||||||
|
|
||||||
Funcion pura: dado el perfil de una columna producido por el grupo de
|
Funcion pura: dado el perfil de una columna producido por el grupo de
|
||||||
capacidad `eda` (p.ej. summarize_table_duckdb), calcula un score agregado
|
capacidad `eda` (p.ej. summarize_table_duckdb / profile_table), calcula un
|
||||||
de calidad junto a su desglose en completeness / validity / consistency y
|
score agregado de calidad junto a su desglose por dimension y dos listas
|
||||||
una lista de issues legibles. No realiza I/O ni muta el input.
|
legibles separadas: `issues` (defectos de calidad reales que SI bajan el
|
||||||
|
score) y `observations` (señales analiticas que NO bajan el score). No
|
||||||
|
realiza I/O ni muta el input.
|
||||||
|
|
||||||
|
Modelo (DAMA-DMBOK / ISO 8000), ver report 2046:
|
||||||
|
|
||||||
|
- Solo entran en el score las dimensiones medibles automaticamente desde el
|
||||||
|
perfil, sin fuente externa de verdad: completeness y validity por columna.
|
||||||
|
- Renormalizacion por aplicabilidad: si una dimension no es medible en la
|
||||||
|
columna (texto libre sin semantica -> validity no aplica; columna 100% nula
|
||||||
|
-> validity no medible), se excluye y los pesos se renormalizan sobre las
|
||||||
|
aplicables. Una columna ni se premia ni se castiga por algo no medible.
|
||||||
|
- Sin doble conteo: la falta de datos cuenta solo en completeness (se elimino
|
||||||
|
el antiguo castigo extra de `mostly_null` sobre validity).
|
||||||
|
- Los OUTLIERS NO bajan la calidad. Un valor extremo puede ser real y
|
||||||
|
correcto; detectar atipicos es analisis de la distribucion, no un juicio de
|
||||||
|
coreccion. Outliers, columnas constantes e identificadores de alta
|
||||||
|
cardinalidad pasan a `observations`, nunca a `issues`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# Pesos base de las dimensiones de columna (se renormalizan por aplicabilidad).
|
||||||
|
_W_COMPLETENESS = 0.6
|
||||||
|
_W_VALIDITY = 0.4
|
||||||
|
|
||||||
|
# Tipos inferidos cuyo almacen garantiza la conformidad de tipo (validity=1.0)
|
||||||
|
# cuando NO vienen de una promocion de texto (en cuyo caso manda validity_rate).
|
||||||
|
_NATIVE_TYPED = ("numeric", "integer", "float", "datetime", "date", "boolean", "bool")
|
||||||
|
|
||||||
|
|
||||||
def column_quality_score(col: dict) -> dict:
|
def column_quality_score(col: dict) -> dict:
|
||||||
"""Calcula un score de calidad de datos 0-100 para un ColumnProfile.
|
"""Calcula un score de calidad de datos 0-100 para un ColumnProfile.
|
||||||
|
|
||||||
El score pondera tres dimensiones:
|
El score combina solo dimensiones de calidad medibles desde el perfil, con
|
||||||
- completeness (0.5): proporcion de valores no nulos.
|
renormalizacion por aplicabilidad:
|
||||||
- validity (0.3): ausencia de outliers / heuristicas de validez.
|
|
||||||
- consistency (0.2): la columna aporta informacion (no constante, no ruido).
|
- completeness (peso base 0.6, siempre aplica): proporcion de valores
|
||||||
|
presentes = 1 - null_pct. En texto, las celdas vacias (`empty_count`)
|
||||||
|
tambien cuentan como faltantes.
|
||||||
|
- validity (peso base 0.4, cuando hay un criterio de validacion real):
|
||||||
|
fraccion de valores no nulos conformes a su tipo/semantica. Tipo nativo
|
||||||
|
numerico/fecha/bool = 1.0; texto promovido a numero/fecha = parse rate
|
||||||
|
(`validity_rate`); texto con `semantic_type` regexable = `match_rate`;
|
||||||
|
texto libre o columna 100% nula = NO aplicable (renormaliza a solo
|
||||||
|
completeness).
|
||||||
|
|
||||||
|
Los outliers, columnas constantes, identificadores y asimetria fuerte NO
|
||||||
|
bajan el score: se devuelven en `observations`.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
col: ColumnProfile dict del grupo eda. Se leen las claves de forma
|
col: ColumnProfile dict del grupo eda. Se leen las claves de forma
|
||||||
defensiva con .get(...) y se tolera que muchas vengan en None.
|
defensiva con .get(...) y se tolera que muchas vengan en None.
|
||||||
Claves relevantes: null_pct, inferred_type, semantic_type,
|
Claves relevantes: null_pct (0-1), n_rows, empty_count,
|
||||||
unique_pct, flags (list[str]), numeric ({outlier_pct, ...}|None),
|
inferred_type, semantic_type, validity_rate (0-1, lo expone
|
||||||
match_rate (opcional).
|
profile_table al promocionar texto a numero/fecha), match_rate
|
||||||
|
(0-1), unique_pct (0-1), flags (list[str], reconoce
|
||||||
|
"constant"/"possible_id"/"high_cardinality"), numeric
|
||||||
|
({outlier_pct: 0-100, skew, ...}|None).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict con:
|
dict con:
|
||||||
score (float, 0-100, redondeado a 1 decimal),
|
score (float 0-100, redondeado a 1 decimal),
|
||||||
completeness (float, 0-1),
|
completeness (float 0-1),
|
||||||
validity (float, 0-1),
|
validity (float 0-1 | None si no aplicable),
|
||||||
consistency (float, 0-1),
|
dimensions ({completeness, validity}),
|
||||||
issues (list[str]) descripciones legibles de los problemas.
|
applicable (list[str] de dimensiones que entraron en el score),
|
||||||
|
issues (list[str]) SOLO defectos de calidad (nulos, vacios,
|
||||||
|
valores no conformes a su tipo/semantica),
|
||||||
|
observations (list[str]) señales analiticas que NO bajan el score
|
||||||
|
(outliers, columna constante, posible id, asimetria).
|
||||||
"""
|
"""
|
||||||
if not isinstance(col, dict):
|
if not isinstance(col, dict):
|
||||||
col = {}
|
col = {}
|
||||||
@@ -39,103 +83,153 @@ def column_quality_score(col: dict) -> dict:
|
|||||||
flags = set(flags)
|
flags = set(flags)
|
||||||
|
|
||||||
issues: list[str] = []
|
issues: list[str] = []
|
||||||
|
observations: list[str] = []
|
||||||
|
|
||||||
|
inferred_type = col.get("inferred_type") or ""
|
||||||
|
semantic_type = col.get("semantic_type") or ""
|
||||||
|
|
||||||
# --- completeness -------------------------------------------------
|
# --- completeness -------------------------------------------------
|
||||||
null_pct = col.get("null_pct")
|
# Falta de datos = nulos + (en texto) celdas vacias. Es el unico sitio
|
||||||
if null_pct is None:
|
# donde la falta de datos cuenta: nunca se duplica en validity.
|
||||||
null_pct = 0.0
|
null_pct = _clamp(_num(col.get("null_pct"), 0.0), 0.0, 1.0)
|
||||||
try:
|
|
||||||
null_pct = float(null_pct)
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
null_pct = 0.0
|
|
||||||
null_pct = _clamp(null_pct, 0.0, 1.0)
|
|
||||||
completeness = 1.0 - null_pct
|
completeness = 1.0 - null_pct
|
||||||
if null_pct > 0:
|
if null_pct > 0:
|
||||||
issues.append(f"{round(null_pct * 100)}% nulos")
|
issues.append(f"{_pct(null_pct)} nulos")
|
||||||
|
|
||||||
# --- validity -----------------------------------------------------
|
empty_frac = 0.0
|
||||||
validity = 1.0
|
n_rows = col.get("n_rows")
|
||||||
inferred_type = col.get("inferred_type") or ""
|
empty_count = col.get("empty_count")
|
||||||
|
if (
|
||||||
|
isinstance(n_rows, (int, float)) and not isinstance(n_rows, bool) and n_rows > 0
|
||||||
|
and isinstance(empty_count, (int, float)) and not isinstance(empty_count, bool)
|
||||||
|
and empty_count > 0
|
||||||
|
):
|
||||||
|
empty_frac = _clamp(float(empty_count) / float(n_rows), 0.0, 1.0)
|
||||||
|
completeness = _clamp(completeness - empty_frac, 0.0, 1.0)
|
||||||
|
issues.append(f"{_pct(empty_frac)} vacíos")
|
||||||
|
|
||||||
numeric = col.get("numeric")
|
# --- validity (con renormalizacion por aplicabilidad) -------------
|
||||||
is_numeric = inferred_type in ("integer", "float", "numeric") or isinstance(numeric, dict)
|
# None = no medible -> se excluye del score (no penaliza ni premia).
|
||||||
if isinstance(numeric, dict):
|
validity = None
|
||||||
outlier_pct = numeric.get("outlier_pct")
|
if completeness <= 0.0:
|
||||||
if outlier_pct is not None:
|
# Columna 100% faltante: no hay valores no nulos sobre los que medir
|
||||||
try:
|
# conformidad. validity no aplica -> el score sale solo de completeness
|
||||||
outlier_pct = float(outlier_pct)
|
# (= 0). Es el peor defecto de calidad posible.
|
||||||
except (TypeError, ValueError):
|
validity = None
|
||||||
outlier_pct = 0.0
|
else:
|
||||||
outlier_pct = _clamp(outlier_pct, 0.0, 1.0)
|
validity_rate = col.get("validity_rate")
|
||||||
if outlier_pct > 0:
|
|
||||||
penalty = min(outlier_pct, 0.3)
|
|
||||||
validity -= penalty
|
|
||||||
issues.append(f"{round(outlier_pct * 100)}% outliers")
|
|
||||||
|
|
||||||
# semantic_type declarado pero con baja tasa de match (si la conocemos).
|
|
||||||
semantic_type = col.get("semantic_type") or ""
|
|
||||||
match_rate = col.get("match_rate")
|
match_rate = col.get("match_rate")
|
||||||
if semantic_type and match_rate is not None:
|
if validity_rate is not None:
|
||||||
try:
|
# Texto promovido a numero/fecha: parse rate real de la muestra.
|
||||||
match_rate = float(match_rate)
|
v = _num(validity_rate, None)
|
||||||
except (TypeError, ValueError):
|
if v is not None:
|
||||||
match_rate = None
|
validity = _clamp(v, 0.0, 1.0)
|
||||||
if match_rate is not None:
|
if validity < 1.0:
|
||||||
match_rate = _clamp(match_rate, 0.0, 1.0)
|
kind = (
|
||||||
if match_rate < 1.0:
|
"número" if inferred_type == "numeric"
|
||||||
shortfall = 1.0 - match_rate
|
else "fecha" if inferred_type == "datetime"
|
||||||
validity -= 0.5 * shortfall
|
else inferred_type or "su tipo"
|
||||||
|
)
|
||||||
issues.append(
|
issues.append(
|
||||||
f"semantic_type '{semantic_type}' con baja coincidencia "
|
f"{_pct(1.0 - validity)} no parsea al tipo {kind}"
|
||||||
f"({round(match_rate * 100)}%)"
|
)
|
||||||
|
elif inferred_type in _NATIVE_TYPED:
|
||||||
|
# Tipo nativo garantizado por el almacen: no hay valores que no
|
||||||
|
# parseen. validity = 1.0 (no se confunde con tener outliers).
|
||||||
|
validity = 1.0
|
||||||
|
elif semantic_type and match_rate is not None:
|
||||||
|
v = _num(match_rate, None)
|
||||||
|
if v is not None:
|
||||||
|
validity = _clamp(v, 0.0, 1.0)
|
||||||
|
if validity < 1.0:
|
||||||
|
issues.append(
|
||||||
|
f"{_pct(1.0 - validity)} no casa con el "
|
||||||
|
f"formato «{semantic_type}»"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Texto libre / categorica sin semantica: no hay criterio honesto
|
||||||
|
# de validez. No aplica.
|
||||||
|
validity = None
|
||||||
|
|
||||||
|
# --- observations (NO bajan el score) -----------------------------
|
||||||
|
numeric = col.get("numeric")
|
||||||
|
if isinstance(numeric, dict):
|
||||||
|
# outlier_pct viene en escala 0-100 desde describe_numeric (z-score>3).
|
||||||
|
outlier_pct = _num(numeric.get("outlier_pct"), None)
|
||||||
|
if outlier_pct is not None and outlier_pct >= 0.05:
|
||||||
|
observations.append(
|
||||||
|
f"{_pct(outlier_pct / 100.0)} de valores atípicos (z-score>3): "
|
||||||
|
"revisar si son errores u observaciones legítimas"
|
||||||
|
)
|
||||||
|
skew = _num(numeric.get("skew"), None)
|
||||||
|
if skew is not None and abs(skew) >= 1.0:
|
||||||
|
observations.append(
|
||||||
|
f"asimetría fuerte (skew={round(skew, 2)}): considerar "
|
||||||
|
"re-expresión antes de modelar"
|
||||||
)
|
)
|
||||||
|
|
||||||
if "mostly_null" in flags:
|
|
||||||
validity *= 0.5
|
|
||||||
issues.append("mayoritariamente nula")
|
|
||||||
|
|
||||||
validity = _clamp(validity, 0.0, 1.0)
|
|
||||||
|
|
||||||
# --- consistency --------------------------------------------------
|
|
||||||
consistency = 1.0
|
|
||||||
if "constant" in flags:
|
if "constant" in flags:
|
||||||
consistency = 0.3
|
observations.append(
|
||||||
issues.append("columna constante")
|
"columna constante: aporta poca información para el análisis"
|
||||||
else:
|
)
|
||||||
unique_pct = col.get("unique_pct")
|
|
||||||
if unique_pct is not None:
|
unique_pct = _num(col.get("unique_pct"), None)
|
||||||
try:
|
is_id = (
|
||||||
unique_pct = float(unique_pct)
|
"possible_id" in flags
|
||||||
except (TypeError, ValueError):
|
or "high_cardinality" in flags
|
||||||
unique_pct = None
|
or (
|
||||||
if (
|
inferred_type in ("text", "categorical")
|
||||||
inferred_type == "text"
|
|
||||||
and unique_pct is not None
|
and unique_pct is not None
|
||||||
and _clamp(unique_pct, 0.0, 1.0) > 0.9
|
and _clamp(unique_pct, 0.0, 1.0) > 0.9
|
||||||
):
|
)
|
||||||
consistency = 0.6
|
)
|
||||||
issues.append("posible id de alta cardinalidad")
|
if is_id:
|
||||||
|
observations.append(
|
||||||
consistency = _clamp(consistency, 0.0, 1.0)
|
"valores casi únicos: posible identificador (no es un defecto de calidad)"
|
||||||
|
|
||||||
# --- score agregado ----------------------------------------------
|
|
||||||
score = round(
|
|
||||||
100.0 * (0.5 * completeness + 0.3 * validity + 0.2 * consistency),
|
|
||||||
1,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Silencia warnings sobre la variable de tipo no usada.
|
# --- score agregado con renormalizacion ---------------------------
|
||||||
_ = is_numeric
|
applicable = ["completeness"]
|
||||||
|
num = _W_COMPLETENESS * completeness
|
||||||
|
den = _W_COMPLETENESS
|
||||||
|
if validity is not None:
|
||||||
|
applicable.append("validity")
|
||||||
|
num += _W_VALIDITY * validity
|
||||||
|
den += _W_VALIDITY
|
||||||
|
score = round(100.0 * num / den, 1) if den > 0 else 0.0
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"score": score,
|
"score": score,
|
||||||
"completeness": completeness,
|
"completeness": completeness,
|
||||||
"validity": validity,
|
"validity": validity,
|
||||||
"consistency": consistency,
|
"dimensions": {"completeness": completeness, "validity": validity},
|
||||||
|
"applicable": applicable,
|
||||||
"issues": issues,
|
"issues": issues,
|
||||||
|
"observations": observations,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _pct(frac: float) -> str:
|
||||||
|
"""Formatea una fraccion 0-1 como porcentaje honesto: «N%» si >=1%, «0.N%»
|
||||||
|
por debajo (para no mostrar «0%» cuando hay un defecto real pequeño)."""
|
||||||
|
p = frac * 100.0
|
||||||
|
if p >= 1.0:
|
||||||
|
return f"{round(p)}%"
|
||||||
|
return f"{p:.1f}%"
|
||||||
|
|
||||||
|
|
||||||
|
def _num(x, default):
|
||||||
|
"""Convierte x a float; devuelve `default` si es None o no parseable."""
|
||||||
|
if x is None:
|
||||||
|
return default
|
||||||
|
if isinstance(x, bool):
|
||||||
|
return default
|
||||||
|
try:
|
||||||
|
return float(x)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
def _clamp(x: float, lo: float, hi: float) -> float:
|
def _clamp(x: float, lo: float, hi: float) -> float:
|
||||||
"""Recorta x al rango [lo, hi]."""
|
"""Recorta x al rango [lo, hi]."""
|
||||||
if x < lo:
|
if x < lo:
|
||||||
|
|||||||
@@ -1,4 +1,12 @@
|
|||||||
"""Tests para column_quality_score."""
|
"""Tests para column_quality_score (nueva fórmula, report 2046).
|
||||||
|
|
||||||
|
Verifica las invariantes de la fórmula de calidad:
|
||||||
|
- completeness (0.6) + validity (0.4) con renormalización por aplicabilidad.
|
||||||
|
- Los OUTLIERS no bajan el score (van a observations, no a issues).
|
||||||
|
- Columnas constantes e ids no bajan el score (observations).
|
||||||
|
- Sin doble conteo de la falta de datos.
|
||||||
|
- all-null -> score 0; función pura (no muta el input).
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -9,11 +17,11 @@ from column_quality_score import column_quality_score
|
|||||||
|
|
||||||
|
|
||||||
def _clean_numeric_col() -> dict:
|
def _clean_numeric_col() -> dict:
|
||||||
"""ColumnProfile de una columna numerica sana, sin problemas."""
|
"""ColumnProfile de una columna numérica nativa sana, sin problemas."""
|
||||||
return {
|
return {
|
||||||
"name": "edad",
|
"name": "edad",
|
||||||
"physical_type": "INTEGER",
|
"physical_type": "INTEGER",
|
||||||
"inferred_type": "integer",
|
"inferred_type": "numeric",
|
||||||
"semantic_type": "",
|
"semantic_type": "",
|
||||||
"count": 1000,
|
"count": 1000,
|
||||||
"n_rows": 1000,
|
"n_rows": 1000,
|
||||||
@@ -28,85 +36,163 @@ def _clean_numeric_col() -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Golden
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
def test_clean_column_high_score():
|
def test_clean_column_high_score():
|
||||||
out = column_quality_score(_clean_numeric_col())
|
out = column_quality_score(_clean_numeric_col())
|
||||||
assert out["score"] > 90
|
assert out["score"] == 100.0
|
||||||
assert out["completeness"] == 1.0
|
assert out["completeness"] == 1.0
|
||||||
assert out["validity"] == 1.0
|
assert out["validity"] == 1.0
|
||||||
assert out["consistency"] == 1.0
|
assert out["applicable"] == ["completeness", "validity"]
|
||||||
assert out["issues"] == []
|
assert out["issues"] == []
|
||||||
|
assert out["observations"] == []
|
||||||
|
|
||||||
|
|
||||||
def test_half_null_lowers_completeness_and_score():
|
def test_weights_60_40_native_type():
|
||||||
|
"""30% nulos en numérica nativa: score = 100*(0.6*0.7 + 0.4*1.0) = 82."""
|
||||||
col = _clean_numeric_col()
|
col = _clean_numeric_col()
|
||||||
col["null_count"] = 500
|
col["null_pct"] = 0.30
|
||||||
col["null_pct"] = 0.5
|
col["null_count"] = 300
|
||||||
clean_score = column_quality_score(_clean_numeric_col())["score"]
|
|
||||||
out = column_quality_score(col)
|
out = column_quality_score(col)
|
||||||
assert out["completeness"] == 0.5
|
assert out["completeness"] == 0.7
|
||||||
assert out["score"] < clean_score
|
assert out["validity"] == 1.0
|
||||||
assert any("nulos" in issue for issue in out["issues"])
|
assert out["score"] == 82.0
|
||||||
|
assert any("nulos" in i for i in out["issues"])
|
||||||
|
|
||||||
|
|
||||||
def test_constant_column_flags_issue():
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Outliers FUERA del score
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_outliers_do_not_penalize_score():
|
||||||
|
"""Columna con outliers pero sin nulos -> score máximo; outliers en observations."""
|
||||||
|
col = _clean_numeric_col()
|
||||||
|
col["numeric"] = {"outlier_pct": 18.0, "skew": 0.2} # 18% atípicos (escala 0-100)
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["score"] == 100.0 # los outliers NO bajan la calidad
|
||||||
|
assert out["validity"] == 1.0
|
||||||
|
# No aparecen como problema de calidad...
|
||||||
|
assert not any("atípic" in i or "outlier" in i for i in out["issues"])
|
||||||
|
# ...sino como observación analítica.
|
||||||
|
assert any("atípic" in o for o in out["observations"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_nulls_lower_score_more_than_outliers():
|
||||||
|
"""Vacíos sí penalizan; outliers no: comparar las dos columnas."""
|
||||||
|
con_nulos = _clean_numeric_col()
|
||||||
|
con_nulos["null_pct"] = 0.30
|
||||||
|
con_outliers = _clean_numeric_col()
|
||||||
|
con_outliers["numeric"] = {"outlier_pct": 30.0}
|
||||||
|
assert column_quality_score(con_nulos)["score"] < \
|
||||||
|
column_quality_score(con_outliers)["score"]
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Validity: aplicabilidad y renormalización
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_validity_from_parse_rate_lowers_score():
|
||||||
|
"""Numérica como texto con 20% basura: validity=0.8 -> score=92."""
|
||||||
|
col = {
|
||||||
|
"name": "precio_txt", "inferred_type": "numeric", "semantic_type": "decimal",
|
||||||
|
"null_pct": 0.0, "validity_rate": 0.80, "flags": [], "numeric": None,
|
||||||
|
}
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["validity"] == 0.8
|
||||||
|
assert out["score"] == 92.0 # 100*(0.6 + 0.4*0.8)
|
||||||
|
assert any("no parsea" in i for i in out["issues"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_validity_from_match_rate():
|
||||||
|
"""Texto con semantic_type y 5% no conforme: validity=0.95."""
|
||||||
|
col = {
|
||||||
|
"name": "email", "inferred_type": "text", "semantic_type": "email",
|
||||||
|
"null_pct": 0.0, "match_rate": 0.95, "unique_pct": 0.5, "flags": [],
|
||||||
|
}
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["validity"] == 0.95
|
||||||
|
assert out["score"] == 98.0 # 100*(0.6 + 0.4*0.95)
|
||||||
|
assert any("no casa" in i for i in out["issues"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_free_text_renormalizes_to_completeness_only():
|
||||||
|
"""Texto libre sin semántica: validity no aplica -> score = 100*completeness."""
|
||||||
|
col = {
|
||||||
|
"name": "comentario", "inferred_type": "text", "semantic_type": "",
|
||||||
|
"null_pct": 0.30, "unique_pct": 0.5, "flags": [], "numeric": None,
|
||||||
|
}
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["validity"] is None
|
||||||
|
assert out["applicable"] == ["completeness"]
|
||||||
|
assert out["completeness"] == 0.7
|
||||||
|
assert out["score"] == 70.0 # renormalizado a solo completeness
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Casos límite (report §4.6)
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
def test_all_null_column_scores_zero():
|
||||||
|
col = _clean_numeric_col()
|
||||||
|
col["null_pct"] = 1.0
|
||||||
|
col["null_count"] = 1000
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["completeness"] == 0.0
|
||||||
|
assert out["validity"] is None # no medible sin valores no nulos
|
||||||
|
assert out["score"] == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_constant_column_scores_full_and_is_observation():
|
||||||
|
"""Columna constante: dato válido y completo -> score 100; baja info = observación."""
|
||||||
col = _clean_numeric_col()
|
col = _clean_numeric_col()
|
||||||
col["flags"] = ["constant"]
|
col["flags"] = ["constant"]
|
||||||
col["distinct_count"] = 1
|
col["distinct_count"] = 1
|
||||||
col["unique_pct"] = 0.001
|
col["unique_pct"] = 0.001
|
||||||
out = column_quality_score(col)
|
out = column_quality_score(col)
|
||||||
assert out["consistency"] == 0.3
|
assert out["score"] == 100.0 # NO se castiga la baja informatividad
|
||||||
assert any("constante" in issue for issue in out["issues"])
|
assert not any("constante" in i for i in out["issues"])
|
||||||
|
assert any("constante" in o for o in out["observations"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_high_cardinality_id_scores_full_and_is_observation():
|
||||||
|
"""Id de alta cardinalidad: unicidad perfecta -> score 100; posible id = observación."""
|
||||||
|
col = {
|
||||||
|
"name": "uuid", "inferred_type": "text", "semantic_type": "",
|
||||||
|
"null_pct": 0.0, "unique_pct": 0.99, "flags": ["possible_id"],
|
||||||
|
"numeric": None,
|
||||||
|
}
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["score"] == 100.0
|
||||||
|
assert not any("identificador" in i for i in out["issues"])
|
||||||
|
assert any("identificador" in o for o in out["observations"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_mostly_null_no_double_counts_validity():
|
||||||
|
"""85% nulos: solo completeness penaliza; validity nativa sigue 1.0 (sin doble castigo)."""
|
||||||
|
col = _clean_numeric_col()
|
||||||
|
col["null_pct"] = 0.85
|
||||||
|
col["flags"] = ["mostly_null"]
|
||||||
|
out = column_quality_score(col)
|
||||||
|
assert out["validity"] == 1.0 # ya no se multiplica por 0.5
|
||||||
|
# score = 100*(0.6*0.15 + 0.4*1.0) = 49
|
||||||
|
assert out["score"] == 49.0
|
||||||
|
assert not any("mayoritariamente" in o for o in out["observations"])
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Robustez
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
def test_empty_dict_does_not_crash():
|
def test_empty_dict_does_not_crash():
|
||||||
out = column_quality_score({})
|
out = column_quality_score({})
|
||||||
assert isinstance(out["score"], float)
|
assert isinstance(out["score"], float)
|
||||||
assert out["completeness"] == 1.0
|
assert out["completeness"] == 1.0
|
||||||
assert 0.0 <= out["score"] <= 100.0
|
assert 0.0 <= out["score"] <= 100.0
|
||||||
assert isinstance(out["issues"], list)
|
assert isinstance(out["issues"], list)
|
||||||
|
assert isinstance(out["observations"], list)
|
||||||
|
|
||||||
def test_outliers_penalize_validity():
|
|
||||||
col = _clean_numeric_col()
|
|
||||||
col["numeric"] = {"outlier_pct": 0.2}
|
|
||||||
out = column_quality_score(col)
|
|
||||||
assert out["validity"] < 1.0
|
|
||||||
assert any("outliers" in issue for issue in out["issues"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_mostly_null_flag_halves_validity():
|
|
||||||
col = _clean_numeric_col()
|
|
||||||
col["null_pct"] = 0.85
|
|
||||||
col["flags"] = ["mostly_null"]
|
|
||||||
out = column_quality_score(col)
|
|
||||||
assert out["validity"] == 0.5
|
|
||||||
assert any("mayoritariamente nula" in issue for issue in out["issues"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_high_cardinality_text_flagged_as_id():
|
|
||||||
col = {
|
|
||||||
"name": "uuid",
|
|
||||||
"inferred_type": "text",
|
|
||||||
"semantic_type": "",
|
|
||||||
"null_pct": 0.0,
|
|
||||||
"unique_pct": 0.99,
|
|
||||||
"flags": [],
|
|
||||||
"numeric": None,
|
|
||||||
}
|
|
||||||
out = column_quality_score(col)
|
|
||||||
assert out["consistency"] < 1.0
|
|
||||||
assert any("alta cardinalidad" in issue for issue in out["issues"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_none_values_treated_defensively():
|
def test_none_values_treated_defensively():
|
||||||
col = {
|
col = {
|
||||||
"name": "x",
|
"name": "x", "inferred_type": None, "semantic_type": None,
|
||||||
"inferred_type": None,
|
"null_pct": None, "unique_pct": None, "flags": None, "numeric": None,
|
||||||
"semantic_type": None,
|
|
||||||
"null_pct": None,
|
|
||||||
"unique_pct": None,
|
|
||||||
"flags": None,
|
|
||||||
"numeric": None,
|
|
||||||
}
|
}
|
||||||
out = column_quality_score(col)
|
out = column_quality_score(col)
|
||||||
assert out["completeness"] == 1.0
|
assert out["completeness"] == 1.0
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ name: summarize_table_duckdb
|
|||||||
kind: function
|
kind: function
|
||||||
lang: py
|
lang: py
|
||||||
domain: datascience
|
domain: datascience
|
||||||
version: "1.0.0"
|
version: "1.1.0"
|
||||||
purity: impure
|
purity: impure
|
||||||
signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict"
|
signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict"
|
||||||
description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)."
|
description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)."
|
||||||
@@ -64,6 +64,7 @@ else:
|
|||||||
- **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima).
|
- **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima).
|
||||||
- **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75.
|
- **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75.
|
||||||
- **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`).
|
- **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`).
|
||||||
|
- **`duplicate_rows`/`duplicate_pct` se pueblan push-down** (desde v1.1.0) con `count(*)` sobre `SELECT DISTINCT *` (sin traer filas a RAM): `duplicate_rows = n_rows - filas_distintas`, `duplicate_pct` en fraccion 0-1. Habilitan la dimension de unicidad de registro del score de dataset (`profile_table` paso 6). Si la tabla tiene tipos no comparables con `DISTINCT` (BLOB/LIST/MAP) la query degrada y ambas vuelven a `None` (renormaliza el score a solo `cell_quality`).
|
||||||
- **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica).
|
- **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica).
|
||||||
- **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage).
|
- **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage).
|
||||||
- **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base.
|
- **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base.
|
||||||
|
|||||||
@@ -196,6 +196,21 @@ def summarize_table_duckdb(
|
|||||||
sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
|
sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Unicidad de registro: filas duplicadas via COUNT de filas distintas
|
||||||
|
# push-down (DISTINCT *), sin traer filas a RAM. Habilita la dimension
|
||||||
|
# de uniqueness del score de dataset (1 - duplicate_pct). Degrada a None
|
||||||
|
# si la tabla tiene tipos no comparables con DISTINCT (BLOB/LIST/MAP).
|
||||||
|
duplicate_rows = None
|
||||||
|
duplicate_pct = None
|
||||||
|
if n_rows > 0:
|
||||||
|
dup_res = duckdb_query_readonly(
|
||||||
|
db_path, f"SELECT count(*) AS c FROM (SELECT DISTINCT * FROM {quoted})"
|
||||||
|
)
|
||||||
|
if dup_res["status"] == "ok" and dup_res["rows"]:
|
||||||
|
distinct_rows = int(dup_res["rows"][0]["c"])
|
||||||
|
duplicate_rows = max(0, n_rows - distinct_rows)
|
||||||
|
duplicate_pct = duplicate_rows / n_rows # fraccion 0-1
|
||||||
|
|
||||||
profile = {
|
profile = {
|
||||||
"table": table,
|
"table": table,
|
||||||
"source": "duckdb",
|
"source": "duckdb",
|
||||||
@@ -203,8 +218,8 @@ def summarize_table_duckdb(
|
|||||||
"n_rows": n_rows,
|
"n_rows": n_rows,
|
||||||
"n_cols": len(columns),
|
"n_cols": len(columns),
|
||||||
"size_bytes": None,
|
"size_bytes": None,
|
||||||
"duplicate_rows": None,
|
"duplicate_rows": duplicate_rows,
|
||||||
"duplicate_pct": None,
|
"duplicate_pct": duplicate_pct,
|
||||||
"constant_cols": constant_cols,
|
"constant_cols": constant_cols,
|
||||||
"all_null_cols": all_null_cols,
|
"all_null_cols": all_null_cols,
|
||||||
"null_cell_pct": null_cell_pct,
|
"null_cell_pct": null_cell_pct,
|
||||||
|
|||||||
@@ -54,6 +54,30 @@ def test_shape_y_metadatos_tabla(db):
|
|||||||
assert profile["correlations"] is None
|
assert profile["correlations"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_pct_sin_duplicados(db):
|
||||||
|
"""Tabla con todas las filas distintas: duplicate_pct = 0, no None."""
|
||||||
|
profile = summarize_table_duckdb(db, "ventas")["profile"]
|
||||||
|
assert profile["duplicate_rows"] == 0
|
||||||
|
assert profile["duplicate_pct"] == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_pct_con_duplicados(tmp_path):
|
||||||
|
"""Filas repetidas: duplicate_rows/duplicate_pct se pueblan push-down."""
|
||||||
|
path = str(tmp_path / "dups.duckdb")
|
||||||
|
con = duckdb.connect(path)
|
||||||
|
con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)")
|
||||||
|
# 5 filas, 2 de ellas idénticas a otras -> 2 duplicadas sobre 5 = 0.4.
|
||||||
|
con.execute(
|
||||||
|
"INSERT INTO t VALUES "
|
||||||
|
"(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')"
|
||||||
|
)
|
||||||
|
con.close()
|
||||||
|
profile = summarize_table_duckdb(path, "t")["profile"]
|
||||||
|
assert profile["n_rows"] == 5
|
||||||
|
assert profile["duplicate_rows"] == 2
|
||||||
|
assert profile["duplicate_pct"] == 0.4
|
||||||
|
|
||||||
|
|
||||||
def test_column_profile_shape(db):
|
def test_column_profile_shape(db):
|
||||||
profile = summarize_table_duckdb(db, "ventas")["profile"]
|
profile = summarize_table_duckdb(db, "ventas")["profile"]
|
||||||
by_name = {c["name"]: c for c in profile["columns"]}
|
by_name = {c["name"]: c for c in profile["columns"]}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ kind: pipeline
|
|||||||
lang: py
|
lang: py
|
||||||
domain: pipelines
|
domain: pipelines
|
||||||
purity: impure
|
purity: impure
|
||||||
version: "1.0.0"
|
version: "1.1.0"
|
||||||
signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict"
|
signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict"
|
||||||
description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla."
|
description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla."
|
||||||
tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries]
|
tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries]
|
||||||
@@ -114,3 +114,12 @@ para auditar la calidad de una tabla ya productiva. Reemplaza orquestar a mano
|
|||||||
Formatos exoticos pueden descartarse silenciosamente del calculo numerico.
|
Formatos exoticos pueden descartarse silenciosamente del calculo numerico.
|
||||||
- `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el
|
- `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el
|
||||||
sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red).
|
sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red).
|
||||||
|
- **Score de calidad (report 2046, desde v1.1.0).** Paso 5: cada columna recibe
|
||||||
|
`quality_score` de `column_quality_score` con la formula 60/40
|
||||||
|
(completeness/validity); al promocionar texto a numero/fecha se expone
|
||||||
|
`col["validity_rate"]` (parse rate de la muestra) para alimentar la dimension
|
||||||
|
validity. Paso 6: el score de dataset NO es la media simple — es
|
||||||
|
`100 * (0.85*cell_quality + 0.15*row_uniqueness)`, donde
|
||||||
|
`cell_quality = media(score_col/100)` y `row_uniqueness = 1 - duplicate_pct`.
|
||||||
|
Si `duplicate_pct` es `None` (backend sin calcularlo) el score se renormaliza
|
||||||
|
a solo `cell_quality`. Los outliers NO bajan el score (van a `observations`).
|
||||||
|
|||||||
@@ -477,9 +477,18 @@ def profile_table(
|
|||||||
if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
|
if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
|
||||||
col["inferred_type"] = "numeric"
|
col["inferred_type"] = "numeric"
|
||||||
inferred = "numeric"
|
inferred = "numeric"
|
||||||
|
# Tasa de parseo real de la muestra: alimenta la
|
||||||
|
# dimension validity de column_quality_score (fraccion
|
||||||
|
# de valores conformes al tipo numerico promovido).
|
||||||
|
col["validity_rate"] = len(ok) / len(vals)
|
||||||
elif semantic in _DATETIME_SEMANTIC:
|
elif semantic in _DATETIME_SEMANTIC:
|
||||||
col["inferred_type"] = "datetime"
|
col["inferred_type"] = "datetime"
|
||||||
inferred = "datetime"
|
inferred = "datetime"
|
||||||
|
# Tasa de parseo de la muestra a fecha (mismo papel que el
|
||||||
|
# parse rate numerico) para la dimension validity.
|
||||||
|
parsed_dt = [_to_ordinal_days(v) for v in vals]
|
||||||
|
ok_dt = [d for d in parsed_dt if d is not None]
|
||||||
|
col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None
|
||||||
|
|
||||||
# 4) Enriquecer segun el inferred_type final.
|
# 4) Enriquecer segun el inferred_type final.
|
||||||
if inferred == "numeric":
|
if inferred == "numeric":
|
||||||
@@ -506,11 +515,36 @@ def profile_table(
|
|||||||
# 5) Score de calidad por columna.
|
# 5) Score de calidad por columna.
|
||||||
col["quality_score"] = column_quality_score(col).get("score")
|
col["quality_score"] = column_quality_score(col).get("score")
|
||||||
|
|
||||||
# 6) Score agregado de la tabla (media de columnas).
|
# 6) Score agregado de la tabla (report 2046): NO media simple.
|
||||||
|
# cell_quality = media de los scores de columna, en [0,1].
|
||||||
|
# row_uniqueness = 1 - duplicate_pct (unicidad de registro).
|
||||||
|
# score = 100 * (0.85*cell_quality + 0.15*row_uniqueness).
|
||||||
|
# Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular.
|
||||||
scores = [
|
scores = [
|
||||||
c["quality_score"] for c in cols if c.get("quality_score") is not None
|
c["quality_score"] for c in cols if c.get("quality_score") is not None
|
||||||
]
|
]
|
||||||
prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None
|
if scores:
|
||||||
|
cell_quality = (sum(scores) / len(scores)) / 100.0
|
||||||
|
dup_pct = prof.get("duplicate_pct")
|
||||||
|
if dup_pct is not None:
|
||||||
|
try:
|
||||||
|
d = float(dup_pct)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
d = None
|
||||||
|
else:
|
||||||
|
d = None
|
||||||
|
if d is not None:
|
||||||
|
# Tolerar escala 0-100 por si algun backend la entrega asi.
|
||||||
|
if d > 1.0:
|
||||||
|
d = d / 100.0
|
||||||
|
row_uniqueness = max(0.0, min(1.0, 1.0 - d))
|
||||||
|
prof["quality_score"] = round(
|
||||||
|
100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
prof["quality_score"] = round(100.0 * cell_quality, 1)
|
||||||
|
else:
|
||||||
|
prof["quality_score"] = None
|
||||||
|
|
||||||
# 7) Candidatos a clave.
|
# 7) Candidatos a clave.
|
||||||
key_candidates = []
|
key_candidates = []
|
||||||
|
|||||||
Reference in New Issue
Block a user