diff --git a/python/functions/datascience/automatic_eda/chapters/calidad.py b/python/functions/datascience/automatic_eda/chapters/calidad.py index dcedcf6f..93ef9423 100644 --- a/python/functions/datascience/automatic_eda/chapters/calidad.py +++ b/python/functions/datascience/automatic_eda/chapters/calidad.py @@ -1,22 +1,26 @@ """Data-quality chapter (CALIDAD) for AutomaticEDA. Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The -chapter answers, in Spanish and as tables, the three things the user asked for: +chapter implements the quality model of report 2046: -1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and - their weights (completeness, validity, consistency) before any number, plus a - table-level summary (global score and aggregates). +1. **En qué se basa la calidad** — an intro paragraph explaining the two scored + dimensions and their weights (completitud 60%, validez 40%) plus the + table-level row uniqueness, BEFORE any number, and stating explicitly that + outliers are reported as observations and do **not** lower the score. The + criteria terms (calidad de datos, completitud, validez, unicidad de registro) + are hooked into the shared glossary as clickable jumps. 2. **Scores por columna** — a table with, per column, the total quality score and - its breakdown into completeness / validity / consistency. -3. **Problemas en español** — a second table listing, per column, the readable - issues in Spanish (kept separate from the type ``flags``). + its breakdown into completeness / validity (no consistency dimension). +3. **Problemas de calidad** — a table listing ONLY real quality defects + (nulls, empty cells, values not conforming to their type/semantics). +4. **Observaciones analíticas** — a SEPARATE table for outliers, constant + columns, high-cardinality ids and strong skew, with an explicit note that + these do not affect the score. -The breakdown and the issues are NOT recomputed here: they come from the registry -function ``column_quality_score`` (group ``eda``), which already derives -``{score, completeness, validity, consistency, issues}`` from the ColumnProfile. -This chapter is render-only — it consumes that function and lays the result out -as model blocks; the renderers paginate tables (splitting by rows, repeating the -header) and wrap long cells so nothing is ever cut. +The breakdown, issues and observations are NOT recomputed here: they come from +the registry function ``column_quality_score`` (group ``eda``), which derives +``{score, completeness, validity, dimensions, applicable, issues, +observations}`` from the ColumnProfile. This chapter is render-only. Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". """ @@ -33,28 +37,47 @@ try: # pragma: no cover - import wiring except Exception: # noqa: BLE001 - never let an import error abort the document. _column_quality_score = None -CHAPTER_VERSION = "1.0.0" +CHAPTER_VERSION = "2.0.0" CHAPTER_ID = "calidad" CHAPTER_TITLE = "Calidad" -# Weights mirror column_quality_score: completeness 0.5, validity 0.3, -# consistency 0.2. Kept here only to render the human explanation; the actual -# numbers always come from the function so the two never drift in computation. -_CRITERIA_INTRO = ( - "La calidad de cada columna es un score de 0 a 100 que combina tres " - "criterios, cada uno con un peso:\n\n" - "- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos " - "ni vacíos). Una columna con muchos nulos baja de score.\n" - "- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango " - "esperado (penaliza outliers y semánticas declaradas que no coinciden).\n" - "- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza " - "columnas constantes o identificadores de cardinalidad muy alta).\n\n" - "Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). " - "Los problemas detectados por columna se listan en español más abajo." -) +# Glossary terms this chapter explains (report 2046 §6). Registered in the shared +# collector and marked clickable on their first appearance (contract §11.1). +_TERMS = { + "calidad_datos": ( + "Calidad de datos (score 0-100)", + "Mide hasta qué punto los datos están presentes y son utilizables tal " + "cual, no si son «buenos para el análisis». Se compone solo de " + "dimensiones medibles automáticamente desde el perfil de la tabla, sin " + "fuente externa de verdad: completitud (60%), validez (40%, cuando es " + "medible) y, a nivel de tabla, unicidad de registro. Los valores " + "atípicos NO bajan la calidad: se listan aparte como observaciones.", + ), + "completitud": ( + "Completitud", + "Proporción de valores realmente presentes en una columna (1 − % de " + "nulos; en texto, las celdas vacías también cuentan como faltantes). Los " + "nulos y vacíos bajan el score porque falta información que debería " + "estar. Pesa el 60% del score de columna.", + ), + "validez": ( + "Validez", + "Proporción de valores que encajan con su tipo o formato esperado: un " + "número que parsea, una fecha legible, un email con forma de email. Los " + "valores que no parsean a su tipo bajan el score. Si la columna es texto " + "libre sin formato esperado, la validez no se puede medir y el score se " + "basa solo en la completitud. Pesa el 40% del score cuando es medible.", + ), + "unicidad_registro": ( + "Unicidad de registro", + "A nivel de tabla, las filas duplicadas restan calidad al conjunto " + "(1 − % de filas duplicadas). Es distinta de que una columna no-clave " + "repita valores, que no es un defecto de calidad.", + ), +} -# Cap for the joined issues cell so a single row never grows taller than a page; -# the remainder is summarized as "(+N más)" instead of being silently dropped. +# Cap for the joined cell so a single row never grows taller than a page; the +# remainder is summarized as "(+N más)" instead of being silently dropped. _ISSUES_MAXLEN = 160 @@ -82,12 +105,19 @@ def _fmt_unit_pct(value) -> str: return str(value) +def _fmt_validity(value) -> str: + """Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%.""" + if value is None: + return "n/a" + return _fmt_unit_pct(value) + + def _quality_of(col: dict) -> dict: - """Return ``{score, completeness, validity, consistency, issues}`` for a column. + """Return the quality dict for a column. Uses the registry ``column_quality_score`` when available; otherwise falls back to the per-column ``quality_score`` already in the profile (number only, - empty breakdown/issues). Never raises. + empty breakdown/issues/observations). Never raises. """ if not isinstance(col, dict): col = {} @@ -98,26 +128,25 @@ def _quality_of(col: dict) -> dict: return res except Exception: # noqa: BLE001 - degrade instead of aborting. pass - # Fallback: only the final score is available pre-computed in the profile. return { "score": col.get("quality_score"), "completeness": None, "validity": None, - "consistency": None, "issues": [], + "observations": [], } -def _join_issues(issues) -> str: - """Join Spanish issue strings into one cell, truncating overly long lists. +def _join_cells(items) -> str: + """Join Spanish strings into one cell, truncating overly long lists. - The renderer wraps cell text, but a column with many long issues could make a - single row taller than a whole page; cap the length and append ``(+N más)`` - so the count of hidden issues is honest rather than silently lost. + The renderer wraps cell text, but a column with many long entries could make + a single row taller than a whole page; cap the length and append ``(+N más)`` + so the count of hidden entries is honest rather than silently lost. """ - if not isinstance(issues, (list, tuple)) or not issues: + if not isinstance(items, (list, tuple)) or not items: return "" - parts = [model._safe_str(i).strip() for i in issues] + parts = [model._safe_str(i).strip() for i in items] parts = [p for p in parts if p] if not parts: return "" @@ -142,6 +171,33 @@ def _columns_with_quality(profile: dict): yield c, _quality_of(c) +def _fmt_unit_pct_or_pct(value) -> str: + """Format a value that may be a 0-1 fraction or an already-0-100 percentage.""" + try: + num = float(value) + except (TypeError, ValueError): + return model._safe_str(value) + if num != num: # NaN + return "—" + pct = num * 100 if num <= 1.0 else num + text = f"{pct:.1f}".rstrip("0").rstrip(".") + return f"{text}%" + + +def _row_uniqueness(profile: dict): + """Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown.""" + dup = profile.get("duplicate_pct") + if dup is None: + return None + try: + d = float(dup) + except (TypeError, ValueError): + return None + if d > 1.0: # tolerate a 0-100 scale + d = d / 100.0 + return max(0.0, min(1.0, 1.0 - d)) + + def _summary_block(profile: dict, evaluated: list): """Table-level KVTable: global score and quality aggregates.""" rows = [] @@ -153,14 +209,15 @@ def _summary_block(profile: dict, evaluated: list): if isinstance(q.get("completeness"), (int, float))] vals = [q.get("validity") for _, q in evaluated if isinstance(q.get("validity"), (int, float))] - cons = [q.get("consistency") for _, q in evaluated - if isinstance(q.get("consistency"), (int, float))] if comps: rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps)))) if vals: - rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals)))) - if cons: - rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons)))) + rows.append(("Validez media (donde aplica)", + _fmt_unit_pct(sum(vals) / len(vals)))) + + ru = _row_uniqueness(profile) + if ru is not None: + rows.append(("Unicidad de registro", _fmt_unit_pct(ru))) n_problem = sum(1 for _, q in evaluated if q.get("issues")) rows.append(("Columnas con problemas", str(n_problem))) @@ -182,22 +239,9 @@ def _summary_block(profile: dict, evaluated: list): return model.KVTable(rows=rows, title="Resumen de calidad") -def _fmt_unit_pct_or_pct(value) -> str: - """Format a value that may be a 0-1 fraction or an already-0-100 percentage.""" - try: - num = float(value) - except (TypeError, ValueError): - return model._safe_str(value) - if num != num: # NaN - return "—" - pct = num * 100 if num <= 1.0 else num - text = f"{pct:.1f}".rstrip("0").rstrip(".") - return f"{text}%" - - def _scores_block(evaluated: list): - """DataTable with per-column score and its three-criteria breakdown.""" - header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"] + """DataTable with per-column score and its completeness/validity breakdown.""" + header = ["Columna", "Calidad", "Completitud", "Validez"] rows = [] # Worst columns first so the reader sees the problems at the top. ordered = sorted( @@ -210,22 +254,22 @@ def _scores_block(evaluated: list): col.get("name") or "(col)", _fmt_score(q.get("score")), _fmt_unit_pct(q.get("completeness")), - _fmt_unit_pct(q.get("validity")), - _fmt_unit_pct(q.get("consistency")), + _fmt_validity(q.get("validity")), ]) if not rows: return None return model.DataTable(header=header, rows=rows, title="Scores de calidad por columna", - note="0 = peor, 100 = mejor; ordenado de peor a mejor") + note="0 = peor, 100 = mejor; «n/a» = dimensión no " + "medible; ordenado de peor a mejor") def _issues_block(evaluated: list): - """DataTable listing Spanish issues per column, or a Note when there are none.""" - header = ["Columna", "Problemas detectados (español)"] + """DataTable listing ONLY real quality defects per column, or a Note.""" + header = ["Columna", "Problemas de calidad (español)"] rows = [] for col, q in evaluated: - joined = _join_issues(q.get("issues")) + joined = _join_cells(q.get("issues")) if joined: rows.append([col.get("name") or "(col)", joined]) if not rows: @@ -235,6 +279,63 @@ def _issues_block(evaluated: list): title="Problemas de calidad por columna") +def _observations_block(evaluated: list): + """DataTable listing analytical observations per column, or None. + + Observations (outliers, constant columns, ids, strong skew) are NOT quality + defects: they do not affect the score. Returned as a separate table from the + issues so the report never presents a legitimate outlier as a problem. + """ + header = ["Columna", "Observaciones analíticas"] + rows = [] + for col, q in evaluated: + joined = _join_cells(q.get("observations")) + if joined: + rows.append([col.get("name") or "(col)", joined]) + if not rows: + return None + return model.DataTable( + header=header, rows=rows, + title="Observaciones analíticas por columna", + note="No son defectos de calidad y NO afectan al score; orientan el " + "análisis (atípicos, columnas constantes, identificadores).") + + +def _term(key: str, label: str, mark: bool) -> str: + """Render a term as a clickable glossary span when marking is enabled.""" + if mark: + return f"[[term:{key}]]**{label}**[[/term]]" + return f"**{label}**" + + +def _criteria_intro(mark: bool) -> str: + """Intro paragraph explaining the two scored dimensions and the principle.""" + calidad = _term("calidad_datos", "calidad de datos", mark) + completitud = _term("completitud", "Completitud (peso 60%)", mark) + validez = _term("validez", "Validez (peso 40%, cuando es medible)", mark) + unicidad = _term("unicidad_registro", "unicidad de registro", mark) + return ( + f"La {calidad} de cada columna es un score de 0 a 100 que combina solo " + "dimensiones medibles desde el perfil de la tabla, sin fuente externa " + "de verdad:\n\n" + f"- {completitud}: proporción de valores presentes (1 − % de nulos; en " + "texto, las celdas vacías cuentan como faltantes). Los nulos y vacíos " + "bajan el score.\n" + f"- {validez}: proporción de valores que encajan con su tipo o formato " + "(un número que parsea, una fecha legible, un email con forma de email). " + "Si una columna es texto libre sin formato esperado, la validez no se " + "mide y el score se basa solo en la completitud.\n\n" + f"Score de columna = 100 × (0,6·completitud + 0,4·validez), " + "renormalizado cuando la validez no aplica. A nivel de tabla se añade " + f"la {unicidad} (1 − % de filas duplicadas).\n\n" + "**Los valores atípicos (outliers) NO bajan la calidad.** Un valor " + "extremo puede ser real y correcto; detectar atípicos es parte del " + "análisis de la distribución, no un juicio de corrección. Por eso, junto " + "con las columnas constantes y los identificadores, se listan aparte " + "como **observaciones analíticas** que no afectan al score." + ) + + def build_calidad(profile: dict, ctx: dict): """Build the data-quality Chapter, or None if the profile has no columns. @@ -250,17 +351,35 @@ def build_calidad(profile: dict, ctx: dict): if not evaluated: return None # no columns to score -> chapter does not apply. + # Register the criteria terms in the shared glossary (if present) and mark + # their first appearance clickable. Contract §11.1. + glossary = ctx.get("glossary") + mark = False + if isinstance(glossary, model.GlossaryCollector): + for key, (label, definition) in _TERMS.items(): + glossary.add(key, label, definition) + mark = True + blocks = [ model.Heading(text="Cómo se calcula la calidad", level=2), - model.Markdown(text=_CRITERIA_INTRO), + model.Markdown(text=_criteria_intro(mark)), _summary_block(profile, evaluated), model.Heading(text="Scores por columna", level=2), ] scores = _scores_block(evaluated) if scores is not None: blocks.append(scores) - blocks.append(model.Heading(text="Problemas detectados", level=2)) + + blocks.append(model.Heading(text="Problemas de calidad", level=2)) blocks.append(_issues_block(evaluated)) + observations = _observations_block(evaluated) + if observations is not None: + blocks.append(model.Heading(text="Observaciones analíticas", level=2)) + blocks.append(model.Note( + "Las observaciones siguientes NO son defectos de calidad y no " + "afectan al score: son señales para orientar el análisis.")) + blocks.append(observations) + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/calidad_test.py b/python/functions/datascience/automatic_eda/chapters/calidad_test.py index 3e6bf5f6..5dc623ee 100644 --- a/python/functions/datascience/automatic_eda/chapters/calidad_test.py +++ b/python/functions/datascience/automatic_eda/chapters/calidad_test.py @@ -1,11 +1,12 @@ -"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut. +"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut + glossary. Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast -and deterministic. Verifies that the chapter explains the quality criteria, shows -per-column scores with the completeness/validity/consistency breakdown, lists the -issues in Spanish (separate from the type flags), returns None when it does not -apply, and that a wide profile with long names renders to PDF and PPTX without -cutting any cell text (long content wraps, it is never truncated). +and deterministic. Verifies the report-2046 quality model: the chapter explains +the two scored dimensions (completitud 60% / validez 40%), shows per-column +scores without a consistency column, keeps quality DEFECTS (issues) separate +from analytical OBSERVATIONS (outliers, constant, ids), hooks the criteria terms +into the glossary, returns None when it does not apply, and renders a wide +profile to PDF and PPTX without cutting any cell text. """ import os @@ -20,28 +21,30 @@ from datascience.automatic_eda.chapters.calidad import ( CHAPTER_VERSION, ) from datascience.automatic_eda import build_document, render_pdf, render_pptx +from datascience.automatic_eda import model def _profile() -> dict: """A small profile with one column per quality problem (nulls, outliers, - constant, high-cardinality id) plus one clean column.""" + constant, high-cardinality id) plus one clean column. ``outlier_pct`` is in + the 0-100 scale that describe_numeric actually emits.""" return { "table": "demo", - "quality_score": 72.5, + "quality_score": 82.0, "duplicate_pct": 0.04, "null_cell_pct": 0.11, "constant_cols": ["flag_const"], "all_null_cols": [], "columns": [ - {"name": "edad", "inferred_type": "integer", "null_pct": 0.2, - "numeric": {"outlier_pct": 0.15, "min": 0, "max": 99}, - "quality_score": 60}, + {"name": "edad", "inferred_type": "numeric", "null_pct": 0.2, + "n_rows": 100, "unique_pct": 0.5, + "numeric": {"outlier_pct": 15.0, "min": 0, "max": 99}}, {"name": "nombre", "inferred_type": "text", "null_pct": 0.0, - "unique_pct": 0.98, "quality_score": 80}, + "unique_pct": 0.98, "flags": ["possible_id"]}, {"name": "flag_const", "inferred_type": "text", "null_pct": 0.0, - "flags": ["constant"], "quality_score": 50}, - {"name": "limpia", "inferred_type": "float", "null_pct": 0.0, - "numeric": {"outlier_pct": 0.0}, "quality_score": 100}, + "unique_pct": 0.01, "flags": ["constant"]}, + {"name": "limpia", "inferred_type": "numeric", "null_pct": 0.0, + "unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}}, ], } @@ -50,16 +53,9 @@ def _tables(chapter): return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"] -def _scores_table(chapter): +def _table_by_title(chapter, needle): for t in _tables(chapter): - if "Scores" in (t.title or ""): - return t - return None - - -def _issues_table(chapter): - for t in _tables(chapter): - if "Problemas" in (t.title or ""): + if needle in (t.title or ""): return t return None @@ -73,41 +69,84 @@ def test_golden_chapter_estructura_y_version(): assert ch.id == "calidad" assert ch.version == CHAPTER_VERSION kinds = [b.kind for b in ch.blocks] - # intro heading + markdown criteria + summary kv + scores table + issues table assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds -def test_golden_intro_explica_criterios_y_pesos(): +def test_golden_intro_explica_dos_dimensiones_y_pesos(): ch = build_calidad(_profile(), {}) intro = [b for b in ch.blocks if b.kind == "markdown"][0].text - for needle in ("Completitud", "Validez", "Consistencia", - "50%", "30%", "20%"): + for needle in ("Completitud", "Validez", "60%", "40%", + "unicidad de registro"): assert needle in intro, f"falta {needle!r} en la intro de criterios" + # El principio: los outliers NO bajan la calidad. + assert "atípicos" in intro and "NO bajan" in intro + # Ya no se menciona la dimensión consistencia eliminada. + assert "20%" not in intro -def test_golden_scores_incluyen_desglose_por_criterio(): +def test_golden_scores_sin_columna_consistencia(): ch = build_calidad(_profile(), {}) - scores = _scores_table(ch) + scores = _table_by_title(ch, "Scores") assert scores is not None - assert scores.header == ["Columna", "Calidad", "Completitud", - "Validez", "Consistencia"] - # 4 columns scored, none dropped. + assert scores.header == ["Columna", "Calidad", "Completitud", "Validez"] + assert "Consistencia" not in scores.header assert len(scores.rows) == 4 names = {r[0] for r in scores.rows} assert names == {"edad", "nombre", "flag_const", "limpia"} -def test_golden_issues_en_espanol_separados_de_flags(): +def test_golden_outliers_en_observaciones_no_en_problemas(): ch = build_calidad(_profile(), {}) - issues = _issues_table(ch) - assert issues is not None - flat = " | ".join(" ".join(r) for r in issues.rows) - assert "nulos" in flat # completeness issue (ES) - assert "outliers" in flat # validity issue (ES) - assert "columna constante" in flat - assert "posible id de alta cardinalidad" in flat - # The raw type flag string must NOT leak as a "problem". - assert "constant" not in flat or "columna constante" in flat + problemas = _table_by_title(ch, "Problemas de calidad") + observaciones = _table_by_title(ch, "Observaciones") + assert problemas is not None + assert observaciones is not None + + problemas_txt = " | ".join(" ".join(r) for r in problemas.rows) + observaciones_txt = " | ".join(" ".join(r) for r in observaciones.rows) + + # Los nulos SÍ son problema de calidad. + assert "nulos" in problemas_txt + # Los outliers NO aparecen como problema... + assert "atípic" not in problemas_txt and "outlier" not in problemas_txt + # ...sino como observación analítica. + assert "atípic" in observaciones_txt + # Constante e id: observaciones, no problemas. + assert "constante" in observaciones_txt + assert "identificador" in observaciones_txt + assert "constante" not in problemas_txt + + +def test_golden_score_columna_limpia_es_100(): + """Columna sin nulos, numérica nativa: score 100 aunque tenga (o no) outliers.""" + ch = build_calidad(_profile(), {}) + scores = _table_by_title(ch, "Scores") + by_name = {r[0]: r for r in scores.rows} + assert by_name["limpia"][1] == "100 / 100" + # edad: 20% nulos -> 100*(0.6*0.8 + 0.4*1.0) = 88; los outliers no bajan nada. + assert by_name["edad"][1] == "88 / 100" + + +# --------------------------------------------------------------------------- # +# Glosario (contrato §11.1) +# --------------------------------------------------------------------------- # +def test_glosario_registra_los_cuatro_terminos_y_marca_clicable(): + glossary = model.GlossaryCollector() + ch = build_calidad(_profile(), {"glossary": glossary}) + for key in ("calidad_datos", "completitud", "validez", "unicidad_registro"): + assert glossary.has(key), f"término {key!r} no registrado en el glosario" + intro = [b for b in ch.blocks if b.kind == "markdown"][0].text + # Con colector presente, la primera aparición se marca clicable. + assert "[[term:completitud]]" in intro + assert "[[term:validez]]" in intro + assert "[[term:calidad_datos]]" in intro + assert "[[term:unicidad_registro]]" in intro + + +def test_sin_glosario_no_marca_terminos(): + ch = build_calidad(_profile(), {}) # ctx sin glossary + intro = [b for b in ch.blocks if b.kind == "markdown"][0].text + assert "[[term:" not in intro # --------------------------------------------------------------------------- # @@ -124,17 +163,17 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota(): prof = { "quality_score": 100, "columns": [ - {"name": "a", "inferred_type": "float", "null_pct": 0.0, - "numeric": {"outlier_pct": 0.0}}, - {"name": "b", "inferred_type": "float", "null_pct": 0.0, - "numeric": {"outlier_pct": 0.0}}, + {"name": "a", "inferred_type": "numeric", "null_pct": 0.0, + "unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}}, + {"name": "b", "inferred_type": "numeric", "null_pct": 0.0, + "unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}}, ], } ch = build_calidad(prof, {}) assert ch is not None - assert _issues_table(ch) is None # no issues table + assert _table_by_title(ch, "Problemas de calidad") is None # no issues table notes = [b for b in ch.blocks if b.kind == "note"] - assert notes and "No se detectaron problemas" in notes[0].text + assert any("No se detectaron problemas" in n.text for n in notes) # --------------------------------------------------------------------------- # @@ -143,44 +182,42 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota(): def _wide_profile(ncols: int = 22) -> dict: cols = [ {"name": "identificador_unico_de_transaccion_con_nombre_muy_largo", - "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99}, + "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99, + "flags": ["possible_id"]}, {"name": "columna_constante_sin_ninguna_variacion_de_valor", - "inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]}, + "inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.01, + "flags": ["constant"]}, ] for k in range(ncols - 2): cols.append({ "name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo", - "inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05, - "numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000}, + "inferred_type": "numeric", "null_pct": 0.1 + (k % 3) * 0.05, + "unique_pct": 0.5, + "numeric": {"outlier_pct": 8.0, "min": 0, "max": 1000}, }) - return {"table": "ancha", "quality_score": 70.0, "columns": cols} + return {"table": "ancha", "quality_score": 70.0, "duplicate_pct": 0.0, + "columns": cols} def test_anticut_pdf_y_pptx_no_truncan_nombres_largos(): prof = _wide_profile(22) full = build_document(prof, {"dataset_name": "ancha"}) assert any(c.id == "calidad" for c in full) - # Render ONLY the calidad chapter so the anti-cut assertions are scoped to - # this chapter (other chapters, e.g. portada, legitimately contain '…'). chapters = [c for c in full if c.id == "calidad"] long_name = "metrica_numerica_de_negocio_00_con_nombre_largo" with tempfile.TemporaryDirectory() as d: pdf = os.path.join(d, "q.pdf") pptx = os.path.join(d, "q.pptx") rp = render_pdf(chapters, pdf, {"title": "EDA"}) - rx = render_pptx(chapters, pptx, {"title": "EDA"}) + render_pptx(chapters, pptx, {"title": "EDA"}) assert os.path.exists(pdf) and os.path.exists(pptx) - # The wide table forces pagination across several pages/slides. assert (rp or {}).get("n_pages", 0) >= 2 - # PDF: the long name survives whole once wraps (spaces/newlines) removed, - # and there is no truncation marker. pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages) assert "…" not in pdf_txt and "..." not in pdf_txt norm = re.sub(r"\s+", "", pdf_txt) assert long_name in norm, "el nombre largo se cortó en el PDF" - # PPTX: long name present in some cell, untruncated. allt = [] for s in Presentation(pptx).slides: for sh in s.shapes: diff --git a/python/functions/datascience/column_quality_score.md b/python/functions/datascience/column_quality_score.md index 2e5fa7af..0a33f60d 100644 --- a/python/functions/datascience/column_quality_score.md +++ b/python/functions/datascience/column_quality_score.md @@ -4,10 +4,10 @@ name: column_quality_score kind: function lang: py domain: datascience -version: "1.0.0" +version: "2.0.0" purity: pure signature: "def column_quality_score(col: dict) -> dict" -description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda, con desglose completeness/validity/consistency y lista de issues legibles. Funcion pura, no muta el input." +description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda. Combina completeness (0.6) y validity (0.4) con renormalizacion por aplicabilidad; los outliers, columnas constantes e ids NO bajan el score (van a observations). Devuelve desglose por dimension, issues (defectos) y observations (señales analiticas). Funcion pura, no muta el input." tags: [eda, data-quality, profiling, scoring, datascience] uses_functions: [] uses_types: [] @@ -17,20 +17,26 @@ error_type: "" imports: [] example: | from datascience import column_quality_score - col = {"name": "precio", "inferred_type": "float", "null_pct": 0.2, - "unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 0.08}} + col = {"name": "precio", "inferred_type": "numeric", "null_pct": 0.2, + "unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 8.0}} column_quality_score(col) - # {"score": 86.8, "completeness": 0.8, "validity": 0.92, - # "consistency": 1.0, "issues": ["20% nulos", "8% outliers"]} + # {"score": 88.0, "completeness": 0.8, "validity": 1.0, + # "applicable": ["completeness", "validity"], "issues": ["20% nulos"], + # "observations": ["8% de valores atípicos (z-score>3): ..."]} tested: true tests: - "test_clean_column_high_score" - - "test_half_null_lowers_completeness_and_score" - - "test_constant_column_flags_issue" + - "test_weights_60_40_native_type" + - "test_outliers_do_not_penalize_score" + - "test_nulls_lower_score_more_than_outliers" + - "test_validity_from_parse_rate_lowers_score" + - "test_validity_from_match_rate" + - "test_free_text_renormalizes_to_completeness_only" + - "test_all_null_column_scores_zero" + - "test_constant_column_scores_full_and_is_observation" + - "test_high_cardinality_id_scores_full_and_is_observation" + - "test_mostly_null_no_double_counts_validity" - "test_empty_dict_does_not_crash" - - "test_outliers_penalize_validity" - - "test_mostly_null_flag_halves_validity" - - "test_high_cardinality_text_flagged_as_id" - "test_none_values_treated_defensively" - "test_does_not_mutate_input" test_file_path: "python/functions/datascience/column_quality_score_test.py" @@ -38,16 +44,22 @@ file_path: "python/functions/datascience/column_quality_score.py" params: - name: col desc: > - ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb). - Se leen sus claves de forma defensiva con .get(...) y se toleran valores - None. Claves usadas: null_pct (0-1), inferred_type, semantic_type, - unique_pct (0-1), flags (list[str], reconoce "constant"/"mostly_null"), - numeric ({outlier_pct: 0-1, ...}|None) y match_rate (opcional, 0-1). + ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb / + profile_table). Se leen sus claves de forma defensiva con .get(...) y se + toleran valores None. Claves usadas: null_pct (0-1), n_rows, empty_count + (texto), inferred_type, semantic_type, validity_rate (0-1, lo expone + profile_table al promocionar texto a numero/fecha), match_rate (0-1), + unique_pct (0-1), flags (list[str], reconoce + "constant"/"possible_id"/"high_cardinality") y numeric ({outlier_pct: 0-100, + skew, ...}|None). output: > - dict con score (float 0-100, redondeado a 1 decimal), completeness (0-1), - validity (0-1), consistency (0-1) e issues (list[str] de descripciones - legibles de los problemas detectados). score = round(100 * (0.5*completeness - + 0.3*validity + 0.2*consistency), 1). + dict con score (float 0-100, 1 decimal), completeness (0-1), validity (0-1 o + None si no aplicable), dimensions ({completeness, validity}), applicable + (list[str] de dimensiones que entraron en el score), issues (list[str] SOLO de + defectos de calidad: nulos, vacios, valores no conformes) y observations + (list[str] de señales analiticas que NO bajan el score: outliers, columna + constante, posible id, asimetria). score = round(100 * (0.6*completeness + + 0.4*validity) / pesos_aplicables, 1), renormalizado cuando validity no aplica. --- ## Ejemplo @@ -59,51 +71,71 @@ from datascience import column_quality_score col = { "name": "precio", "physical_type": "DOUBLE", - "inferred_type": "float", + "inferred_type": "numeric", "semantic_type": "", - "count": 800, "n_rows": 1000, "null_count": 200, "null_pct": 0.20, "distinct_count": 400, "unique_pct": 0.40, "flags": [], - "numeric": {"outlier_pct": 0.08}, + "numeric": {"outlier_pct": 8.0, "skew": 0.3}, "categorical": None, "datetime": None, } column_quality_score(col) # { -# "score": 86.8, -# "completeness": 0.8, # 1 - 0.20 -# "validity": 0.92, # 1 - min(0.08, 0.3) -# "consistency": 1.0, -# "issues": ["20% nulos", "8% outliers"], +# "score": 88.0, # 100 * (0.6*0.8 + 0.4*1.0) +# "completeness": 0.8, # 1 - 0.20 +# "validity": 1.0, # numerica nativa: el tipo es conforme +# "dimensions": {"completeness": 0.8, "validity": 1.0}, +# "applicable": ["completeness", "validity"], +# "issues": ["20% nulos"], # SOLO defectos de calidad +# "observations": ["8% de valores atípicos (z-score>3): ..."], # NO bajan score # } ``` ## Cuando usarla Cuando hayas perfilado una tabla con el grupo `eda` (p.ej. -`summarize_table_duckdb`) y necesites un numero 0-100 por columna para -ordenar/priorizar limpieza de datos, pintar semaforos de calidad en un -dashboard, o decidir que columnas descartar antes de modelar. Es la capa de -scoring sobre el ColumnProfile crudo: lee el perfil, no toca los datos. +`summarize_table_duckdb` / `profile_table`) y necesites un numero 0-100 por +columna para ordenar/priorizar limpieza de datos, pintar semaforos de calidad, +o decidir que columnas descartar antes de modelar. Separa los **defectos de +calidad reales** (`issues`: nulos, vacios, valores que no parsean a su tipo) de +las **observaciones analiticas** (`observations`: outliers, columnas constantes, +ids), que se reportan pero no penalizan. Es la capa de scoring sobre el +ColumnProfile crudo: lee el perfil, no toca los datos. -## Notas +## Gotchas -Funcion pura, sin I/O ni dependencias externas, no muta `col`. Lee todas las -claves con `.get(...)` y tolera que vengan en `None` (un ColumnProfile recien -salido de `summarize_table_duckdb` trae muchas claves a `None`), por lo que -nunca falla por claves ausentes — un `{}` produce un resultado bien definido. +Funcion pura, sin I/O, no muta `col`. Aun asi conviene saber: -Pesos del score: completeness 0.5, validity 0.3, consistency 0.2. +- **Los outliers NO bajan el score.** Un valor extremo puede ser real y correcto + (un cliente que compra mucho); detectar atipicos es analisis de la + distribucion, no un juicio de correccion. Salen en `observations`, no en + `issues`. Mismo trato para columnas constantes e identificadores de alta + cardinalidad: son observaciones, no defectos. +- **`validity` puede ser `None`** (no aplicable): texto libre sin `semantic_type` + ni `validity_rate`, o columna 100% nula. En ese caso el score se renormaliza a + solo `completeness` (la columna no se premia ni castiga por algo no medible). +- **`outlier_pct` se interpreta en escala 0-100** (la que emite + `describe_numeric`, z-score>3). Pasar una fraccion 0-1 produce un texto de + observacion con el % equivocado, pero NUNCA afecta al score. +- **`validity_rate` lo puebla `profile_table`** al promocionar una columna de + texto a numero/fecha (fraccion que parsea). Si no esta presente y el tipo es + nativo numerico/fecha/bool, `validity = 1.0`. +- Sin doble conteo: la falta de datos cuenta solo en `completeness` (el antiguo + castigo de `mostly_null` sobre `validity` se elimino). -- **completeness** = `1 - null_pct` (None -> 0 nulls -> 1.0). -- **validity**: parte de 1.0 y penaliza `min(outlier_pct, 0.3)` en columnas - numericas, `0.5 * (1 - match_rate)` si hay `semantic_type` declarado con - `match_rate` bajo disponible, y multiplica por 0.5 si el flag `mostly_null` - esta presente. -- **consistency**: 1.0 salvo flag `constant` (-> 0.3, columna poco informativa) - o texto con `unique_pct > 0.9` (-> 0.6, posible id de alta cardinalidad). +## Capability growth log + +- v2.0.0 (2026-06-30) — nueva formula de calidad (report 2046): pesos 60/40 + (completeness/validity) con renormalizacion por aplicabilidad; se elimina la + dimension `consistency`-como-informatividad y el doble castigo de + `mostly_null`; los outliers/constantes/ids salen del score a `observations`; + validity mide conformidad real (parse rate / match rate / tipo nativo). Salida + ampliada con `dimensions`, `applicable` y `observations`. +- v1.0.0 — version inicial: pesos 50/30/20 (completeness/validity/consistency), + los outliers penalizaban validity (con bug de escala) y consistency penalizaba + informatividad. diff --git a/python/functions/datascience/column_quality_score.py b/python/functions/datascience/column_quality_score.py index d16e9849..11d83988 100644 --- a/python/functions/datascience/column_quality_score.py +++ b/python/functions/datascience/column_quality_score.py @@ -1,34 +1,78 @@ """Score de calidad de datos (0-100) para un ColumnProfile del grupo eda. Funcion pura: dado el perfil de una columna producido por el grupo de -capacidad `eda` (p.ej. summarize_table_duckdb), calcula un score agregado -de calidad junto a su desglose en completeness / validity / consistency y -una lista de issues legibles. No realiza I/O ni muta el input. +capacidad `eda` (p.ej. summarize_table_duckdb / profile_table), calcula un +score agregado de calidad junto a su desglose por dimension y dos listas +legibles separadas: `issues` (defectos de calidad reales que SI bajan el +score) y `observations` (señales analiticas que NO bajan el score). No +realiza I/O ni muta el input. + +Modelo (DAMA-DMBOK / ISO 8000), ver report 2046: + +- Solo entran en el score las dimensiones medibles automaticamente desde el + perfil, sin fuente externa de verdad: completeness y validity por columna. +- Renormalizacion por aplicabilidad: si una dimension no es medible en la + columna (texto libre sin semantica -> validity no aplica; columna 100% nula + -> validity no medible), se excluye y los pesos se renormalizan sobre las + aplicables. Una columna ni se premia ni se castiga por algo no medible. +- Sin doble conteo: la falta de datos cuenta solo en completeness (se elimino + el antiguo castigo extra de `mostly_null` sobre validity). +- Los OUTLIERS NO bajan la calidad. Un valor extremo puede ser real y + correcto; detectar atipicos es analisis de la distribucion, no un juicio de + coreccion. Outliers, columnas constantes e identificadores de alta + cardinalidad pasan a `observations`, nunca a `issues`. """ +# Pesos base de las dimensiones de columna (se renormalizan por aplicabilidad). +_W_COMPLETENESS = 0.6 +_W_VALIDITY = 0.4 + +# Tipos inferidos cuyo almacen garantiza la conformidad de tipo (validity=1.0) +# cuando NO vienen de una promocion de texto (en cuyo caso manda validity_rate). +_NATIVE_TYPED = ("numeric", "integer", "float", "datetime", "date", "boolean", "bool") + + def column_quality_score(col: dict) -> dict: """Calcula un score de calidad de datos 0-100 para un ColumnProfile. - El score pondera tres dimensiones: - - completeness (0.5): proporcion de valores no nulos. - - validity (0.3): ausencia de outliers / heuristicas de validez. - - consistency (0.2): la columna aporta informacion (no constante, no ruido). + El score combina solo dimensiones de calidad medibles desde el perfil, con + renormalizacion por aplicabilidad: + + - completeness (peso base 0.6, siempre aplica): proporcion de valores + presentes = 1 - null_pct. En texto, las celdas vacias (`empty_count`) + tambien cuentan como faltantes. + - validity (peso base 0.4, cuando hay un criterio de validacion real): + fraccion de valores no nulos conformes a su tipo/semantica. Tipo nativo + numerico/fecha/bool = 1.0; texto promovido a numero/fecha = parse rate + (`validity_rate`); texto con `semantic_type` regexable = `match_rate`; + texto libre o columna 100% nula = NO aplicable (renormaliza a solo + completeness). + + Los outliers, columnas constantes, identificadores y asimetria fuerte NO + bajan el score: se devuelven en `observations`. Args: col: ColumnProfile dict del grupo eda. Se leen las claves de forma defensiva con .get(...) y se tolera que muchas vengan en None. - Claves relevantes: null_pct, inferred_type, semantic_type, - unique_pct, flags (list[str]), numeric ({outlier_pct, ...}|None), - match_rate (opcional). + Claves relevantes: null_pct (0-1), n_rows, empty_count, + inferred_type, semantic_type, validity_rate (0-1, lo expone + profile_table al promocionar texto a numero/fecha), match_rate + (0-1), unique_pct (0-1), flags (list[str], reconoce + "constant"/"possible_id"/"high_cardinality"), numeric + ({outlier_pct: 0-100, skew, ...}|None). Returns: dict con: - score (float, 0-100, redondeado a 1 decimal), - completeness (float, 0-1), - validity (float, 0-1), - consistency (float, 0-1), - issues (list[str]) descripciones legibles de los problemas. + score (float 0-100, redondeado a 1 decimal), + completeness (float 0-1), + validity (float 0-1 | None si no aplicable), + dimensions ({completeness, validity}), + applicable (list[str] de dimensiones que entraron en el score), + issues (list[str]) SOLO defectos de calidad (nulos, vacios, + valores no conformes a su tipo/semantica), + observations (list[str]) señales analiticas que NO bajan el score + (outliers, columna constante, posible id, asimetria). """ if not isinstance(col, dict): col = {} @@ -39,103 +83,153 @@ def column_quality_score(col: dict) -> dict: flags = set(flags) issues: list[str] = [] + observations: list[str] = [] + + inferred_type = col.get("inferred_type") or "" + semantic_type = col.get("semantic_type") or "" # --- completeness ------------------------------------------------- - null_pct = col.get("null_pct") - if null_pct is None: - null_pct = 0.0 - try: - null_pct = float(null_pct) - except (TypeError, ValueError): - null_pct = 0.0 - null_pct = _clamp(null_pct, 0.0, 1.0) + # Falta de datos = nulos + (en texto) celdas vacias. Es el unico sitio + # donde la falta de datos cuenta: nunca se duplica en validity. + null_pct = _clamp(_num(col.get("null_pct"), 0.0), 0.0, 1.0) completeness = 1.0 - null_pct if null_pct > 0: - issues.append(f"{round(null_pct * 100)}% nulos") + issues.append(f"{_pct(null_pct)} nulos") - # --- validity ----------------------------------------------------- - validity = 1.0 - inferred_type = col.get("inferred_type") or "" + empty_frac = 0.0 + n_rows = col.get("n_rows") + empty_count = col.get("empty_count") + if ( + isinstance(n_rows, (int, float)) and not isinstance(n_rows, bool) and n_rows > 0 + and isinstance(empty_count, (int, float)) and not isinstance(empty_count, bool) + and empty_count > 0 + ): + empty_frac = _clamp(float(empty_count) / float(n_rows), 0.0, 1.0) + completeness = _clamp(completeness - empty_frac, 0.0, 1.0) + issues.append(f"{_pct(empty_frac)} vacíos") - numeric = col.get("numeric") - is_numeric = inferred_type in ("integer", "float", "numeric") or isinstance(numeric, dict) - if isinstance(numeric, dict): - outlier_pct = numeric.get("outlier_pct") - if outlier_pct is not None: - try: - outlier_pct = float(outlier_pct) - except (TypeError, ValueError): - outlier_pct = 0.0 - outlier_pct = _clamp(outlier_pct, 0.0, 1.0) - if outlier_pct > 0: - penalty = min(outlier_pct, 0.3) - validity -= penalty - issues.append(f"{round(outlier_pct * 100)}% outliers") - - # semantic_type declarado pero con baja tasa de match (si la conocemos). - semantic_type = col.get("semantic_type") or "" - match_rate = col.get("match_rate") - if semantic_type and match_rate is not None: - try: - match_rate = float(match_rate) - except (TypeError, ValueError): - match_rate = None - if match_rate is not None: - match_rate = _clamp(match_rate, 0.0, 1.0) - if match_rate < 1.0: - shortfall = 1.0 - match_rate - validity -= 0.5 * shortfall - issues.append( - f"semantic_type '{semantic_type}' con baja coincidencia " - f"({round(match_rate * 100)}%)" - ) - - if "mostly_null" in flags: - validity *= 0.5 - issues.append("mayoritariamente nula") - - validity = _clamp(validity, 0.0, 1.0) - - # --- consistency -------------------------------------------------- - consistency = 1.0 - if "constant" in flags: - consistency = 0.3 - issues.append("columna constante") + # --- validity (con renormalizacion por aplicabilidad) ------------- + # None = no medible -> se excluye del score (no penaliza ni premia). + validity = None + if completeness <= 0.0: + # Columna 100% faltante: no hay valores no nulos sobre los que medir + # conformidad. validity no aplica -> el score sale solo de completeness + # (= 0). Es el peor defecto de calidad posible. + validity = None else: - unique_pct = col.get("unique_pct") - if unique_pct is not None: - try: - unique_pct = float(unique_pct) - except (TypeError, ValueError): - unique_pct = None - if ( - inferred_type == "text" + validity_rate = col.get("validity_rate") + match_rate = col.get("match_rate") + if validity_rate is not None: + # Texto promovido a numero/fecha: parse rate real de la muestra. + v = _num(validity_rate, None) + if v is not None: + validity = _clamp(v, 0.0, 1.0) + if validity < 1.0: + kind = ( + "número" if inferred_type == "numeric" + else "fecha" if inferred_type == "datetime" + else inferred_type or "su tipo" + ) + issues.append( + f"{_pct(1.0 - validity)} no parsea al tipo {kind}" + ) + elif inferred_type in _NATIVE_TYPED: + # Tipo nativo garantizado por el almacen: no hay valores que no + # parseen. validity = 1.0 (no se confunde con tener outliers). + validity = 1.0 + elif semantic_type and match_rate is not None: + v = _num(match_rate, None) + if v is not None: + validity = _clamp(v, 0.0, 1.0) + if validity < 1.0: + issues.append( + f"{_pct(1.0 - validity)} no casa con el " + f"formato «{semantic_type}»" + ) + else: + # Texto libre / categorica sin semantica: no hay criterio honesto + # de validez. No aplica. + validity = None + + # --- observations (NO bajan el score) ----------------------------- + numeric = col.get("numeric") + if isinstance(numeric, dict): + # outlier_pct viene en escala 0-100 desde describe_numeric (z-score>3). + outlier_pct = _num(numeric.get("outlier_pct"), None) + if outlier_pct is not None and outlier_pct >= 0.05: + observations.append( + f"{_pct(outlier_pct / 100.0)} de valores atípicos (z-score>3): " + "revisar si son errores u observaciones legítimas" + ) + skew = _num(numeric.get("skew"), None) + if skew is not None and abs(skew) >= 1.0: + observations.append( + f"asimetría fuerte (skew={round(skew, 2)}): considerar " + "re-expresión antes de modelar" + ) + + if "constant" in flags: + observations.append( + "columna constante: aporta poca información para el análisis" + ) + + unique_pct = _num(col.get("unique_pct"), None) + is_id = ( + "possible_id" in flags + or "high_cardinality" in flags + or ( + inferred_type in ("text", "categorical") and unique_pct is not None and _clamp(unique_pct, 0.0, 1.0) > 0.9 - ): - consistency = 0.6 - issues.append("posible id de alta cardinalidad") - - consistency = _clamp(consistency, 0.0, 1.0) - - # --- score agregado ---------------------------------------------- - score = round( - 100.0 * (0.5 * completeness + 0.3 * validity + 0.2 * consistency), - 1, + ) ) + if is_id: + observations.append( + "valores casi únicos: posible identificador (no es un defecto de calidad)" + ) - # Silencia warnings sobre la variable de tipo no usada. - _ = is_numeric + # --- score agregado con renormalizacion --------------------------- + applicable = ["completeness"] + num = _W_COMPLETENESS * completeness + den = _W_COMPLETENESS + if validity is not None: + applicable.append("validity") + num += _W_VALIDITY * validity + den += _W_VALIDITY + score = round(100.0 * num / den, 1) if den > 0 else 0.0 return { "score": score, "completeness": completeness, "validity": validity, - "consistency": consistency, + "dimensions": {"completeness": completeness, "validity": validity}, + "applicable": applicable, "issues": issues, + "observations": observations, } +def _pct(frac: float) -> str: + """Formatea una fraccion 0-1 como porcentaje honesto: «N%» si >=1%, «0.N%» + por debajo (para no mostrar «0%» cuando hay un defecto real pequeño).""" + p = frac * 100.0 + if p >= 1.0: + return f"{round(p)}%" + return f"{p:.1f}%" + + +def _num(x, default): + """Convierte x a float; devuelve `default` si es None o no parseable.""" + if x is None: + return default + if isinstance(x, bool): + return default + try: + return float(x) + except (TypeError, ValueError): + return default + + def _clamp(x: float, lo: float, hi: float) -> float: """Recorta x al rango [lo, hi].""" if x < lo: diff --git a/python/functions/datascience/column_quality_score_test.py b/python/functions/datascience/column_quality_score_test.py index 6ed0d7e2..6f862ec6 100644 --- a/python/functions/datascience/column_quality_score_test.py +++ b/python/functions/datascience/column_quality_score_test.py @@ -1,4 +1,12 @@ -"""Tests para column_quality_score.""" +"""Tests para column_quality_score (nueva fórmula, report 2046). + +Verifica las invariantes de la fórmula de calidad: + - completeness (0.6) + validity (0.4) con renormalización por aplicabilidad. + - Los OUTLIERS no bajan el score (van a observations, no a issues). + - Columnas constantes e ids no bajan el score (observations). + - Sin doble conteo de la falta de datos. + - all-null -> score 0; función pura (no muta el input). +""" import os import sys @@ -9,11 +17,11 @@ from column_quality_score import column_quality_score def _clean_numeric_col() -> dict: - """ColumnProfile de una columna numerica sana, sin problemas.""" + """ColumnProfile de una columna numérica nativa sana, sin problemas.""" return { "name": "edad", "physical_type": "INTEGER", - "inferred_type": "integer", + "inferred_type": "numeric", "semantic_type": "", "count": 1000, "n_rows": 1000, @@ -28,85 +36,163 @@ def _clean_numeric_col() -> dict: } +# --------------------------------------------------------------------------- # +# Golden +# --------------------------------------------------------------------------- # def test_clean_column_high_score(): out = column_quality_score(_clean_numeric_col()) - assert out["score"] > 90 + assert out["score"] == 100.0 assert out["completeness"] == 1.0 assert out["validity"] == 1.0 - assert out["consistency"] == 1.0 + assert out["applicable"] == ["completeness", "validity"] assert out["issues"] == [] + assert out["observations"] == [] -def test_half_null_lowers_completeness_and_score(): +def test_weights_60_40_native_type(): + """30% nulos en numérica nativa: score = 100*(0.6*0.7 + 0.4*1.0) = 82.""" col = _clean_numeric_col() - col["null_count"] = 500 - col["null_pct"] = 0.5 - clean_score = column_quality_score(_clean_numeric_col())["score"] + col["null_pct"] = 0.30 + col["null_count"] = 300 out = column_quality_score(col) - assert out["completeness"] == 0.5 - assert out["score"] < clean_score - assert any("nulos" in issue for issue in out["issues"]) + assert out["completeness"] == 0.7 + assert out["validity"] == 1.0 + assert out["score"] == 82.0 + assert any("nulos" in i for i in out["issues"]) -def test_constant_column_flags_issue(): +# --------------------------------------------------------------------------- # +# Outliers FUERA del score +# --------------------------------------------------------------------------- # +def test_outliers_do_not_penalize_score(): + """Columna con outliers pero sin nulos -> score máximo; outliers en observations.""" + col = _clean_numeric_col() + col["numeric"] = {"outlier_pct": 18.0, "skew": 0.2} # 18% atípicos (escala 0-100) + out = column_quality_score(col) + assert out["score"] == 100.0 # los outliers NO bajan la calidad + assert out["validity"] == 1.0 + # No aparecen como problema de calidad... + assert not any("atípic" in i or "outlier" in i for i in out["issues"]) + # ...sino como observación analítica. + assert any("atípic" in o for o in out["observations"]) + + +def test_nulls_lower_score_more_than_outliers(): + """Vacíos sí penalizan; outliers no: comparar las dos columnas.""" + con_nulos = _clean_numeric_col() + con_nulos["null_pct"] = 0.30 + con_outliers = _clean_numeric_col() + con_outliers["numeric"] = {"outlier_pct": 30.0} + assert column_quality_score(con_nulos)["score"] < \ + column_quality_score(con_outliers)["score"] + + +# --------------------------------------------------------------------------- # +# Validity: aplicabilidad y renormalización +# --------------------------------------------------------------------------- # +def test_validity_from_parse_rate_lowers_score(): + """Numérica como texto con 20% basura: validity=0.8 -> score=92.""" + col = { + "name": "precio_txt", "inferred_type": "numeric", "semantic_type": "decimal", + "null_pct": 0.0, "validity_rate": 0.80, "flags": [], "numeric": None, + } + out = column_quality_score(col) + assert out["validity"] == 0.8 + assert out["score"] == 92.0 # 100*(0.6 + 0.4*0.8) + assert any("no parsea" in i for i in out["issues"]) + + +def test_validity_from_match_rate(): + """Texto con semantic_type y 5% no conforme: validity=0.95.""" + col = { + "name": "email", "inferred_type": "text", "semantic_type": "email", + "null_pct": 0.0, "match_rate": 0.95, "unique_pct": 0.5, "flags": [], + } + out = column_quality_score(col) + assert out["validity"] == 0.95 + assert out["score"] == 98.0 # 100*(0.6 + 0.4*0.95) + assert any("no casa" in i for i in out["issues"]) + + +def test_free_text_renormalizes_to_completeness_only(): + """Texto libre sin semántica: validity no aplica -> score = 100*completeness.""" + col = { + "name": "comentario", "inferred_type": "text", "semantic_type": "", + "null_pct": 0.30, "unique_pct": 0.5, "flags": [], "numeric": None, + } + out = column_quality_score(col) + assert out["validity"] is None + assert out["applicable"] == ["completeness"] + assert out["completeness"] == 0.7 + assert out["score"] == 70.0 # renormalizado a solo completeness + + +# --------------------------------------------------------------------------- # +# Casos límite (report §4.6) +# --------------------------------------------------------------------------- # +def test_all_null_column_scores_zero(): + col = _clean_numeric_col() + col["null_pct"] = 1.0 + col["null_count"] = 1000 + out = column_quality_score(col) + assert out["completeness"] == 0.0 + assert out["validity"] is None # no medible sin valores no nulos + assert out["score"] == 0.0 + + +def test_constant_column_scores_full_and_is_observation(): + """Columna constante: dato válido y completo -> score 100; baja info = observación.""" col = _clean_numeric_col() col["flags"] = ["constant"] col["distinct_count"] = 1 col["unique_pct"] = 0.001 out = column_quality_score(col) - assert out["consistency"] == 0.3 - assert any("constante" in issue for issue in out["issues"]) + assert out["score"] == 100.0 # NO se castiga la baja informatividad + assert not any("constante" in i for i in out["issues"]) + assert any("constante" in o for o in out["observations"]) +def test_high_cardinality_id_scores_full_and_is_observation(): + """Id de alta cardinalidad: unicidad perfecta -> score 100; posible id = observación.""" + col = { + "name": "uuid", "inferred_type": "text", "semantic_type": "", + "null_pct": 0.0, "unique_pct": 0.99, "flags": ["possible_id"], + "numeric": None, + } + out = column_quality_score(col) + assert out["score"] == 100.0 + assert not any("identificador" in i for i in out["issues"]) + assert any("identificador" in o for o in out["observations"]) + + +def test_mostly_null_no_double_counts_validity(): + """85% nulos: solo completeness penaliza; validity nativa sigue 1.0 (sin doble castigo).""" + col = _clean_numeric_col() + col["null_pct"] = 0.85 + col["flags"] = ["mostly_null"] + out = column_quality_score(col) + assert out["validity"] == 1.0 # ya no se multiplica por 0.5 + # score = 100*(0.6*0.15 + 0.4*1.0) = 49 + assert out["score"] == 49.0 + assert not any("mayoritariamente" in o for o in out["observations"]) + + +# --------------------------------------------------------------------------- # +# Robustez +# --------------------------------------------------------------------------- # def test_empty_dict_does_not_crash(): out = column_quality_score({}) assert isinstance(out["score"], float) assert out["completeness"] == 1.0 assert 0.0 <= out["score"] <= 100.0 assert isinstance(out["issues"], list) - - -def test_outliers_penalize_validity(): - col = _clean_numeric_col() - col["numeric"] = {"outlier_pct": 0.2} - out = column_quality_score(col) - assert out["validity"] < 1.0 - assert any("outliers" in issue for issue in out["issues"]) - - -def test_mostly_null_flag_halves_validity(): - col = _clean_numeric_col() - col["null_pct"] = 0.85 - col["flags"] = ["mostly_null"] - out = column_quality_score(col) - assert out["validity"] == 0.5 - assert any("mayoritariamente nula" in issue for issue in out["issues"]) - - -def test_high_cardinality_text_flagged_as_id(): - col = { - "name": "uuid", - "inferred_type": "text", - "semantic_type": "", - "null_pct": 0.0, - "unique_pct": 0.99, - "flags": [], - "numeric": None, - } - out = column_quality_score(col) - assert out["consistency"] < 1.0 - assert any("alta cardinalidad" in issue for issue in out["issues"]) + assert isinstance(out["observations"], list) def test_none_values_treated_defensively(): col = { - "name": "x", - "inferred_type": None, - "semantic_type": None, - "null_pct": None, - "unique_pct": None, - "flags": None, - "numeric": None, + "name": "x", "inferred_type": None, "semantic_type": None, + "null_pct": None, "unique_pct": None, "flags": None, "numeric": None, } out = column_quality_score(col) assert out["completeness"] == 1.0 diff --git a/python/functions/datascience/summarize_table_duckdb.md b/python/functions/datascience/summarize_table_duckdb.md index e9c9213f..006e61bc 100644 --- a/python/functions/datascience/summarize_table_duckdb.md +++ b/python/functions/datascience/summarize_table_duckdb.md @@ -3,7 +3,7 @@ name: summarize_table_duckdb kind: function lang: py domain: datascience -version: "1.0.0" +version: "1.1.0" purity: impure signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict" description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)." @@ -64,6 +64,7 @@ else: - **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima). - **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75. - **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`). +- **`duplicate_rows`/`duplicate_pct` se pueblan push-down** (desde v1.1.0) con `count(*)` sobre `SELECT DISTINCT *` (sin traer filas a RAM): `duplicate_rows = n_rows - filas_distintas`, `duplicate_pct` en fraccion 0-1. Habilitan la dimension de unicidad de registro del score de dataset (`profile_table` paso 6). Si la tabla tiene tipos no comparables con `DISTINCT` (BLOB/LIST/MAP) la query degrada y ambas vuelven a `None` (renormaliza el score a solo `cell_quality`). - **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica). - **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage). - **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base. diff --git a/python/functions/datascience/summarize_table_duckdb.py b/python/functions/datascience/summarize_table_duckdb.py index 641c230f..93e5ebb1 100644 --- a/python/functions/datascience/summarize_table_duckdb.py +++ b/python/functions/datascience/summarize_table_duckdb.py @@ -196,6 +196,21 @@ def summarize_table_duckdb( sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0 ) + # Unicidad de registro: filas duplicadas via COUNT de filas distintas + # push-down (DISTINCT *), sin traer filas a RAM. Habilita la dimension + # de uniqueness del score de dataset (1 - duplicate_pct). Degrada a None + # si la tabla tiene tipos no comparables con DISTINCT (BLOB/LIST/MAP). + duplicate_rows = None + duplicate_pct = None + if n_rows > 0: + dup_res = duckdb_query_readonly( + db_path, f"SELECT count(*) AS c FROM (SELECT DISTINCT * FROM {quoted})" + ) + if dup_res["status"] == "ok" and dup_res["rows"]: + distinct_rows = int(dup_res["rows"][0]["c"]) + duplicate_rows = max(0, n_rows - distinct_rows) + duplicate_pct = duplicate_rows / n_rows # fraccion 0-1 + profile = { "table": table, "source": "duckdb", @@ -203,8 +218,8 @@ def summarize_table_duckdb( "n_rows": n_rows, "n_cols": len(columns), "size_bytes": None, - "duplicate_rows": None, - "duplicate_pct": None, + "duplicate_rows": duplicate_rows, + "duplicate_pct": duplicate_pct, "constant_cols": constant_cols, "all_null_cols": all_null_cols, "null_cell_pct": null_cell_pct, diff --git a/python/functions/datascience/summarize_table_duckdb_test.py b/python/functions/datascience/summarize_table_duckdb_test.py index 57ff5485..ad8f9578 100644 --- a/python/functions/datascience/summarize_table_duckdb_test.py +++ b/python/functions/datascience/summarize_table_duckdb_test.py @@ -54,6 +54,30 @@ def test_shape_y_metadatos_tabla(db): assert profile["correlations"] is None +def test_duplicate_pct_sin_duplicados(db): + """Tabla con todas las filas distintas: duplicate_pct = 0, no None.""" + profile = summarize_table_duckdb(db, "ventas")["profile"] + assert profile["duplicate_rows"] == 0 + assert profile["duplicate_pct"] == 0.0 + + +def test_duplicate_pct_con_duplicados(tmp_path): + """Filas repetidas: duplicate_rows/duplicate_pct se pueblan push-down.""" + path = str(tmp_path / "dups.duckdb") + con = duckdb.connect(path) + con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)") + # 5 filas, 2 de ellas idénticas a otras -> 2 duplicadas sobre 5 = 0.4. + con.execute( + "INSERT INTO t VALUES " + "(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')" + ) + con.close() + profile = summarize_table_duckdb(path, "t")["profile"] + assert profile["n_rows"] == 5 + assert profile["duplicate_rows"] == 2 + assert profile["duplicate_pct"] == 0.4 + + def test_column_profile_shape(db): profile = summarize_table_duckdb(db, "ventas")["profile"] by_name = {c["name"]: c for c in profile["columns"]} diff --git a/python/functions/pipelines/profile_table.md b/python/functions/pipelines/profile_table.md index 2808965a..82a7a406 100644 --- a/python/functions/pipelines/profile_table.md +++ b/python/functions/pipelines/profile_table.md @@ -4,7 +4,7 @@ kind: pipeline lang: py domain: pipelines purity: impure -version: "1.0.0" +version: "1.1.0" signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict" description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla." tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries] @@ -114,3 +114,12 @@ para auditar la calidad de una tabla ya productiva. Reemplaza orquestar a mano Formatos exoticos pueden descartarse silenciosamente del calculo numerico. - `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red). +- **Score de calidad (report 2046, desde v1.1.0).** Paso 5: cada columna recibe + `quality_score` de `column_quality_score` con la formula 60/40 + (completeness/validity); al promocionar texto a numero/fecha se expone + `col["validity_rate"]` (parse rate de la muestra) para alimentar la dimension + validity. Paso 6: el score de dataset NO es la media simple — es + `100 * (0.85*cell_quality + 0.15*row_uniqueness)`, donde + `cell_quality = media(score_col/100)` y `row_uniqueness = 1 - duplicate_pct`. + Si `duplicate_pct` es `None` (backend sin calcularlo) el score se renormaliza + a solo `cell_quality`. Los outliers NO bajan el score (van a `observations`). diff --git a/python/functions/pipelines/profile_table.py b/python/functions/pipelines/profile_table.py index 8a0077af..17321805 100644 --- a/python/functions/pipelines/profile_table.py +++ b/python/functions/pipelines/profile_table.py @@ -477,9 +477,18 @@ def profile_table( if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE: col["inferred_type"] = "numeric" inferred = "numeric" + # Tasa de parseo real de la muestra: alimenta la + # dimension validity de column_quality_score (fraccion + # de valores conformes al tipo numerico promovido). + col["validity_rate"] = len(ok) / len(vals) elif semantic in _DATETIME_SEMANTIC: col["inferred_type"] = "datetime" inferred = "datetime" + # Tasa de parseo de la muestra a fecha (mismo papel que el + # parse rate numerico) para la dimension validity. + parsed_dt = [_to_ordinal_days(v) for v in vals] + ok_dt = [d for d in parsed_dt if d is not None] + col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None # 4) Enriquecer segun el inferred_type final. if inferred == "numeric": @@ -506,11 +515,36 @@ def profile_table( # 5) Score de calidad por columna. col["quality_score"] = column_quality_score(col).get("score") - # 6) Score agregado de la tabla (media de columnas). + # 6) Score agregado de la tabla (report 2046): NO media simple. + # cell_quality = media de los scores de columna, en [0,1]. + # row_uniqueness = 1 - duplicate_pct (unicidad de registro). + # score = 100 * (0.85*cell_quality + 0.15*row_uniqueness). + # Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular. scores = [ c["quality_score"] for c in cols if c.get("quality_score") is not None ] - prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None + if scores: + cell_quality = (sum(scores) / len(scores)) / 100.0 + dup_pct = prof.get("duplicate_pct") + if dup_pct is not None: + try: + d = float(dup_pct) + except (TypeError, ValueError): + d = None + else: + d = None + if d is not None: + # Tolerar escala 0-100 por si algun backend la entrega asi. + if d > 1.0: + d = d / 100.0 + row_uniqueness = max(0.0, min(1.0, 1.0 - d)) + prof["quality_score"] = round( + 100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1 + ) + else: + prof["quality_score"] = round(100.0 * cell_quality, 1) + else: + prof["quality_score"] = None # 7) Candidatos a clave. key_candidates = []