From 7fa19d65db4b7cc385309e25022478cd3148eebc Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Tue, 30 Jun 2026 20:38:39 +0200 Subject: [PATCH] =?UTF-8?q?feat(eda):=20cap=C3=ADtulo=20MISSINGNESS=20?= =?UTF-8?q?=E2=80=94=20patrones=20de=20datos=20faltantes=20(co-ocurrencia?= =?UTF-8?q?=20+=20MCAR/MAR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Añade el capítulo `missingness` al motor AutomaticEDA, complemento natural de `calidad`: donde calidad reporta cuánto falta por columna, este capítulo analiza el PATRÓN de los nulos — dónde faltan y si las columnas faltan juntas (co-ocurrencia de ausencias), la señal que distingue MCAR de MAR antes de imputar. Capítulo (`chapters/missingness.py`), registrado en `chapters_registry.py` justo tras `calidad`: - Resumen global: % de celdas faltantes, columnas con nulos, filas completas vs incompletas. - Ranking por columna (tabla + barras horizontales). - Co-ocurrencia: correlación de las máscaras is-null entre columnas (heatmap + tabla de los pares que co-faltan, con co-faltantes y Jaccard). - Patrones de fila más frecuentes (estilo matriz de missingno). - Lectura MCAR/MAR exploratoria (heurística por correlación/solape de ausencias, no confirmatoria), que cita la evidencia concreta. - Términos de glosario clicables: missingness, MCAR, MAR. La máscara is-null por fila de TODAS las columnas (numéricas y categóricas) se construye con un push-down DuckDB sobre ctx['db_path']/table (mismo patrón que el capítulo agregación), con fallback a ctx['raw_numeric'] cuando no hay BD. Activa solo si la tabla tiene nulos; si no, devuelve None. Funciones nuevas del grupo `eda` (dominio datascience): - extract_null_mask (impura): máscara is-null por fila vía query_fn. - missingness_overview (pura): resumen global + filas completas/incompletas. - missingness_correlation (pura): correlación de ausencias + pares + Jaccard, reutiliza pearson. 
- missingness_row_patterns (pura): patrones de fila más comunes. - missingness_corr_heatmap_figure / missingness_rank_bar_figure (impuras): figuras. Verificado: EDA de titanic genera el capítulo en PDF + PPTX + MD con Cabin 77.1%, Age 19.9% y la co-ocurrencia Age↔Cabin (158 filas). Suite completa de AutomaticEDA + render_automatic_eda en verde (125 passed); tests por función y por capítulo; fn index sin error. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../automatic_eda/chapters/missingness.py | 594 ++++++++++++++++++ .../chapters/missingness_test.py | 162 +++++ .../automatic_eda/chapters_registry.py | 1 + .../datascience/extract_null_mask.md | 97 +++ .../datascience/extract_null_mask.py | 101 +++ .../datascience/extract_null_mask_test.py | 116 ++++ .../missingness_corr_heatmap_figure.md | 103 +++ .../missingness_corr_heatmap_figure.py | 158 +++++ .../missingness_corr_heatmap_figure_test.py | 62 ++ .../datascience/missingness_correlation.md | 68 ++ .../datascience/missingness_correlation.py | 120 ++++ .../missingness_correlation_test.py | 115 ++++ .../datascience/missingness_overview.md | 99 +++ .../datascience/missingness_overview.py | 116 ++++ .../datascience/missingness_overview_test.py | 146 +++++ .../missingness_rank_bar_figure.md | 93 +++ .../missingness_rank_bar_figure.py | 150 +++++ .../missingness_rank_bar_figure_test.py | 64 ++ .../datascience/missingness_row_patterns.md | 65 ++ .../datascience/missingness_row_patterns.py | 107 ++++ .../missingness_row_patterns_test.py | 87 +++ 21 files changed, 2624 insertions(+) create mode 100644 python/functions/datascience/automatic_eda/chapters/missingness.py create mode 100644 python/functions/datascience/automatic_eda/chapters/missingness_test.py create mode 100644 python/functions/datascience/extract_null_mask.md create mode 100644 python/functions/datascience/extract_null_mask.py create mode 100644 python/functions/datascience/extract_null_mask_test.py create mode 100644 
python/functions/datascience/missingness_corr_heatmap_figure.md create mode 100644 python/functions/datascience/missingness_corr_heatmap_figure.py create mode 100644 python/functions/datascience/missingness_corr_heatmap_figure_test.py create mode 100644 python/functions/datascience/missingness_correlation.md create mode 100644 python/functions/datascience/missingness_correlation.py create mode 100644 python/functions/datascience/missingness_correlation_test.py create mode 100644 python/functions/datascience/missingness_overview.md create mode 100644 python/functions/datascience/missingness_overview.py create mode 100644 python/functions/datascience/missingness_overview_test.py create mode 100644 python/functions/datascience/missingness_rank_bar_figure.md create mode 100644 python/functions/datascience/missingness_rank_bar_figure.py create mode 100644 python/functions/datascience/missingness_rank_bar_figure_test.py create mode 100644 python/functions/datascience/missingness_row_patterns.md create mode 100644 python/functions/datascience/missingness_row_patterns.py create mode 100644 python/functions/datascience/missingness_row_patterns_test.py diff --git a/python/functions/datascience/automatic_eda/chapters/missingness.py b/python/functions/datascience/automatic_eda/chapters/missingness.py new file mode 100644 index 00000000..3a7034cd --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/missingness.py @@ -0,0 +1,594 @@ +"""Missingness chapter (MISSINGNESS) — patterns of missing data. + +Complements the CALIDAD chapter: where CALIDAD reports *how much* is missing per +column (the null percentage that lowers the completeness score), this chapter +reports the **pattern** of the missing data — whether columns tend to be missing +*together* (co-occurrence of absences) or independently. 
That distinction is what +separates data that is missing completely at random ([[term:mcar]]MCAR[[/term]]) +from data missing as a function of another variable ([[term:mar]]MAR[[/term]]), +which is the key question to settle before imputing or modelling. + +The chapter activates only when the table actually has missing data (at least one +column with a null in the aggregated profile); otherwise it returns ``None`` and +disappears from the document. + +Sections, in order: + +1. **Resumen global** — % of missing cells in the dataset, number of columns with + nulls, and complete rows (no missing) vs incomplete rows (≥1 missing). +2. **Ranking por columna** — columns sorted by their null percentage, with a + horizontal bar figure. +3. **Co-ocurrencia de ausencias** — the correlation of the binary is-null masks + between columns (which columns tend to be missing together): a heatmap plus a + table of the top column pairs that co-miss. +4. **Patrones de fila** — the most frequent "which columns are missing together" + row patterns, in the style of missingno's pattern matrix. +5. **Lectura MCAR/MAR** — an interpretive, *exploratory* note (not a confirmatory + test such as Little's) reading the absence correlations as a hint of MCAR + (independent absences) vs MAR (co-occurring absences). + +The aggregate per-column null counts come from the ``eda`` group ``TableProfile`` +(``columns[i]['null_count'] / 'null_pct'`` and the table-level ``null_cell_pct``). +The per-row is-null mask needed for co-occurrence is built from raw data: a single +DuckDB push-down over ``ctx['db_path'] / ctx['table']`` (same pattern as the +AGREGACION chapter) covering ALL columns, with a fallback to the numeric-only +``ctx['raw_numeric']`` when no database is reachable. 
All the heavy lifting is +delegated to pure registry functions (``missingness_overview``, +``missingness_correlation``, ``missingness_row_patterns``) and two figure helpers +(``missingness_rank_bar_figure``, ``missingness_corr_heatmap_figure``); every one +is imported lazily and degrades to an honest note so this chapter never raises. + +Contract: build_(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z". +""" + +from __future__ import annotations + +from .. import model + +CHAPTER_VERSION = "1.0.0" +CHAPTER_ID = "missingness" +CHAPTER_TITLE = "Datos faltantes" + +# Sample cap for the per-row is-null mask push-down. Co-occurrence and row +# patterns are computed on this sample; the global % of missing cells and the +# per-column ranking come from the (exact) aggregated profile instead. +MASK_SAMPLE = 5000 +# Thresholds for the MCAR/MAR heuristic note. A pair counts as a *strong* +# co-occurrence when the absence correlation alone is high; as a *partial* +# co-occurrence when the absences overlap materially (high Jaccard) even if the +# Pearson correlation is modest — the usual case when one column is missing far +# more often than the other (e.g. Cabin 77% vs Age 20% in Titanic), which dilutes +# the correlation while the rows still co-miss in absolute terms. +_CORR_STRONG = 0.30 +_JACCARD_NOTABLE = 0.20 +# Rows shown in the top-pairs and row-patterns tables (bounded, never silently +# truncated: the table note reports the full count). +_TOP_PAIRS = 12 +_TOP_PATTERNS = 12 +# Truncate long column names in tables (the renderer also wraps). +_LABEL_MAX = 28 + +# Glossary terms this chapter explains (contract §11.1). Registered in the shared +# collector and marked clickable on their first appearance. +_TERMS = { + "missingness": ( + "Patrón de datos faltantes (missingness)", + "El patrón con el que faltan los datos: cuánto falta, en qué columnas y " + "si las ausencias de unas columnas coinciden (co-ocurren) con las de " + "otras. 
Analizarlo —no solo contar nulos— distingue datos que faltan al " + "azar (MCAR) de los que faltan en función de otra variable (MAR), lo que " + "decide cómo imputar o si descartar filas sin sesgar el análisis.", + ), + "mcar": ( + "MCAR (Missing Completely At Random)", + "Los valores faltan de forma independiente de cualquier dato, observado o " + "no: las ausencias de unas columnas no se relacionan entre sí ni con los " + "valores. Es el caso más benigno —descartar filas o imputar la media no " + "introduce sesgo—, pero rara vez se cumple del todo en datos reales.", + ), + "mar": ( + "MAR (Missing At Random)", + "La probabilidad de que un valor falte depende de OTRAS variables " + "observadas (p. ej. una medición que falta más en cierto grupo). Las " + "ausencias co-ocurren entre columnas o se relacionan con los valores de " + "otras; imputar exige condicionar en esas variables para no sesgar. La " + "co-ocurrencia fuerte de ausencias es un indicio (exploratorio) de MAR.", + ), +} + + +# --------------------------------------------------------------------------- # +# Small defensive formatters (own copy: the chapter never imports siblings). +# --------------------------------------------------------------------------- # +def _fmt_int(value) -> str: + if value is None: + return "—" + try: + return f"{int(round(float(value))):,}".replace(",", ".") + except (TypeError, ValueError): + return model._safe_str(value) + + +def _fmt_pct(value, decimals: int = 1) -> str: + """Format an already-0-100 value as a percentage. 
None -> placeholder.""" + if value is None: + return "—" + try: + return f"{float(value):.{decimals}f}%" + except (TypeError, ValueError): + return model._safe_str(value) + + +def _fmt_num(value, decimals: int = 3) -> str: + if value is None: + return "—" + try: + f = float(value) + except (TypeError, ValueError): + return model._safe_str(value) + if f != f: # NaN + return "—" + text = f"{f:.{decimals}f}".rstrip("0").rstrip(".") + return text if text else "0" + + +def _truncate(text, limit: int = _LABEL_MAX) -> str: + s = model._safe_str(text) + if len(s) <= limit: + return s + return s[: max(1, limit - 1)].rstrip() + "…" + + +def _term(key: str, label: str, mark: bool) -> str: + if mark: + return f"[[term:{key}]]**{label}**[[/term]]" + return f"**{label}**" + + +# --------------------------------------------------------------------------- # +# Profile reads (exact, all rows). +# --------------------------------------------------------------------------- # +def _null_count_of(col: dict): + """Best-effort null count of a column: ``null_count`` or null_pct*n_rows.""" + nc = col.get("null_count") + if isinstance(nc, (int, float)) and not isinstance(nc, bool): + return int(nc) + np_ = col.get("null_pct") + nr = col.get("n_rows") + if isinstance(np_, (int, float)) and isinstance(nr, (int, float)): + return int(round(float(np_) * float(nr))) + return 0 + + +def _columns_with_nulls(profile: dict): + """Return ``[(name, null_count, null_pct_0_100)]`` for columns with nulls, + sorted by null percentage descending. 
Reads the aggregated profile (exact).""" + cols = profile.get("columns") or [] + out = [] + for c in cols: + if not isinstance(c, dict): + continue + nc = _null_count_of(c) + if nc <= 0: + continue + np_ = c.get("null_pct") + nr = c.get("n_rows") or profile.get("n_rows") + if isinstance(np_, (int, float)) and not isinstance(np_, bool): + pct = float(np_) * 100.0 if np_ <= 1.0 else float(np_) + elif nr: + pct = nc / float(nr) * 100.0 + else: + pct = None + out.append((c.get("name") or "(col)", nc, pct)) + out.sort(key=lambda t: (t[2] if t[2] is not None else -1.0), reverse=True) + return out + + +def _global_missing_pct(profile: dict): + """Table-level % of missing cells (0-100), exact, from the profile.""" + v = profile.get("null_cell_pct") + if isinstance(v, (int, float)) and not isinstance(v, bool): + return float(v) * 100.0 if v <= 1.0 else float(v) + return None + + +# --------------------------------------------------------------------------- # +# Per-row is-null mask (sample): DuckDB push-down, fallback to raw_numeric. +# --------------------------------------------------------------------------- # +def _build_query_fn(ctx: dict): + """Return ``(query_fn, table)`` for a DuckDB-backed ctx, or ``(None, None)``. + + Mirrors build_eda_render_ctx: a read-only closure over the registry wrapper. + Only DuckDB is supported here; any other backend degrades to raw_numeric.""" + db_path = ctx.get("db_path") + table = ctx.get("table") + if not db_path or not table: + return None, None + try: + from infra import duckdb_query_readonly + except Exception: # noqa: BLE001 — wrapper unavailable -> degrade. + return None, None + + def query_fn(sql): + return duckdb_query_readonly(db_path, sql) + + return query_fn, table + + +def _null_mask(profile: dict, ctx: dict): + """Build the per-row is-null mask ``{col: [0/1, ...]}``. 
+ + Tries a single DuckDB push-down over ALL columns first (so categorical + columns like Cabin are covered, not only numeric ones); falls back to the + numeric-only ``ctx['raw_numeric']`` (None -> missing); returns ``(None, 0, + None)`` when neither is reachable. Never raises. + Returns ``(mask, n_sampled, source)`` with source in {"db","raw_numeric"}. + """ + cols = profile.get("columns") or [] + names = [c.get("name") for c in cols + if isinstance(c, dict) and c.get("name")] + # 1) DuckDB push-down over every column (covers categoricals too). + query_fn, table = _build_query_fn(ctx) + if query_fn is not None and names: + try: + from datascience.extract_null_mask import extract_null_mask + + res = extract_null_mask(query_fn, table, names, max_rows=MASK_SAMPLE) + if isinstance(res, dict) and res.get("status") == "ok": + mask = res.get("mask") or {} + if mask: + return mask, int(res.get("n") or 0), "db" + except Exception: # noqa: BLE001 — degrade to raw_numeric. + pass + # 2) Fallback: numeric-only mask derived from raw_numeric (None -> missing). + rn = ctx.get("raw_numeric") + if isinstance(rn, dict) and rn: + mask = {} + for col, vals in rn.items(): + if isinstance(vals, (list, tuple)): + mask[col] = [1 if v is None else 0 for v in vals] + if mask: + n = max((len(v) for v in mask.values()), default=0) + return mask, n, "raw_numeric" + return None, 0, None + + +# --------------------------------------------------------------------------- # +# Lazy registry delegations (each degrades to None on any failure). 
+# --------------------------------------------------------------------------- # +def _overview(mask: dict): + try: + from datascience.missingness_overview import missingness_overview + + out = missingness_overview(mask) + return out if isinstance(out, dict) else None + except Exception: # noqa: BLE001 + return None + + +def _correlation(mask: dict, top_k: int): + try: + from datascience.missingness_correlation import missingness_correlation + + out = missingness_correlation(mask, top_k=top_k) + return out if isinstance(out, dict) else None + except Exception: # noqa: BLE001 + return None + + +def _row_patterns(mask: dict, top_n: int): + try: + from datascience.missingness_row_patterns import missingness_row_patterns + + out = missingness_row_patterns(mask, top_n=top_n) + return out if isinstance(out, dict) else None + except Exception: # noqa: BLE001 + return None + + +def _rank_bar_make(names, pcts, title): + def make(): + try: + from datascience.missingness_rank_bar_figure import ( + missingness_rank_bar_figure, + ) + + return missingness_rank_bar_figure(names, pcts, title=title) + except Exception: # noqa: BLE001 — minimal fallback figure. + return _fallback_fig("ranking de nulos no disponible") + + return make + + +def _heatmap_make(matrix, labels, title): + def make(): + try: + from datascience.missingness_corr_heatmap_figure import ( + missingness_corr_heatmap_figure, + ) + + return missingness_corr_heatmap_figure(matrix, labels, title=title) + except Exception: # noqa: BLE001 — minimal fallback figure. + return _fallback_fig("heatmap de co-ocurrencia no disponible") + + return make + + +def _fallback_fig(message: str): + import matplotlib + + matplotlib.use("Agg") + from matplotlib.figure import Figure + + fig = Figure(figsize=(5.0, 2.2)) + ax = fig.add_subplot(111) + ax.text(0.5, 0.5, message, ha="center", va="center") + ax.axis("off") + return fig + + +# --------------------------------------------------------------------------- # +# Block builders. 
# --------------------------------------------------------------------------- #
def _summary_block(profile: dict, with_nulls: list, overview, sampled, n_total):
    """Build the "Resumen de datos faltantes" key/value table.

    Always lists the global missing-cell percentage (read from the profile) and
    the number of columns with nulls. Adds the count of fully-null columns when
    the profile carries a non-empty ``all_null_cols`` list, and the complete /
    incomplete row counts when ``overview`` is a dict. The row counts get a
    "(sobre muestra de N filas)" suffix when ``sampled`` is smaller than
    ``n_total``.
    """
    rows = []
    gpct = _global_missing_pct(profile)
    rows.append(("Celdas faltantes (global)", _fmt_pct(gpct)))
    rows.append(("Columnas con faltantes", str(len(with_nulls))))
    all_null = profile.get("all_null_cols")
    if isinstance(all_null, (list, tuple)) and all_null:
        rows.append(("Columnas 100% faltantes", str(len(all_null))))
    if isinstance(overview, dict):
        cr = overview.get("complete_rows")
        ir = overview.get("incomplete_rows")
        suffix = ""
        # Row counts come from the sampled mask; flag that when it is partial.
        if (isinstance(sampled, int) and isinstance(n_total, (int, float))
                and sampled and n_total and sampled < n_total):
            suffix = f" (sobre muestra de {_fmt_int(sampled)} filas)"
        if cr is not None:
            rows.append(("Filas completas (sin faltantes)",
                         f"{_fmt_int(cr)} ({_fmt_pct(overview.get('complete_pct'))})"
                         + suffix))
        if ir is not None:
            rows.append(("Filas con ≥1 faltante",
                         f"{_fmt_int(ir)} "
                         f"({_fmt_pct(overview.get('incomplete_pct'))})" + suffix))
    return model.KVTable(rows=rows, title="Resumen de datos faltantes")


def _ranking_block(with_nulls: list):
    """Table of ``(column, null count, null %)`` rows, or None when empty.

    Rows keep the order of ``with_nulls``; column names are truncated.
    """
    header = ["Columna", "Faltantes", "% faltante"]
    rows = [[_truncate(n), _fmt_int(c), _fmt_pct(p)] for (n, c, p) in with_nulls]
    if not rows:
        return None
    return model.DataTable(
        header=header, rows=rows, title="Faltantes por columna",
        note="ordenado de más a menos faltante")


def _ranking_figure(with_nulls: list):
    """Bar figure of null % per column, or None when no column has a known %.

    Columns whose percentage is ``None`` are left out of the figure.
    """
    names = [n for (n, _, p) in with_nulls if p is not None]
    pcts = [p for (_, _, p) in with_nulls if p is not None]
    if not names:
        return None
    return model.Figure(
        make=_rank_bar_make(names, pcts, "% de valores faltantes por columna"),
        caption="Porcentaje de valores faltantes por columna (barras).")


def _pairs_block(corr: dict):
    """Top column pairs whose absences co-occur, as a table, or None.

    Reads ``corr['pairs']`` (a list of dicts with ``a``, ``b``, ``corr``,
    ``co_missing`` and ``jaccard``), shows at most ``_TOP_PAIRS`` of them and
    skips entries that are not dicts.
    """
    pairs = (corr or {}).get("pairs") or []
    header = ["Columna A", "Columna B", "Corr. ausencia", "Co-faltan", "Jaccard"]
    rows = []
    for p in pairs[:_TOP_PAIRS]:
        if not isinstance(p, dict):
            continue
        rows.append([
            _truncate(p.get("a")),
            _truncate(p.get("b")),
            _fmt_num(p.get("corr")),
            _fmt_int(p.get("co_missing")),
            _fmt_num(p.get("jaccard")),
        ])
    if not rows:
        return None
    shown = len(rows)
    total = len(pairs)
    note = ("correlación de las máscaras is-null entre columnas; "
            "«Co-faltan» = nº de filas en que ambas faltan a la vez")
    # NOTE(review): ``pairs`` is presumably already capped upstream by the
    # ``top_k`` passed to the correlation helper, in which case this "top X de
    # Y" suffix may never trigger — confirm against missingness_correlation.
    if total > shown:
        note += f" — top {shown} de {total} pares"
    return model.DataTable(header=header, rows=rows,
                           title="Pares de columnas que co-faltan", note=note)


def _heatmap_block(corr: dict):
    """Heatmap figure of the absence correlations, or None.

    Needs at least two entries in ``corr['columns']`` and a non-empty
    ``corr['matrix']``; labels are the column names truncated to 16 chars.
    """
    cols = (corr or {}).get("columns") or []
    matrix = (corr or {}).get("matrix") or []
    if len(cols) < 2 or not matrix:
        return None
    labels = [_truncate(c, 16) for c in cols]
    return model.Figure(
        make=_heatmap_make(matrix, labels, "Co-ocurrencia de ausencias"),
        caption=("Correlación de las ausencias entre columnas (azul = faltan "
                 "juntas; rojo = cuando una falta la otra tiende a estar)."))


def _patterns_block(patterns_res: dict):
    """Table of the most frequent row patterns, or None when there are none.

    Reads ``patterns_res['patterns']`` (dicts with ``missing_cols``, ``n_rows``
    and ``pct``), shows at most ``_TOP_PATTERNS`` and labels a pattern with no
    missing columns as a complete row. When ``patterns_res['n_patterns']`` is
    an int larger than the rows shown, the note reports the full count.
    """
    patterns = (patterns_res or {}).get("patterns") or []
    header = ["Columnas que faltan juntas", "Filas", "%"]
    rows = []
    for p in patterns[:_TOP_PATTERNS]:
        if not isinstance(p, dict):
            continue
        cols = p.get("missing_cols") or []
        if cols:
            label = ", ".join(_truncate(c, 18) for c in cols)
        else:
            label = "(fila completa — sin faltantes)"
        rows.append([label, _fmt_int(p.get("n_rows")), _fmt_pct(p.get("pct"))])
    if not rows:
        return None
    total = (patterns_res or {}).get("n_patterns")
    shown = len(rows)
    note = "cada fila es un patrón de «qué columnas faltan juntas»"
    if isinstance(total, int) and total > shown:
        note += f" — top {shown} de {total} patrones distintos"
    return model.DataTable(header=header, rows=rows,
                           title="Patrones de fila más comunes", note=note)


def _mcar_mar_note(corr: dict, mark: bool):
    """Interpretive, exploratory MCAR/MAR note from the absence correlations.

    Reads ``corr['pairs']`` and picks the first matching reading, in order:

    1. *strong co-occurrence* — some pair has ``corr >= _CORR_STRONG``; the
       pair with the highest correlation is cited.
    2. *strong exclusion* — some pair has ``corr <= -_CORR_STRONG`` (when one
       column is missing the other tends to be present); the most negative
       pair is cited. This is still a non-random pattern, but it is not
       described as "co-occurring", so the text agrees with the sign shown in
       the heatmap and the pairs table.
    3. *partial co-occurrence* — some pair has a positive correlation and
       ``jaccard >= _JACCARD_NOTABLE``; the pair with the highest Jaccard is
       cited together with its co-missing row count.
    4. otherwise the absences are read as compatible with MCAR.

    Non-dict pair entries are ignored; values that cannot be converted to
    float count as 0.0, and NaN never satisfies any threshold.
    """

    def _cf(v):
        # Defensive float conversion: unparseable values behave as "no signal".
        try:
            return float(v)
        except (TypeError, ValueError):
            return 0.0

    pairs = [p for p in ((corr or {}).get("pairs") or []) if isinstance(p, dict)]
    # Sign matters: only positive correlations mean "missing together".
    co_strong = [p for p in pairs if _cf(p.get("corr")) >= _CORR_STRONG]
    anti_strong = [p for p in pairs if _cf(p.get("corr")) <= -_CORR_STRONG]
    partial = [
        p for p in pairs
        if _cf(p.get("corr")) > 0 and _cf(p.get("jaccard")) >= _JACCARD_NOTABLE
    ]
    mcar = _term("mcar", "MCAR", mark)
    mar = _term("mar", "MAR", mark)
    head = (
        "**Lectura exploratoria MCAR/MAR.** Esta es una heurística basada en la "
        "correlación de las ausencias entre columnas, NO un test confirmatorio "
        "(como el de Little); orienta, no demuestra. ")
    if co_strong:
        # Cite the strongest evidence rather than relying on the input order.
        top = max(co_strong, key=lambda p: _cf(p.get("corr")))
        ev = (f"«{model._safe_str(top.get('a'))}» y "
              f"«{model._safe_str(top.get('b'))}» "
              f"(corr {_fmt_num(top.get('corr'))})")
        body = (
            f"Hay ausencias que co-ocurren con fuerza —{ev}—: las columnas no "
            f"faltan de forma independiente, lo que es un indicio de un patrón no "
            f"aleatorio ({mar}). Antes de imputar o descartar filas conviene "
            f"comprobar si la ausencia depende de otra variable observada; en ese "
            f"caso la imputación debería condicionar en ella para no sesgar.")
    elif anti_strong:
        top = min(anti_strong, key=lambda p: _cf(p.get("corr")))
        ev = (f"«{model._safe_str(top.get('a'))}» y "
              f"«{model._safe_str(top.get('b'))}» "
              f"(corr {_fmt_num(top.get('corr'))})")
        body = (
            f"Hay ausencias que se excluyen con fuerza —{ev}—: cuando una "
            f"columna falta la otra tiende a estar presente, así que las "
            f"ausencias no son independientes, lo que es un indicio de un patrón "
            f"no aleatorio ({mar}). Antes de imputar o descartar filas conviene "
            f"comprobar si la ausencia depende de otra variable observada; en ese "
            f"caso la imputación debería condicionar en ella para no sesgar.")
    elif partial:
        top = max(partial, key=lambda p: _cf(p.get("jaccard")))
        ev = (f"«{model._safe_str(top.get('a'))}» y "
              f"«{model._safe_str(top.get('b'))}» faltan a la vez en "
              f"{_fmt_int(top.get('co_missing'))} filas "
              f"(Jaccard {_fmt_num(top.get('jaccard'))})")
        body = (
            f"Hay co-ocurrencia parcial de ausencias —{ev}—: algunas columnas "
            f"tienden a faltar juntas aunque la correlación global sea modesta "
            f"(habitual cuando una columna falta mucho más que la otra). Es un "
            f"indicio de un posible patrón localizado no aleatorio ({mar}); "
            f"conviene revisar si esa ausencia depende de otra variable observada "
            f"antes de imputar, en lugar de asumir que faltan al azar.")
    else:
        body = (
            f"Las ausencias entre columnas no muestran correlación ni solape "
            f"relevante: parecen independientes, lo que es compatible con que "
            f"falten al azar ({mcar}). Aun así, la ausencia podría depender de "
            f"variables no observadas (la heurística no lo descarta).")
    return model.Markdown(text=head + body)


def _intro_block(mark: bool, source):
    """Introductory paragraph of the chapter.

    Adds a caveat when ``source`` is ``"raw_numeric"``, i.e. when the
    co-occurrence analysis only covers the numeric columns.
    """
    missingness = _term("missingness", "missingness", mark)
    text = (
        f"Este capítulo analiza el {missingness} de la tabla: no solo cuánto "
        "falta (eso lo cubre la calidad), sino DÓNDE falta y si las columnas "
        "faltan juntas. La co-ocurrencia de ausencias se calcula sobre la matriz "
        "binaria «is-null» por fila.")
    if source == "raw_numeric":
        text += (" Nota: no se pudo leer la tabla cruda completa, así que la "
                 "co-ocurrencia se limita a las columnas numéricas disponibles.")
    return model.Markdown(text=text)


# --------------------------------------------------------------------------- #
# Entry point.
+# --------------------------------------------------------------------------- # +def build_missingness(profile: dict, ctx: dict): + """Build the missingness Chapter, or None if the table has no missing data.""" + if not isinstance(profile, dict): + profile = {} + ctx = ctx or {} + + with_nulls = _columns_with_nulls(profile) + if not with_nulls: + return None # no missing data anywhere -> chapter does not apply. + + # Register glossary terms (if a collector is present) and mark them clickable. + glossary = ctx.get("glossary") + mark = False + if isinstance(glossary, model.GlossaryCollector): + for key, (label, definition) in _TERMS.items(): + glossary.add(key, label, definition) + mark = True + + # Per-row is-null mask (sample) for co-occurrence and row patterns. + mask, sampled, source = _null_mask(profile, ctx) + overview = _overview(mask) if mask else None + n_total = profile.get("n_rows") + + blocks = [ + model.Heading(text="Cuánto y dónde faltan datos", level=2), + _intro_block(mark, source), + _summary_block(profile, with_nulls, overview, sampled, n_total), + model.Heading(text="Faltantes por columna", level=2), + ] + ranking = _ranking_block(with_nulls) + if ranking is not None: + blocks.append(ranking) + rank_fig = _ranking_figure(with_nulls) + if rank_fig is not None: + blocks.append(rank_fig) + + # Co-occurrence + row patterns need the per-row mask. Without it, say so. 
+ if not mask: + blocks.append(model.Note( + "No se pudo construir la matriz «is-null» por fila (sin acceso a los " + "datos crudos), así que no se analiza la co-ocurrencia de ausencias " + "ni los patrones de fila en este informe.")) + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) + + corr = _correlation(mask, _TOP_PAIRS) or {} + co_blocks = [model.Heading(text="Co-ocurrencia de ausencias", level=2)] + heatmap = _heatmap_block(corr) + if heatmap is not None: + co_blocks.append(heatmap) + pairs = _pairs_block(corr) + if pairs is not None: + co_blocks.append(pairs) + if heatmap is None and pairs is None: + co_blocks.append(model.Note( + "Ninguna pareja de columnas comparte ausencias con variación " + "suficiente para correlacionarlas (p. ej. una sola columna con " + "faltantes), así que no hay co-ocurrencia que mostrar.")) + # Keep the co-occurrence heading next to its heatmap and table. + blocks.append(model.Group(blocks=co_blocks)) + + patterns_res = _row_patterns(mask, _TOP_PATTERNS) or {} + patterns = _patterns_block(patterns_res) + if patterns is not None: + blocks.append(model.Heading(text="Patrones de fila", level=2)) + blocks.append(patterns) + + blocks.append(model.Heading(text="Lectura MCAR / MAR", level=2)) + blocks.append(_mcar_mar_note(corr, mark)) + + return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE, + version=CHAPTER_VERSION, blocks=blocks) diff --git a/python/functions/datascience/automatic_eda/chapters/missingness_test.py b/python/functions/datascience/automatic_eda/chapters/missingness_test.py new file mode 100644 index 00000000..323270e1 --- /dev/null +++ b/python/functions/datascience/automatic_eda/chapters/missingness_test.py @@ -0,0 +1,162 @@ +"""Tests for the MISSINGNESS chapter. + +Covers the Definition of Done for this chapter: + * Activates (non-None Chapter with the expected sections) when the profile has + missing data, building the co-occurrence from the per-row is-null mask. 
+ * Returns None when the table has no missing data at all (edge case). + * Registers the MCAR/MAR/missingness glossary terms. + * The DuckDB push-down path covers categorical columns (not only numeric), + so a categorical column that co-misses with a numeric one is detected. +""" + +import os +import sys + +_HERE = os.path.dirname(os.path.abspath(__file__)) +_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) # python/functions +if _FUNCTIONS not in sys.path: + sys.path.insert(0, _FUNCTIONS) + +from datascience.automatic_eda import model # noqa: E402 +from datascience.automatic_eda.chapters.missingness import ( # noqa: E402 + build_missingness, +) + + +def _titles(chapter): + """Collect heading texts and table/figure titles for assertions.""" + out = [] + for b in chapter.blocks: + kind = getattr(b, "kind", None) + if kind == "heading": + out.append(("heading", getattr(b, "text", ""))) + elif kind in ("data_table", "kv_table"): + out.append((kind, getattr(b, "title", ""))) + elif kind == "group": + for inner in getattr(b, "blocks", []): + ik = getattr(inner, "kind", None) + if ik == "heading": + out.append(("heading", getattr(inner, "text", ""))) + elif ik in ("data_table", "kv_table"): + out.append((ik, getattr(inner, "title", ""))) + elif ik == "figure": + out.append(("figure", getattr(inner, "caption", ""))) + elif kind == "figure": + out.append(("figure", getattr(b, "caption", ""))) + return out + + +def _all_text(chapter): + parts = [] + def walk(blocks): + for b in blocks: + for attr in ("text", "title", "note", "caption"): + v = getattr(b, attr, None) + if v: + parts.append(str(v)) + if getattr(b, "kind", None) == "group": + walk(getattr(b, "blocks", [])) + walk(chapter.blocks) + return "\n".join(parts) + + +def test_returns_none_when_no_missing_data(): + profile = { + "n_rows": 4, + "null_cell_pct": 0.0, + "columns": [ + {"name": "a", "null_count": 0, "null_pct": 0.0, "n_rows": 4}, + {"name": "b", "null_count": 0, "null_pct": 0.0, "n_rows": 
def test_activates_with_cooccurrence_via_raw_numeric():
    """Chapter is built from the raw_numeric fallback and reports the co-missing pair.

    Columns ``a`` and ``b`` are missing in exactly the same rows (0, 1, 2), a
    perfect absence correlation, while ``c`` has no nulls. The context carries
    no ``db_path``, so the chapter has to derive the mask from ``raw_numeric``.
    """
    total_rows = 6

    def column(name, nulls):
        # One per-column profile entry; null_pct follows from the null count.
        return {
            "name": name,
            "null_count": nulls,
            "null_pct": nulls / total_rows,
            "n_rows": total_rows,
        }

    profile = {
        "n_rows": total_rows,
        "null_cell_pct": (0.5 + 0.5 + 0.0) / 3.0,
        "columns": [column("a", 3), column("b", 3), column("c", 0)],
    }
    glossary = model.GlossaryCollector()
    ctx = {
        "raw_numeric": {
            "a": [None, None, None, 1.0, 2.0, 3.0],
            "b": [None, None, None, 4.0, 5.0, 6.0],
        },
        "glossary": glossary,
    }

    chapter = build_missingness(profile, ctx)
    assert chapter is not None
    assert chapter.id == "missingness"
    assert chapter.blocks

    entries = _titles(chapter)

    # Core sections present.
    heading_texts = {label for (kind, label) in entries if kind == "heading"}
    for fragment in ("Cuánto y dónde", "Faltantes por columna", "Co-ocurrencia", "MCAR"):
        assert any(fragment in heading for heading in heading_texts)

    # A summary KVTable, a ranking DataTable and a co-occurrence figure exist.
    seen_kinds = {kind for (kind, _) in entries}
    for expected_kind in ("kv_table", "data_table", "figure"):
        assert expected_kind in seen_kinds

    # Glossary terms registered.
    registered = {term["key"] for term in glossary.terms()}
    assert {"missingness", "mcar", "mar"} <= registered

    # With a perfect overlap the MCAR/MAR note must flag the non-random (MAR)
    # reading.
    assert "MAR" in _all_text(chapter)
+ titles = _titles(ch) + assert any("co-faltan" in (t or "").lower() for (k, t) in titles) diff --git a/python/functions/datascience/automatic_eda/chapters_registry.py b/python/functions/datascience/automatic_eda/chapters_registry.py index d9030999..d424b934 100644 --- a/python/functions/datascience/automatic_eda/chapters_registry.py +++ b/python/functions/datascience/automatic_eda/chapters_registry.py @@ -32,6 +32,7 @@ CHAPTER_ORDER = [ "num_distr", # numeric distributions "cat_distr", # categorical distributions "calidad", # data quality + "missingness", # missing-data patterns (co-occurrence of absences; MCAR/MAR) "correlacion", # correlations / associations "relaciones", # key relations: declared/candidate PK + FK (inter/intra-table) "modelos", # cheap models (PCA/KMeans/outliers) diff --git a/python/functions/datascience/extract_null_mask.md b/python/functions/datascience/extract_null_mask.md new file mode 100644 index 00000000..66510b2c --- /dev/null +++ b/python/functions/datascience/extract_null_mask.md @@ -0,0 +1,97 @@ +--- +name: extract_null_mask +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def extract_null_mask(query_fn, table: str, columns: list, max_rows: int = 5000) -> dict" +description: "Extrae la mascara de nulos (1=falta / 0=presente) de una muestra de filas de una tabla, una lista 0/1 por columna alineada por fila, para alimentar el capitulo de calidad / patron de nulos de AutomaticEDA sin que el capitulo toque la base de datos. Recibe un lector read-only inyectado `query_fn(sql) -> dict` (mismo contrato que duckdb_query_readonly / pg_query / el `_q` de profile_table) y NO abre ninguna conexion por su cuenta. Construye UNA sola query que proyecta por cada columna `CASE WHEN \"col\" IS NULL THEN 1 ELSE 0 END` con identificadores escapados y LIMIT. Devuelve dict dict-no-throw: columns (efectivamente leidas, en orden), mask (lista int 0/1 por columna, misma longitud todas) y n. 
Una celda None se cuenta defensivamente como 1 (falta)." +tags: [eda, nulls, missing, datascience, automatic-eda, extraction, read-only, duckdb, postgres, python] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [] +params: + - name: query_fn + desc: "callable lector read-only del backend activo. Recibe un string SQL y devuelve un dict {'status':'ok','rows':[{col:val,...},...]} (mismo contrato que duckdb_query_readonly o el `_q` de profile_table). NO se abre ninguna conexion dentro de la funcion: toda la lectura pasa por query_fn. Si es None -> error." + - name: table + desc: "nombre de la tabla de la que muestrear la mascara de nulos. Se escapa con comillas dobles en la query. Vacio o None -> status error." + - name: columns + desc: "lista de nombres de columna a evaluar. Cada una produce una entrada en `mask` con una lista 0/1 paralela por fila (1=IS NULL, 0=presente). Cada nombre se escapa con comillas dobles. Vacia o None -> status error." + - name: max_rows + desc: "limite de filas a muestrear (clausula LIMIT). Default 5000. Protege frente a tablas enormes; con LIMIT obtienes el primer tramo, no un muestreo uniforme." +output: "dict (nunca lanza). En exito: {'status':'ok','table':str,'columns':[str,...] (en orden),'mask':{col:[int 0/1,...],...} (1=falta/IS NULL, 0=presente; todas las listas con misma longitud = n),'n':int}. En error (sin lanzar): {'status':'error','error':str,'table':str,'columns':[],'mask':{},'n':0}. Errores: query_fn None, table vacia, columns vacia, o query_fn devuelve status!='ok' (se propaga su error)." 
+tested: true +tests: ["test_golden_mask_alineada", "test_celda_none_cuenta_como_falta", "test_columns_vacia_status_error", "test_query_fn_status_error_propaga", "test_query_fn_none_da_error_sin_reventar", "test_sql_contiene_case_y_limit"] +test_file_path: "python/functions/datascience/extract_null_mask_test.py" +file_path: "python/functions/datascience/extract_null_mask.py" +--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience.extract_null_mask import extract_null_mask +from infra import duckdb_query_readonly + +# El lector read-only se inyecta como closure (igual que el `_q` de profile_table). +db = "data/clientes.duckdb" +def _q(sql): + return duckdb_query_readonly(db, sql) + +res = extract_null_mask(_q, "clientes", ["email", "telefono", "edad"]) +# res == { +# "status": "ok", +# "table": "clientes", +# "columns": ["email", "telefono", "edad"], +# "mask": { +# "email": [0, 0, 1, 0, ...], # fila 2 sin email +# "telefono": [1, 0, 1, 0, ...], +# "edad": [0, 0, 0, 1, ...], +# }, +# "n": 5000, +# } + +# % de nulos por columna a partir de la muestra: +pct = {c: 100 * sum(bits) / max(res["n"], 1) for c, bits in res["mask"].items()} + +# Se entrega al capitulo de calidad sin que este toque la BD: +ctx = {"null_mask": res} +``` + +## Cuando usarla + +Cuando el capitulo de calidad / patron de nulos de AutomaticEDA necesita saber +DONDE faltan los valores (no solo cuantos) y NO debe abrir la base de datos por +su cuenta: extraes aqui la mascara 0/1 por columna alineada por fila y se la pasas +en `ctx['null_mask']`. Usala siempre que quieras detectar co-ocurrencia de nulos +(filas que fallan en varias columnas a la vez), calcular el % de nulos sobre una +muestra, o pintar un heatmap de missingness reutilizando un unico lector read-only +inyectado, en vez de hacer N `COUNT(*) WHERE col IS NULL` por separado. + +## Gotchas + +- **Impura**: lee de la base de datos a traves de `query_fn`. 
No abre conexiones + por su cuenta — depende por completo del lector inyectado. Sigue el estilo + dict-no-throw del grupo `eda`: nunca lanza; ante cualquier fallo devuelve + `{"status":"error","error":...}` con `columns=[]`, `mask={}`, `n=0`. +- **`error_type` en el frontmatter es `error_go_core` por convencion del registry** + (toda funcion impura debe declararlo y el indexer lo exige), pero el codigo + NO lanza esa excepcion: degrada al dict de error. Es metadata, no comportamiento. +- **Muestra, no censo**: con `LIMIT max_rows` obtienes el primer tramo de filas que + devuelva el backend, no un muestreo uniforme ni la tabla entera. El % de nulos + derivado es una estimacion sobre esa muestra; para el conteo exacto usa un + agregado `COUNT(*)`/`COUNT(col)` aparte. +- **Alineacion por fila**: `mask[col][i]` corresponde a la misma fila `i` que + `mask[otra_col][i]`. Todas las listas tienen longitud `n`, asi que puedes cruzar + columnas por indice (co-ocurrencia de nulos) sin re-alinear. +- **Defensa None -> 1**: el SQL ya devuelve 0/1, pero si una celda llega como `None` + (CASE no aplicado, columna ausente en la fila, backend que nulifica) se cuenta + como 1 (falta). Un valor inesperado no convertible a int se trata como presente (0). +- **No loguear los datos crudos**: aunque `mask` es solo 0/1, los nombres de columna + pueden revelar el esquema. En trazas usa `n` y el numero de columnas, no el dict + completo. diff --git a/python/functions/datascience/extract_null_mask.py b/python/functions/datascience/extract_null_mask.py new file mode 100644 index 00000000..4ed1e7e7 --- /dev/null +++ b/python/functions/datascience/extract_null_mask.py @@ -0,0 +1,101 @@ +"""extract_null_mask — extrae la mascara de nulos (1=falta / 0=presente) de una tabla. + +Lector read-only inyectado: recibe `query_fn(sql) -> dict` con el mismo contrato +que duckdb_query_readonly / pg_query (y que el `_q` de profile_table): +`{"status": "ok", "rows": [{col: val, ...}, ...]}`. 
def _to_bit(value):
    """Coerce one projected CASE cell to a 0/1 bit without ever raising.

    The generated SQL already yields 0 (present) or 1 (missing). ``None`` is
    counted as 1 (missing). Any other value is reduced through ``int()``:
    non-zero counts as 1, zero as 0. A value that cannot be converted is
    treated as present (0).
    """
    if value is None:
        return 1
    try:
        return 1 if int(value) != 0 else 0
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); like any other
        # non-convertible value it is treated as present instead of failing
        # the whole extraction.
        return 0


def _quote_ident(name):
    """Return ``name`` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def extract_null_mask(query_fn, table, columns, max_rows=5000):
    """Extract the null mask (1 = missing, 0 = present) for a sample of table rows.

    Builds a single query projecting, for every requested column,
    ``CASE WHEN <col> IS NULL THEN 1 ELSE 0 END`` aliased to the column name,
    limited to ``max_rows`` rows, and runs it through ``query_fn``. This
    function opens no connection itself.

    Args:
        query_fn: Callable taking a SQL string and returning a dict shaped like
            ``{"status": "ok", "rows": [{col: val, ...}, ...]}``. ``None``
            yields an error result.
        table: Table name. Quoted as one identifier (embedded double quotes
            are doubled). Empty or ``None`` yields an error result.
        columns: Column names to evaluate. Repeated names are evaluated once,
            keeping first-occurrence order, so every mask list has exactly one
            entry per row. Empty or ``None`` yields an error result.
        max_rows: Row limit placed in the ``LIMIT`` clause. Default 5000.

    Returns:
        A dict; this function never raises. On success::

            {"status": "ok", "table": table, "columns": [...],
             "mask": {col: [0/1, ...]}, "n": <rows read>}

        where every list in ``mask`` has length ``n``. On failure::

            {"status": "error", "error": <message>, "table": table,
             "columns": [], "mask": {}, "n": 0}
    """
    base = {"status": "ok", "table": table, "columns": [], "mask": {}, "n": 0}
    try:
        if query_fn is None:
            return {**base, "status": "error", "error": "query_fn es None"}
        if not table:
            return {**base, "status": "error", "error": "table es obligatorio"}
        if not columns:
            return {**base, "status": "error", "error": "columns vacío"}

        # De-duplicate while keeping order: a repeated name would otherwise be
        # appended twice per row into the same mask list (length 2n) and would
        # emit duplicate aliases in the SELECT.
        wanted = list(dict.fromkeys(columns))

        # Quote every identifier so names with capitals, spaces or reserved
        # words work, and double embedded quotes so a name containing '"'
        # cannot terminate the identifier early.
        select_sql = ", ".join(
            f"(CASE WHEN {_quote_ident(c)} IS NULL THEN 1 ELSE 0 END) AS {_quote_ident(c)}"
            for c in wanted
        )
        sql = f"SELECT {select_sql} FROM {_quote_ident(table)} LIMIT {int(max_rows)}"

        q = query_fn(sql)
        if not isinstance(q, dict) or q.get("status") != "ok":
            err = (
                q.get("error", "query_fn fallo")
                if isinstance(q, dict)
                else "query_fn no devolvio un dict"
            )
            return {**base, "status": "error", "error": err}

        rows = q.get("rows", []) or []
        mask = {c: [] for c in wanted}
        for row in rows:
            for c in wanted:
                # A row that is not a dict, or lacks the column, yields None,
                # which _to_bit counts as missing.
                mask[c].append(_to_bit(row.get(c) if isinstance(row, dict) else None))

        return {
            "status": "ok",
            "table": table,
            "columns": wanted,
            "mask": mask,
            "n": len(rows),
        }
    except Exception as e:  # noqa: BLE001 - dict-no-throw: degrade, never raise
        return {**base, "status": "error", "error": str(e)}
def test_celda_none_cuenta_como_falta():
    """A cell that arrives as None is defensively counted as 1 (missing)."""
    sample = [
        {"email": 0, "telefono": None},
        {"email": None, "telefono": 1},
        {"email": 1, "telefono": 0},
    ]
    outcome = extract_null_mask(_fake_query(sample), "clientes", ["email", "telefono"])
    assert outcome["status"] == "ok"
    bits = outcome["mask"]
    assert bits["email"] == [0, 1, 1]
    assert bits["telefono"] == [1, 1, 0]
    assert outcome["n"] == 3


def test_columns_vacia_status_error():
    """An empty column list yields status error with empty columns/mask and n == 0."""
    outcome = extract_null_mask(_fake_query([]), "clientes", [])
    assert outcome["status"] == "error"
    assert "columns" in outcome["error"]
    assert outcome["table"] == "clientes"
    # The payload fields are reset to their empty values on error.
    assert outcome["columns"] == []
    assert outcome["mask"] == {}
    assert outcome["n"] == 0


def test_query_fn_status_error_propaga():
    """A query_fn answering status != ok is propagated as an error with an empty mask."""
    failing_reader = _fake_query([], status="error", error="db locked")
    outcome = extract_null_mask(failing_reader, "clientes", ["email"])
    assert outcome["status"] == "error"
    assert "db locked" in outcome["error"]
    assert outcome["mask"] == {}
    assert outcome["n"] == 0
[] + assert res["mask"] == {} + assert res["n"] == 0 + + +def test_sql_contiene_case_y_limit(): + """La query genera un CASE WHEN IS NULL por columna escapada + LIMIT sobre la tabla.""" + captured = [] + rows = [{"email": 0}] + extract_null_mask( + _fake_query(rows, captured), + "clientes_tbl", + ["email"], + max_rows=123, + ) + assert len(captured) == 1 + sql = captured[0] + assert 'CASE WHEN "email" IS NULL THEN 1 ELSE 0 END' in sql + assert 'AS "email"' in sql + assert 'FROM "clientes_tbl"' in sql + assert "LIMIT 123" in sql diff --git a/python/functions/datascience/missingness_corr_heatmap_figure.md b/python/functions/datascience/missingness_corr_heatmap_figure.md new file mode 100644 index 00000000..595b72f8 --- /dev/null +++ b/python/functions/datascience/missingness_corr_heatmap_figure.md @@ -0,0 +1,103 @@ +--- +id: missingness_corr_heatmap_figure_py_datascience +name: missingness_corr_heatmap_figure +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def missingness_corr_heatmap_figure(matrix, labels, title=\"Co-ocurrencia de ausencias\") -> \"matplotlib.figure.Figure\"" +description: "Construye una figura matplotlib (heatmap) de la matriz NxN de correlación de ausencias entre columnas: +1 = dos columnas suelen ser nulas a la vez, -1 = cuando una falta la otra está presente, 0 = ausencias independientes. Usa ax.imshow con coolwarm fijado a [-1,1], ticks con los labels truncados (X rotados 45º), colorbar y anota el valor de cada celda si N<=12. Devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA (capítulo de datos faltantes). Backend Agg sin pyplot global; defensivo ante matrix/labels vacíos o celdas no numéricas (nunca lanza)." 
+tags: [eda, missing, missingness, correlation, heatmap, matplotlib, figure, visualization, datascience, impure] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [matplotlib] +example: | + from datascience.missingness_corr_heatmap_figure import missingness_corr_heatmap_figure + matrix = [ + [1.0, 0.82, -0.10], + [0.82, 1.0, 0.05], + [-0.10, 0.05, 1.0], + ] + labels = ["telefono", "movil", "email"] + fig = missingness_corr_heatmap_figure(matrix, labels, title="Co-ocurrencia de ausencias") +tested: true +tests: + - "test_returns_figure_with_axes" + - "test_empty_matrix_does_not_raise_and_returns_figure" + - "test_empty_labels_returns_message_figure" + - "test_large_matrix_omits_annotations" + - "test_ragged_and_non_numeric_cells_are_handled" +test_file_path: "python/functions/datascience/missingness_corr_heatmap_figure_test.py" +file_path: "python/functions/datascience/missingness_corr_heatmap_figure.py" +params: + - name: matrix + desc: "Lista de listas (NxN) de floats en [-1,1]: la correlación de ausencias por pares de columnas. Puede venir vacía. Filas de longitud desigual se toleran (se rellenan/recortan a N); celdas None, NaN o no numéricas se coercen a 0.0. No se muta el original." + - name: labels + desc: "Lista de N nombres de columna, paralela a matrix. Puede venir vacía (devuelve figura \"sin columnas con ausencia variable\"). Se truncan a ~14 chars con elipsis para los ticks; los originales no se mutan." + - name: title + desc: "Título de la figura. Se trunca a ~60 chars con elipsis si es muy largo. Default \"Co-ocurrencia de ausencias\"." +output: "Un matplotlib.figure.Figure (figsize 6.4x5.2, dpi 150) con un Axes heatmap (imshow vmin=-1, vmax=1, cmap coolwarm) más una colorbar etiquetada \"correlación de ausencias\". Ticks en ambos ejes con los labels truncados (X rotados 45º). 
Si N<=12 cada celda lleva su valor numérico anotado (texto blanco sobre celdas saturadas, oscuro sobre pálidas); con N grande se omiten las anotaciones para no saturar. Si matrix o labels vienen vacíos devuelve una Figure con texto centrado \"sin columnas con ausencia variable\"; cualquier error inesperado se captura y devuelve una Figure con el mensaje de error (nunca lanza). El caller rasteriza/cierra la figura; la función no la muestra ni la guarda." +--- + +## Ejemplo + +```python +from datascience.missingness_corr_heatmap_figure import missingness_corr_heatmap_figure + +# Correlación de ausencias entre 3 columnas de contacto: +# telefono y movil tienden a faltar juntos (0.82); email es casi independiente. +matrix = [ + [1.00, 0.82, -0.10], + [0.82, 1.00, 0.05], + [-0.10, 0.05, 1.00], +] +labels = ["telefono", "movil", "email"] + +fig = missingness_corr_heatmap_figure( + matrix, + labels, + title="Co-ocurrencia de ausencias", +) + +# El renderer del informe lo rasteriza; aquí solo persistimos para inspección. +fig.savefig("/tmp/missingness_heatmap.png") +``` + +## Cuando usarla + +Úsala en el capítulo de datos faltantes de un informe EDA cuando quieras ver de +un vistazo qué columnas faltan juntas (mismo formulario sin rellenar, mismo +proceso roto) frente a columnas cuyas ausencias son independientes. Pásale la +matriz de correlación de ausencias (calculada sobre la máscara de nulos, p. ej. +`df.isnull().corr()`) restringida a las columnas que de verdad tienen ausencia +variable, junto con sus nombres. Es la pareja "estructura" del ranking de % de +nulos: las barras dicen *cuánto* falta cada columna, este heatmap dice *si las +ausencias están relacionadas* entre columnas. + +## Gotchas + +- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg` + y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí, + para no tocar el estado global ni filtrar figuras entre llamadas. 
`pyplot` NO + es thread-safe; esta función evita ese riesgo construyendo el `Figure` + directamente, así que es segura de llamar en bucle desde el renderer. +- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo + guarda. Quien la consume debe rasterizarla y luego liberarla + (`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes. +- **Escala de color fija en [-1, 1].** `vmin=-1`, `vmax=1` están fijados a + propósito para que el color sea comparable entre informes y entre columnas. No + se autoescala al rango real de la matriz; valores fuera de `[-1, 1]` se + saturan al extremo del colormap. +- **Anotaciones solo con N<=12.** Por encima de 12 columnas el grid de números + se vuelve ilegible y se omite; queda solo el color + la colorbar. Filtra a las + columnas con ausencia variable antes de llamar para no llegar a matrices + enormes. +- **Defensiva, nunca lanza.** `matrix=[]`, `labels=[]`, filas cortas, celdas + `None`/`NaN`/no numéricas o cualquier error inesperado se manejan sin propagar: + en el peor caso devuelve una `Figure` con "sin columnas con ausencia variable" + o con el texto del error. No envuelvas la llamada en try/except por miedo a un + raise — no lo hay. diff --git a/python/functions/datascience/missingness_corr_heatmap_figure.py b/python/functions/datascience/missingness_corr_heatmap_figure.py new file mode 100644 index 00000000..4f5fd253 --- /dev/null +++ b/python/functions/datascience/missingness_corr_heatmap_figure.py @@ -0,0 +1,158 @@ +"""Impure EDA helper: heatmap of missingness co-occurrence (`eda` group). + +Builds a matplotlib heatmap of the pairwise missingness correlation matrix of a +dataset: a value near ``+1`` means two columns tend to be null together, near +``-1`` means when one is null the other tends to be present, and ``0`` means +their absences are independent. Returns a ready-to-rasterize +``matplotlib.figure.Figure``; it never shows nor saves it. 
def _truncate(text, width: int = 14) -> str:
    """Shorten ``text`` to at most ``width`` characters, ending in an ellipsis when cut.

    ``None`` becomes the empty string; any other value goes through ``str()``.
    """
    label = str(text) if text is not None else ""
    if len(label) <= width:
        return label
    # With a width of one or less there is no room for text plus an ellipsis,
    # so the string is hard-cut instead.
    return label[:width] if width <= 1 else label[: width - 1] + "…"


def _message_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
    """Build a fallback ``Figure`` that shows only ``message``, centered, with axes hidden."""
    fig = Figure(figsize=(6.4, 4.0), dpi=150)
    ax = fig.add_subplot(111)
    ax.axis("off")
    text_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 12,
        "color": color,
        "wrap": True,
        "transform": ax.transAxes,
    }
    ax.text(0.5, 0.5, message, **text_style)
    fig.tight_layout()
    return fig


def missingness_corr_heatmap_figure(
    matrix,
    labels,
    title: str = "Co-ocurrencia de ausencias",
) -> "matplotlib.figure.Figure":
    """Build a heatmap figure of a missingness correlation matrix.

    The matrix is drawn with ``imshow`` using the ``coolwarm`` colormap and a
    color scale fixed to ``vmin=-1`` / ``vmax=1``. Both axes carry the column
    names truncated to 14 characters (X labels rotated 45 degrees) and a
    colorbar is attached. With 12 labels or fewer every cell is annotated with
    its value to two decimals; larger grids are left unannotated.

    Args:
        matrix: List/tuple of rows holding the pairwise missingness
            correlations. The drawn grid is always ``N x N`` with
            ``N = len(labels)``: missing rows or cells are filled with 0.0,
            extra ones are ignored, a row that is not a list/tuple counts as
            empty, and cells that are ``None``, NaN or not convertible to
            float become 0.0. The input is not mutated.
        labels: List/tuple of column names, one per row/column of the grid.
        title: Figure title, truncated to 60 characters. A falsy title draws
            no title.

    Returns:
        A ``matplotlib.figure.Figure``. When ``matrix`` or ``labels`` is empty
        or is not a list/tuple, the figure only carries the message
        "sin columnas con ausencia variable". An exception raised while
        drawing is caught and a figure carrying the error text is returned.
        The figure is neither shown nor saved here.
    """
    try:
        inputs_are_sequences = isinstance(matrix, (list, tuple)) and isinstance(
            labels, (list, tuple)
        )
        if not inputs_are_sequences or not matrix or not labels:
            return _message_figure("sin columnas con ausencia variable")

        size = len(labels)

        def clean_row(index):
            """Return row ``index`` as exactly ``size`` finite floats."""
            source = matrix[index] if index < len(matrix) else []
            if not isinstance(source, (list, tuple)):
                source = []
            cleaned = []
            for col in range(size):
                raw = source[col] if col < len(source) else 0.0
                try:
                    number = float(raw)
                except (TypeError, ValueError):
                    number = 0.0
                # NaN is the only float that differs from itself.
                cleaned.append(0.0 if number != number else number)
            return cleaned

        # Padding/clipping every row keeps a ragged input from reaching imshow.
        grid = [clean_row(i) for i in range(size)]

        fig = Figure(figsize=(6.4, 5.2), dpi=150)
        ax = fig.add_subplot(111)

        image = ax.imshow(grid, vmin=-1, vmax=1, cmap="coolwarm", aspect="equal")

        tick_positions = range(size)
        tick_labels = [_truncate(name, 14) for name in labels]
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(tick_labels, rotation=45, ha="right", fontsize=8)
        ax.set_yticklabels(tick_labels, fontsize=8)

        # Per-cell numbers are only drawn while the grid is small enough to
        # stay legible.
        if size <= 12:
            for row_idx, row_values in enumerate(grid):
                for col_idx, value in enumerate(row_values):
                    # White text on strongly colored cells, dark text on pale ones.
                    ink = "white" if abs(value) >= 0.55 else "#202020"
                    ax.text(
                        col_idx,
                        row_idx,
                        f"{value:.2f}",
                        ha="center",
                        va="center",
                        fontsize=7,
                        color=ink,
                    )

        colorbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
        colorbar.ax.tick_params(labelsize=8)
        colorbar.set_label("correlación de ausencias", fontsize=8)

        if title:
            ax.set_title(_truncate(title, 60), fontsize=12, loc="center", pad=10)

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _message_figure(f"error al dibujar heatmap: {exc}", color=_ERROR_TEXT)
+""" + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt # noqa: E402 +from matplotlib.figure import Figure # noqa: E402 + +from missingness_corr_heatmap_figure import missingness_corr_heatmap_figure + + +def _identity_matrix(n): + """Matriz NxN con diagonal 1.0 y resto 0.0 (correlación de ausencias).""" + return [[1.0 if i == j else 0.0 for j in range(n)] for i in range(n)] + + +def test_returns_figure_with_axes(): + matrix = [[1.0, 0.3, -0.2], [0.3, 1.0, 0.5], [-0.2, 0.5, 1.0]] + labels = ["edad", "ingresos", "ciudad"] + fig = missingness_corr_heatmap_figure(matrix, labels, title="ausencias") + assert isinstance(fig, Figure) + # Heatmap (>=1 axes) + colorbar añade su propio Axes -> al menos 1. + assert len(fig.axes) >= 1 + plt.close(fig) + + +def test_empty_matrix_does_not_raise_and_returns_figure(): + fig = missingness_corr_heatmap_figure([], [], title="vacía") + assert isinstance(fig, Figure) + assert len(fig.axes) >= 1 + plt.close(fig) + + +def test_empty_labels_returns_message_figure(): + fig = missingness_corr_heatmap_figure([[1.0]], [], title="sin labels") + assert isinstance(fig, Figure) + plt.close(fig) + + +def test_large_matrix_omits_annotations(): + n = 16 + fig = missingness_corr_heatmap_figure( + _identity_matrix(n), [f"col_{i}" for i in range(n)] + ) + assert isinstance(fig, Figure) + assert len(fig.axes) >= 1 + plt.close(fig) + + +def test_ragged_and_non_numeric_cells_are_handled(): + # Fila corta + celda None + celda string -> se rellenan/coercen sin lanzar. 
+ matrix = [[1.0, None], ["x", 1.0, 0.5]] + labels = ["a", "b"] + fig = missingness_corr_heatmap_figure(matrix, labels) + assert isinstance(fig, Figure) + plt.close(fig) diff --git a/python/functions/datascience/missingness_correlation.md b/python/functions/datascience/missingness_correlation.md new file mode 100644 index 00000000..337cdbb1 --- /dev/null +++ b/python/functions/datascience/missingness_correlation.md @@ -0,0 +1,68 @@ +--- +name: missingness_correlation +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def missingness_correlation(null_mask: dict, top_k: int = 20) -> dict" +description: "Co-ocurrencia de ausencias: nucleo del capitulo de missingness del grupo eda. Recibe la mascara binaria de nulos de una tabla (1 = falta, 0 = presente, alineada por fila) y mide hasta que punto las columnas faltan juntas. Calcula la matriz de correlacion de Pearson entre los vectores binarios de ausencia de las columnas con varianza (al menos un 1 y un 0), mas las cifras de solapamiento de conjuntos por par (co-missing, either-missing, Jaccard). Excluye las columnas constantes en su ausencia (correlacion indefinida) y reporta cuantas. Compone la funcion atomica pearson del registry; no la reimplementa. Lectura defensiva; NUNCA lanza." +tags: [eda, missingness, correlation, pearson, co-occurrence, jaccard, datascience] +params: + - name: null_mask + desc: "dict {col: [int 0/1, ...]} con la mascara de ausencias de la tabla, alineada por fila: 1 = el valor falta en esa fila, 0 = presente. Todas las listas se asumen de la misma longitud (numero de filas). Valores truthy distintos de 0 se tratan como ausencia; entradas no-lista se ignoran sin romper." + - name: top_k + desc: "Numero maximo de pares a devolver en `pairs`, ordenados por valor absoluto de correlacion descendente. Default 20. Solo limita la lista de pares; la matriz cubre siempre todas las columnas con varianza." 
+output: "dict con: columns (columnas con varianza en la ausencia, en orden de entrada); matrix (len(columns) x len(columns) de correlacion de Pearson entre las mascaras binarias, diagonal 1.0); pairs (hasta top_k pares i constante, excluida +} +out = missingness_correlation(mask, top_k=10) + +print(out["columns"]) # ['ingresos', 'deducciones', 'telefono'] +print(out["n_excluded"]) # 1 +print(out["excluded_cols"]) # ['verificado'] + +# El par mas fuerte: ingresos y deducciones faltan siempre juntas. +top = out["pairs"][0] +print(top["a"], top["b"], round(top["corr"], 3)) # ingresos deducciones 1.0 +print(top["co_missing"], top["either_missing"], top["jaccard"]) # 3 3 1.0 +``` + +## Cuando usarla + +- Usala en el capitulo de **missingness** de `AutomaticEDA` cuando ya tengas la mascara binaria de nulos por columna y quieras detectar **patrones de ausencia conjunta**: que columnas faltan siempre juntas (posible misma fuente/proceso roto) y cuales faltan de forma independiente. +- Cuando necesites ordenar los pares de columnas por fuerza de co-ocurrencia (|corr|) para priorizar que bloques de ausencia investigar o imputar juntos. +- Cuando quieras la cifra de solapamiento de conjuntos (Jaccard, co-missing) ademas de la correlacion lineal, para distinguir "faltan juntas" de "estan presentes juntas". +- Antes de elegir una estrategia de imputacion: dos columnas con corr de ausencia ~1.0 no aportan informacion independiente sobre por que falta la otra. + +## Gotchas + +- Funcion pura, sin I/O y determinista. Lectura defensiva: entradas no-dict, columnas no-lista o vacias se ignoran sin lanzar. +- Solo entran al calculo las columnas con **varianza en la ausencia** (al menos un 1 y al menos un 0). Una columna siempre-presente (todo 0) no aporta ausencia y **no** se cuenta como excluida; una columna siempre-ausente o constante con nulos (todo 1) tiene correlacion indefinida y se excluye, sumando a `n_excluded` / `excluded_cols`. 
+- Con menos de 2 columnas con varianza, `columns`/`matrix`/`pairs` quedan vacios pero `n_excluded`/`excluded_cols` se rellenan igual — el caller debe contemplar el caso "sin pares". +- La correlacion es la de Pearson sobre vectores binarios (equivale al coeficiente phi). El signo importa: corr negativa = las ausencias tienden a ser **complementarias** (cuando una falta, la otra suele estar presente). +- Asume todas las listas alineadas por fila y de la misma longitud. Si vienen de longitudes distintas, `pearson` opera sobre el solapamiento que permita `zip` y degrada a 0.0 cuando no hay varianza efectiva; alinea la mascara antes de llamar. diff --git a/python/functions/datascience/missingness_correlation.py b/python/functions/datascience/missingness_correlation.py new file mode 100644 index 00000000..1141d7da --- /dev/null +++ b/python/functions/datascience/missingness_correlation.py @@ -0,0 +1,120 @@ +"""Co-ocurrencia de ausencias: matriz de correlacion de Pearson entre mascaras de nulos. + +Funcion pura del grupo eda, nucleo del capitulo de missingness. Recibe la mascara +binaria de ausencias de una tabla (1 = falta, 0 = presente, alineada por fila) y +mide hasta que punto las columnas faltan juntas. Para cada par de columnas con +varianza en su ausencia calcula la correlacion de Pearson entre los vectores +binarios, mas las cifras de solapamiento de conjuntos (co-missing, either-missing, +Jaccard). Compone la funcion atomica `pearson` del registry; no reimplementa la +correlacion. Lectura defensiva; NUNCA lanza. +""" + +from datascience import pearson + + +def missingness_correlation(null_mask, top_k=20) -> dict: + """Correlacion de co-ocurrencia de ausencias entre columnas. + + Args: + null_mask: dict {col: [int 0/1, ...]} alineado por fila (1 = el valor + falta en esa fila). Todas las listas se asumen de la misma longitud. + top_k: numero maximo de pares a devolver, ordenados por |corr| desc. 
+ + Returns: + dict con: + - columns: columnas con varianza en la ausencia (al menos un 1 y al + menos un 0), en orden de entrada. + - matrix: matriz len(columns) x len(columns) de correlacion de Pearson + entre las mascaras binarias, diagonal 1.0. + - pairs: lista de hasta top_k pares (i 0 and zeros > 0: + varying.append(col) + varying_vecs.append([float(v) for v in vec]) + elif ones > 0: + # Tiene nulos pero todos (constante en la ausencia): sin varianza. + excluded_cols.append(col) + # ones == 0 -> columna siempre presente, sin nulos: no se cuenta como + # excluida (no aporta ausencia al analisis de co-ocurrencia). + + result["n_excluded"] = len(excluded_cols) + result["excluded_cols"] = excluded_cols + + n = len(varying) + if n < 2: + return result + + result["columns"] = list(varying) + + # Matriz de correlacion de Pearson, diagonal 1.0. + matrix = [[0.0] * n for _ in range(n)] + for i in range(n): + matrix[i][i] = 1.0 + for i in range(n): + for j in range(i + 1, n): + r = pearson(varying_vecs[i], varying_vecs[j]) + matrix[i][j] = r + matrix[j][i] = r + result["matrix"] = matrix + + # Pares con cifras de solapamiento de conjuntos. 
+ pairs = [] + for i in range(n): + vi = varying_vecs[i] + for j in range(i + 1, n): + vj = varying_vecs[j] + co_missing = 0 + either_missing = 0 + for a, b in zip(vi, vj): + a_miss = a != 0.0 + b_miss = b != 0.0 + if a_miss and b_miss: + co_missing += 1 + if a_miss or b_miss: + either_missing += 1 + jaccard = co_missing / either_missing if either_missing > 0 else 0.0 + pairs.append({ + "a": varying[i], + "b": varying[j], + "corr": matrix[i][j], + "co_missing": co_missing, + "either_missing": either_missing, + "jaccard": jaccard, + }) + + pairs.sort(key=lambda p: abs(p["corr"]), reverse=True) + result["pairs"] = pairs[:top_k] if top_k is not None and top_k >= 0 else pairs + + return result diff --git a/python/functions/datascience/missingness_correlation_test.py b/python/functions/datascience/missingness_correlation_test.py new file mode 100644 index 00000000..dcae0a60 --- /dev/null +++ b/python/functions/datascience/missingness_correlation_test.py @@ -0,0 +1,115 @@ +"""Tests para missingness_correlation.""" + +from datascience.missingness_correlation import missingness_correlation + + +def test_co_ocurrencia_fuerte_corr_uno_jaccard_uno(): + # a y b faltan EXACTAMENTE en las mismas filas -> corr 1.0, jaccard 1.0. + mask = { + "a": [1, 0, 1, 0, 1, 0], + "b": [1, 0, 1, 0, 1, 0], + } + out = missingness_correlation(mask) + assert out["columns"] == ["a", "b"] + assert out["n_excluded"] == 0 + # Diagonal 1.0, off-diagonal ~1.0. + assert out["matrix"][0][0] == 1.0 + assert out["matrix"][1][1] == 1.0 + assert abs(out["matrix"][0][1] - 1.0) < 1e-9 + assert len(out["pairs"]) == 1 + pair = out["pairs"][0] + assert {pair["a"], pair["b"]} == {"a", "b"} + assert abs(pair["corr"] - 1.0) < 1e-9 + assert pair["co_missing"] == 3 # filas 0,2,4 + assert pair["either_missing"] == 3 # mismas filas + assert abs(pair["jaccard"] - 1.0) < 1e-9 + + +def test_ausencias_disjuntas_corr_negativa_jaccard_cero(): + # a y b nunca faltan en la misma fila -> co_missing 0, jaccard 0, corr <= 0. 
+ mask = { + "a": [1, 1, 0, 0], + "b": [0, 0, 1, 1], + } + out = missingness_correlation(mask) + assert out["columns"] == ["a", "b"] + pair = out["pairs"][0] + assert pair["co_missing"] == 0 + assert pair["either_missing"] == 4 + assert pair["jaccard"] == 0.0 + # Solapamiento nulo + ausencias complementarias -> correlacion negativa. + assert pair["corr"] < 0.0 + assert abs(pair["corr"] - out["matrix"][0][1]) < 1e-12 + + +def test_columna_sin_varianza_se_excluye(): + # c esta siempre presente (todo 0): no aporta ausencia -> no entra ni como + # excluida. d esta siempre ausente (todo 1): tiene nulos pero sin varianza + # -> excluida y n_excluded incrementa. a y b tienen varianza. + mask = { + "a": [1, 0, 1, 0], + "b": [1, 0, 0, 0], + "c": [0, 0, 0, 0], # siempre presente + "d": [1, 1, 1, 1], # siempre ausente, constante + } + out = missingness_correlation(mask) + assert out["columns"] == ["a", "b"] + assert "d" in out["excluded_cols"] + assert "c" not in out["excluded_cols"] + assert out["n_excluded"] == 1 + # Matriz solo de las columnas con varianza. + assert len(out["matrix"]) == 2 + assert len(out["matrix"][0]) == 2 + + +def test_menos_de_dos_columnas_con_varianza_vacio_pero_cuenta_excluidas(): + # Solo una columna con varianza (a) + una constante-ausente (d). + mask = { + "a": [1, 0, 1, 0], + "d": [1, 1, 1, 1], + } + out = missingness_correlation(mask) + assert out["columns"] == [] + assert out["matrix"] == [] + assert out["pairs"] == [] + assert out["n_excluded"] == 1 + assert out["excluded_cols"] == ["d"] + + +def test_mask_vacio_todo_vacio(): + out = missingness_correlation({}) + assert out == { + "columns": [], + "matrix": [], + "pairs": [], + "n_excluded": 0, + "excluded_cols": [], + } + + +def test_top_k_limita_pares(): + # 4 columnas con varianza -> 6 pares; top_k=2 deja 2. 
+ mask = { + "a": [1, 0, 1, 0, 0], + "b": [1, 0, 0, 1, 0], + "c": [0, 1, 1, 0, 1], + "d": [1, 1, 0, 0, 1], + } + out = missingness_correlation(mask, top_k=2) + assert len(out["columns"]) == 4 + assert len(out["pairs"]) == 2 + # Ordenados por |corr| desc. + assert abs(out["pairs"][0]["corr"]) >= abs(out["pairs"][1]["corr"]) + + +def test_no_lanza_con_entradas_raras(): + # Valores no-lista y no-dict no deben romper. + assert missingness_correlation(None)["columns"] == [] + mask = { + "a": [1, 0, 1, 0], + "b": [1, 0, 1, 0], + "bad": "not a list", + "empty": [], + } + out = missingness_correlation(mask) + assert out["columns"] == ["a", "b"] diff --git a/python/functions/datascience/missingness_overview.md b/python/functions/datascience/missingness_overview.md new file mode 100644 index 00000000..d33bb8ab --- /dev/null +++ b/python/functions/datascience/missingness_overview.md @@ -0,0 +1,99 @@ +--- +id: missingness_overview_py_datascience +name: missingness_overview +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: pure +signature: "def missingness_overview(null_mask) -> dict" +description: "Resumen de ausencias a nivel de dataset a partir de una máscara de nulos 0/1 por columna ({col: [1=falta, 0=presente]} alineada por fila). Calcula celdas y porcentaje de datos faltantes, cuántas columnas tienen algún nulo y cuántas filas son completas vs. incompletas. Estilo dict-no-throw del grupo eda: nunca lanza. Lectura defensiva — no-dict o dict vacío devuelve todo a 0; columnas no-lista se tratan como vacías; listas de longitud distinta se alinean a la longitud máxima rellenando la cola corta como presente (0); valores None/no-int cuentan como presente; sin ZeroDivisionError." 
+tags: [eda, missing, missingness, nulls, profiling, datascience, pure] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +example: | + from datascience.missingness_overview import missingness_overview + mask = { + "a": [1, 0, 0, 0, 1], + "b": [1, 0, 1, 0, 0], + "c": [0, 0, 0, 0, 1], + } + missingness_overview(mask) + # n_missing_cells=5, missing_cell_pct≈33.33, complete_rows=2, incomplete_rows=3 +tested: true +tests: + - "test_cooccurrence_three_cols_exact" + - "test_empty_dict_all_zero" + - "test_output_keys_contract" + - "test_not_a_dict_returns_zero" + - "test_no_nulls_all_complete" + - "test_none_values_treated_as_present" + - "test_unequal_lengths_pad_with_max" + - "test_columns_present_but_no_rows" + - "test_never_raises_on_garbage" +test_file_path: "python/functions/datascience/missingness_overview_test.py" +file_path: "python/functions/datascience/missingness_overview.py" +params: + - name: null_mask + desc: "Dict {col_name: [int 0/1, ...]} con la máscara de nulos por columna, alineada por fila (1 = el valor falta, 0 = el valor está presente). Normalmente todas las listas tienen la misma longitud = nº de filas. Lectura defensiva: si no es dict o está vacío se devuelve todo a 0; columnas cuyo valor no es lista/tupla se tratan como vacías; listas de longitud distinta se alinean a la longitud máxima (las posiciones inexistentes de las columnas más cortas cuentan como presentes, 0); valores None o no enteros cuentan como presentes." +output: "Dict con exactamente 9 claves, todas siempre presentes (la función nunca lanza): n_rows (longitud de fila = longitud máxima entre columnas, 0 si vacío), n_cols (nº de columnas), n_cols_with_null (columnas con >=1 falta), n_missing_cells (suma total de 1s), missing_cell_pct (0-100 = n_missing_cells / (n_rows*n_cols) * 100), complete_rows (filas sin ninguna falta), incomplete_rows (filas con >=1 falta), complete_pct (0-100), incomplete_pct (0-100). 
Los porcentajes son 0.0 cuando el denominador es 0 (sin ZeroDivisionError)." +--- + +## Ejemplo + +```python +from datascience.missingness_overview import missingness_overview + +# Máscara de nulos por columna: 1 = falta, 0 = presente, alineada por fila. +mask = { + "a": [1, 0, 0, 0, 1], + "b": [1, 0, 1, 0, 0], + "c": [0, 0, 0, 0, 1], +} + +missingness_overview(mask) +# { +# "n_rows": 5, +# "n_cols": 3, +# "n_cols_with_null": 3, # a, b y c tienen al menos una falta +# "n_missing_cells": 5, # 2 (a) + 2 (b) + 1 (c) +# "missing_cell_pct": 33.33, # 5 / (5*3) * 100 +# "complete_rows": 2, # filas 1 y 3 sin ninguna falta +# "incomplete_rows": 3, # filas 0 (a&b), 2 (b), 4 (a&c) +# "complete_pct": 40.0, # 2 / 5 * 100 +# "incomplete_pct": 60.0, # 3 / 5 * 100 +# } + +missingness_overview({}) +# Todo a 0: {"n_rows": 0, "n_cols": 0, "n_cols_with_null": 0, +# "n_missing_cells": 0, "missing_cell_pct": 0.0, +# "complete_rows": 0, "incomplete_rows": 0, +# "complete_pct": 0.0, "incomplete_pct": 0.0} +``` + +## Cuando usarla + +Úsala al perfilar un dataset cuando ya tienes una máscara de nulos 0/1 por +columna (p. ej. derivada del paso de carga/perfilado del EDA) y quieres la foto +global de ausencias en una llamada: cuánta proporción de celdas falta, cuántas +columnas están afectadas y, sobre todo, cuántas filas quedan completas vs. +incompletas. Es el bloque resumen del capítulo de calidad/missingness de un EDA, +y la base para decidir estrategias de imputación o de borrado de filas. Como es +pura y dict-no-throw, puedes alimentarla con la máscara tal cual sin validarla +antes: entradas malformadas degradan a ceros en vez de romper el pipeline. + +## Gotchas + +- **`n_rows` es la longitud máxima entre columnas.** Con listas de longitud + desigual, las posiciones que faltan en las columnas más cortas se cuentan como + presentes (`0`); no se descartan filas. En el caso normal (todas las listas de + igual longitud) `n_rows` es simplemente esa longitud. 
"""Pure EDA helper: dataset-level missingness overview from a 0/1 null mask.

Part of the `eda` capability group. Consumes a per-column null mask
(``{col_name: [int 0/1, ...]}`` aligned by row, ``1`` = value is missing,
``0`` = value is present) and derives dataset-wide missingness metrics: cell
count and percentage of missing data, how many columns carry any null, and how
many rows are complete vs. incomplete.

Dict-no-throw style of the `eda` group: it NEVER raises. A non-dict, an empty
dict, malformed columns, ragged lists or non-int cell values all degrade
gracefully to the zero/contract output. Stdlib only.

Ragged-length policy: columns are allowed to have different lengths. ``n_rows``
is the **maximum** column length; positions that don't exist in a shorter
column are treated as present (``0``). This keeps the ``n_rows * n_cols`` cell
grid well defined without dropping rows.
"""


def _is_missing(value) -> int:
    """Return ``1`` iff ``value`` denotes a missing cell, else ``0``.

    Only equality with ``1`` counts as missing, which covers ``int`` ``1``,
    ``float`` ``1.0`` and ``True``. ``None``, ``0``, strings and any other
    value are treated as present.
    """
    try:
        return 1 if value == 1 else 0
    except Exception:  # noqa: BLE001 - an exotic __eq__ must not break the summary.
        return 0


def missingness_overview(null_mask) -> dict:
    """Summarize dataset-level missingness from a 0/1 null mask.

    Args:
        null_mask: Dict ``{col_name: [0/1, ...]}`` where each list is aligned
            by row (``1`` = missing, ``0`` = present). A non-dict or empty dict
            yields the all-zero result. Values that are not a list/tuple are
            treated as empty columns (they still count towards ``n_cols``).
            Ragged columns are aligned to the longest one; the missing tail of
            a shorter column counts as present.

    Returns:
        Dict with exactly these keys, always present: ``n_rows``, ``n_cols``,
        ``n_cols_with_null``, ``n_missing_cells``, ``missing_cell_pct``,
        ``complete_rows``, ``incomplete_rows``, ``complete_pct``,
        ``incomplete_pct``. Percentages are on a 0-100 scale and are ``0.0``
        whenever their denominator is zero.
    """
    zero = {
        "n_rows": 0,
        "n_cols": 0,
        "n_cols_with_null": 0,
        "n_missing_cells": 0,
        "missing_cell_pct": 0.0,
        "complete_rows": 0,
        "incomplete_rows": 0,
        "complete_pct": 0.0,
        "incomplete_pct": 0.0,
    }

    if not isinstance(null_mask, dict) or not null_mask:
        return dict(zero)

    # Normalize every column to a sequence; non-list columns become empty.
    columns = [
        seq if isinstance(seq, (list, tuple)) else []
        for seq in null_mask.values()
    ]
    n_cols = len(columns)
    # ``columns`` is non-empty here because ``null_mask`` is a non-empty dict.
    n_rows = max(len(seq) for seq in columns)

    if n_rows == 0:
        # Columns exist but carry no rows: everything zero except n_cols.
        return {**zero, "n_cols": n_cols}

    n_missing_cells = 0
    n_cols_with_null = 0
    row_has_missing = [False] * n_rows

    for seq in columns:
        col_has_null = False
        # Only the cells a column actually has can be missing; the padded tail
        # of a shorter column counts as present, so it needs no visit.
        for row, cell in enumerate(seq):
            if _is_missing(cell):
                n_missing_cells += 1
                row_has_missing[row] = True
                col_has_null = True
        if col_has_null:
            n_cols_with_null += 1

    incomplete_rows = sum(row_has_missing)
    complete_rows = n_rows - incomplete_rows

    total_cells = n_rows * n_cols
    missing_cell_pct = (n_missing_cells / total_cells * 100.0) if total_cells else 0.0
    # n_rows > 0 is guaranteed by the early return above.
    complete_pct = complete_rows / n_rows * 100.0
    incomplete_pct = incomplete_rows / n_rows * 100.0

    return {
        "n_rows": n_rows,
        "n_cols": n_cols,
        "n_cols_with_null": n_cols_with_null,
        "n_missing_cells": n_missing_cells,
        "missing_cell_pct": missing_cell_pct,
        "complete_rows": complete_rows,
        "incomplete_rows": incomplete_rows,
        "complete_pct": complete_pct,
        "incomplete_pct": incomplete_pct,
    }
Hand-computed expectations: + # col a missing at rows 0, 4 -> 2 + # col b missing at rows 0, 2 -> 2 + # col c missing at row 4 -> 1 + # n_missing_cells = 5, total_cells = 5*3 = 15 -> 33.333...% + # row 0 (a&b co-occur) -> incomplete + # row 1 (all present) -> complete + # row 2 (b only) -> incomplete + # row 3 (all present) -> complete + # row 4 (a&c co-occur) -> incomplete + mask = { + "a": [1, 0, 0, 0, 1], + "b": [1, 0, 1, 0, 0], + "c": [0, 0, 0, 0, 1], + } + out = missingness_overview(mask) + assert out["n_rows"] == 5 + assert out["n_cols"] == 3 + assert out["n_cols_with_null"] == 3 + assert out["n_missing_cells"] == 5 + assert out["missing_cell_pct"] == pytest.approx(33.33333333, abs=1e-6) + assert out["complete_rows"] == 2 + assert out["incomplete_rows"] == 3 + assert out["complete_pct"] == pytest.approx(40.0) + assert out["incomplete_pct"] == pytest.approx(60.0) + + +def test_empty_dict_all_zero(): + out = missingness_overview({}) + assert out == { + "n_rows": 0, + "n_cols": 0, + "n_cols_with_null": 0, + "n_missing_cells": 0, + "missing_cell_pct": 0.0, + "complete_rows": 0, + "incomplete_rows": 0, + "complete_pct": 0.0, + "incomplete_pct": 0.0, + } + + +def test_output_keys_contract(): + # The 9-key contract holds even for the garbage/zero path. 
+ assert set(missingness_overview({}).keys()) == EXPECTED_KEYS + assert set(missingness_overview({"a": [1, 0]}).keys()) == EXPECTED_KEYS + + +def test_not_a_dict_returns_zero(): + for bad in (None, [1, 0, 1], 42, "nope", 3.14): + out = missingness_overview(bad) + assert out["n_rows"] == 0 + assert out["n_cols"] == 0 + assert out["n_missing_cells"] == 0 + assert out["missing_cell_pct"] == 0.0 + + +def test_no_nulls_all_complete(): + mask = {"a": [0, 0, 0], "b": [0, 0, 0]} + out = missingness_overview(mask) + assert out["n_rows"] == 3 + assert out["n_cols"] == 2 + assert out["n_cols_with_null"] == 0 + assert out["n_missing_cells"] == 0 + assert out["missing_cell_pct"] == 0.0 + assert out["complete_rows"] == 3 + assert out["incomplete_rows"] == 0 + assert out["complete_pct"] == pytest.approx(100.0) + assert out["incomplete_pct"] == pytest.approx(0.0) + + +def test_none_values_treated_as_present(): + # None and other non-1 values count as present (0). + mask = {"a": [None, 1, None, "x", 0]} + out = missingness_overview(mask) + assert out["n_rows"] == 5 + assert out["n_cols"] == 1 + assert out["n_missing_cells"] == 1 # only the explicit 1 at row 1 + assert out["n_cols_with_null"] == 1 + assert out["complete_rows"] == 4 + assert out["incomplete_rows"] == 1 + + +def test_unequal_lengths_pad_with_max(): + # Ragged lists: n_rows = max length; shorter column padded as present. 
+ # a = [1, 1] -> missing at rows 0, 1 + # b = [0] -> row 1 padded to present + # n_rows = 2, n_cols = 2, total_cells = 4, n_missing_cells = 2 -> 50% + mask = {"a": [1, 1], "b": [0]} + out = missingness_overview(mask) + assert out["n_rows"] == 2 + assert out["n_cols"] == 2 + assert out["n_cols_with_null"] == 1 + assert out["n_missing_cells"] == 2 + assert out["missing_cell_pct"] == pytest.approx(50.0) + assert out["complete_rows"] == 0 + assert out["incomplete_rows"] == 2 + assert out["incomplete_pct"] == pytest.approx(100.0) + + +def test_columns_present_but_no_rows(): + # Columns exist but all empty -> zero metrics, n_cols preserved. + out = missingness_overview({"a": [], "b": []}) + assert out["n_rows"] == 0 + assert out["n_cols"] == 2 + assert out["n_missing_cells"] == 0 + assert out["missing_cell_pct"] == 0.0 + assert out["complete_pct"] == 0.0 + + +def test_never_raises_on_garbage(): + # Non-list column values, mixed junk -> must not raise. + mask = {"a": "not a list", "b": 123, "c": [1, 0, 1]} + out = missingness_overview(mask) + assert set(out.keys()) == EXPECTED_KEYS + assert out["n_rows"] == 3 + assert out["n_cols"] == 3 + assert out["n_missing_cells"] == 2 # only col c contributes + assert out["n_cols_with_null"] == 1 diff --git a/python/functions/datascience/missingness_rank_bar_figure.md b/python/functions/datascience/missingness_rank_bar_figure.md new file mode 100644 index 00000000..c75f4613 --- /dev/null +++ b/python/functions/datascience/missingness_rank_bar_figure.md @@ -0,0 +1,93 @@ +--- +id: missingness_rank_bar_figure_py_datascience +name: missingness_rank_bar_figure +kind: function +lang: py +domain: datascience +version: "1.0.0" +purity: impure +signature: "def missingness_rank_bar_figure(names, pcts, title=\"% de valores faltantes por columna\") -> \"matplotlib.figure.Figure\"" +description: "Construye una figura matplotlib de barras horizontales que ordena las columnas de un dataset por su porcentaje de valores faltantes (0-100), la mayor 
arriba, etiquetando cada barra con su NN.N% al final. Usa ax.barh, eje X fijo 0-100 y labels truncados a ~22 chars. Devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA (capítulo de datos faltantes). Backend Agg sin pyplot global; defensivo ante listas vacías, longitudes desiguales o valores no numéricos (nunca lanza)." +tags: [eda, missing, missingness, ranking, bar, barh, matplotlib, figure, visualization, datascience, impure] +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "error_go_core" +imports: [matplotlib] +example: | + from datascience.missingness_rank_bar_figure import missingness_rank_bar_figure + names = ["edad", "ingresos", "ciudad", "email"] + pcts = [12.5, 40.0, 3.2, 0.0] + fig = missingness_rank_bar_figure(names, pcts, title="% de valores faltantes por columna") +tested: true +tests: + - "test_returns_figure_with_axes" + - "test_sorted_descending_largest_on_top" + - "test_empty_lists_do_not_raise_and_returns_figure" + - "test_xlim_is_zero_to_hundred" + - "test_length_mismatch_and_non_numeric_are_handled" +test_file_path: "python/functions/datascience/missingness_rank_bar_figure_test.py" +file_path: "python/functions/datascience/missingness_rank_bar_figure.py" +params: + - name: names + desc: "Lista de nombres de columna. Puede venir vacía (devuelve figura \"sin datos faltantes\"). Los items se convierten a str y se truncan a ~22 chars con elipsis para las etiquetas del eje Y; los originales no se mutan." + - name: pcts + desc: "Lista paralela a names con el % de nulos en [0,100]. Valores None, NaN o no numéricos se coercen a 0.0 y los negativos se recortan a 0. Si len(names) != len(pcts) se recorta al menor de ambos para no romper." + - name: title + desc: "Título de la figura. Se trunca a ~60 chars con elipsis si es muy largo. Default \"% de valores faltantes por columna\"." 
+output: "Un matplotlib.figure.Figure (figsize 6.4 x alto adaptativo según nº de barras, dpi 150) con un Axes de barras horizontales (ax.barh) ordenadas por % descendente, la mayor arriba. Eje X fijado a [0,100] con label \"% faltante\", etiquetas del eje Y truncadas a ~22 chars, y cada barra anotada con su NN.N% al final. Si names o pcts vienen vacíos devuelve una Figure con texto centrado \"sin datos faltantes\"; cualquier error inesperado se captura y devuelve una Figure con el mensaje de error (nunca lanza). El caller rasteriza/cierra la figura; la función no la muestra ni la guarda." +--- + +## Ejemplo + +```python +from datascience.missingness_rank_bar_figure import missingness_rank_bar_figure + +# % de nulos por columna (p. ej. (df.isnull().mean() * 100). +names = ["edad", "ingresos", "ciudad", "email"] +pcts = [12.5, 40.0, 3.2, 0.0] + +fig = missingness_rank_bar_figure( + names, + pcts, + title="% de valores faltantes por columna", +) + +# ingresos (40.0%) queda arriba; email (0.0%) abajo. +# El renderer del informe lo rasteriza; aquí solo persistimos para inspección. +fig.savefig("/tmp/missingness_rank.png") +``` + +## Cuando usarla + +Úsala al abrir el capítulo de datos faltantes de un informe EDA para responder +"¿qué columnas están más incompletas?" de un vistazo. Pásale los nombres de +columna y el % de nulos de cada una (`(df.isnull().mean() * 100).round(1)`); la +función se encarga de ordenar de mayor a menor y poner la peor arriba. Es la +pareja "magnitud" del heatmap de co-ocurrencia: las barras dicen *cuánto* falta +en cada columna, el heatmap dice *si esas ausencias están relacionadas* entre +columnas. + +## Gotchas + +- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg` + y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí, + para no tocar el estado global ni filtrar figuras entre llamadas. 
`pyplot` NO + es thread-safe; esta función evita ese riesgo construyendo el `Figure` + directamente, así que es segura de llamar en bucle desde el renderer. +- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo + guarda. Quien la consume debe rasterizarla y luego liberarla + (`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes. +- **Espera porcentajes 0-100, no fracciones 0-1.** El eje X está fijado a + `[0, 100]`. Si pasas fracciones (`0.4` en vez de `40.0`) las barras saldrán + pegadas al origen. Multiplica por 100 antes de llamar. +- **Alto adaptativo.** La altura de la figura crece con el número de barras + (hasta un tope) para que reports con muchas columnas sigan legibles; aun así, + conviene filtrar a las columnas con algún nulo antes de llamar para no listar + decenas de barras a 0%. +- **Defensiva, nunca lanza.** Listas vacías, longitudes desiguales, valores + `None`/`NaN`/no numéricos o cualquier error inesperado se manejan sin propagar: + en el peor caso devuelve una `Figure` con "sin datos faltantes" o con el texto + del error. No envuelvas la llamada en try/except por miedo a un raise — no lo + hay. diff --git a/python/functions/datascience/missingness_rank_bar_figure.py b/python/functions/datascience/missingness_rank_bar_figure.py new file mode 100644 index 00000000..58643ff3 --- /dev/null +++ b/python/functions/datascience/missingness_rank_bar_figure.py @@ -0,0 +1,150 @@ +"""Impure EDA helper: ranked bar figure of missing-value share (`eda` group). + +Builds a horizontal bar chart ranking the columns of a dataset by their +percentage of missing values (0-100), largest at the top, each bar labelled with +its ``NN.N%`` at the end. Returns a ready-to-rasterize +``matplotlib.figure.Figure``; it never shows nor saves it. + +Impure because it touches matplotlib's rendering machinery. 
import matplotlib

matplotlib.use("Agg")

from matplotlib.figure import Figure  # noqa: E402

# Muted gray for secondary text (no-data / fallback messages).
_MUTED_TEXT = "#5f6b7a"
# Soft red for the error fallback message.
_ERROR_TEXT = "#b00020"
# Bar fill — a calm blue that reads well on white at report size.
_BAR_COLOR = "#4C72B0"


def _truncate(text, width: int = 22) -> str:
    """Return ``text`` as a string of at most ``width`` characters.

    ``None`` becomes the empty string. When the text has to be cut and there
    is room for it (``width > 1``), the last kept character is replaced by an
    ellipsis so the cut is visible.
    """
    rendered = str(text) if text is not None else ""
    if len(rendered) <= width:
        return rendered
    if width > 1:
        return rendered[: width - 1] + "…"
    # Too narrow to fit an ellipsis: plain hard cut.
    return rendered[:width]


def _message_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
    """Build a fallback ``Figure`` that shows only ``message``, centered."""
    fig = Figure(figsize=(6.4, 4.0), dpi=150)
    ax = fig.add_subplot(111)
    ax.axis("off")
    text_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 12,
        "color": color,
        "wrap": True,
        "transform": ax.transAxes,
    }
    ax.text(0.5, 0.5, message, **text_style)
    fig.tight_layout()
    return fig


def _coerce_pct(raw) -> float:
    """Turn one raw percentage into a non-negative float.

    Values ``float()`` rejects with ``TypeError``/``ValueError`` and NaN
    become ``0.0``; negatives are clamped to ``0.0``. Values above 100 are
    passed through unchanged.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return 0.0
    if value != value:  # NaN is the only float that differs from itself.
        return 0.0
    return max(0.0, value)


def missingness_rank_bar_figure(
    names,
    pcts,
    title: str = "% de valores faltantes por columna",
) -> "matplotlib.figure.Figure":
    """Draw a ranked horizontal bar chart of missing-value share per column.

    Names and percentages are paired positionally, ranked by percentage in
    descending order and drawn as horizontal bars with the largest share at
    the top. The X axis is fixed to ``[0, 100]``, every bar carries a
    ``NN.N%`` annotation at its end and Y tick labels are truncated to 22
    characters.

    Args:
        names: List or tuple of column names. Items are stringified and
            truncated for display only; the input is not mutated.
        pcts: List or tuple of percentages (0-100) parallel to ``names``.
            ``None``, non-numeric and NaN entries count as ``0.0`` and
            negatives are clamped to ``0``. Extra items on the longer
            sequence are ignored.
        title: Figure title, truncated to 60 characters. A falsy title
            draws no title.

    Returns:
        A ``matplotlib.figure.Figure`` with a single Axes. When either
        input is not a list/tuple or is empty, the figure carries the
        centered message "sin datos faltantes". Any exception raised while
        building the chart is turned into a figure carrying the error text.
        The figure is neither shown nor saved here.
    """
    try:
        inputs_ok = isinstance(names, (list, tuple)) and isinstance(
            pcts, (list, tuple)
        )
        if not inputs_ok or not names or not pcts:
            return _message_figure("sin datos faltantes")

        # zip() stops at the shorter sequence, so a length mismatch is harmless.
        ranked = [(name, _coerce_pct(pct)) for name, pct in zip(names, pcts)]
        if not ranked:
            return _message_figure("sin datos faltantes")

        # Stable descending sort, then flip: barh places index 0 at the
        # bottom, so the largest share must come last to end up on top.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        bottom_up = ranked[::-1]

        tick_labels = [_truncate(name, 22) for name, _ in bottom_up]
        widths = [pct for _, pct in bottom_up]
        rows = list(range(len(bottom_up)))

        # Grow the figure with the number of bars, within [2.4, 14.0] inches.
        fig_height = max(2.4, min(0.4 * len(bottom_up) + 1.2, 14.0))
        fig = Figure(figsize=(6.4, fig_height), dpi=150)
        ax = fig.add_subplot(111)

        ax.barh(rows, widths, color=_BAR_COLOR, edgecolor="white")
        ax.set_yticks(rows)
        ax.set_yticklabels(tick_labels, fontsize=8)
        ax.set_xlim(0, 100)
        ax.set_xlabel("% faltante", fontsize=9)

        for row, pct in zip(rows, widths):
            # Near the right edge the label is right-aligned and capped at
            # x=99 so it stays inside the axes.
            near_right_edge = pct >= 90
            ax.text(
                min(pct + 1.5, 99.0),
                row,
                f"{pct:.1f}%",
                va="center",
                ha="right" if near_right_edge else "left",
                fontsize=7,
                color="#202020",
            )

        if title:
            ax.set_title(_truncate(title, 60), fontsize=12, loc="left", pad=10)

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _message_figure(f"error al dibujar barras: {exc}", color=_ERROR_TEXT)
import matplotlib

matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402
from matplotlib.figure import Figure  # noqa: E402

from missingness_rank_bar_figure import missingness_rank_bar_figure


def _bar_widths(fig):
    """Return the widths of the bars drawn on the first Axes, bottom-up.

    ``barh`` adds one rectangle patch per bar, in plotting order. The
    fallback figures (no data / error) draw no bars, so an empty list here
    means the chart was NOT actually rendered.
    """
    return [patch.get_width() for patch in fig.axes[0].patches]


def test_returns_figure_with_axes():
    names = ["edad", "ingresos", "ciudad"]
    pcts = [12.5, 40.0, 3.2]
    fig = missingness_rank_bar_figure(names, pcts, title="faltantes")
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    # The fallbacks are also a Figure with one Axes, so additionally require
    # that the three bars and the title were really drawn.
    assert len(_bar_widths(fig)) == 3
    assert fig.axes[0].get_title(loc="left") == "faltantes"
    plt.close(fig)


def test_sorted_descending_largest_on_top():
    names = ["a", "b", "c"]
    pcts = [10.0, 50.0, 25.0]
    fig = missingness_rank_bar_figure(names, pcts)
    bars = fig.axes[0].patches
    # barh draws bottom-up, so the widths must be ascending in plot order...
    assert _bar_widths(fig) == [10.0, 25.0, 50.0]
    # ...and the widest bar (50, "b") must be the one with the largest y.
    top_bar = max(bars, key=lambda bar: bar.get_y())
    assert top_bar.get_width() == 50.0
    plt.close(fig)


def test_empty_lists_do_not_raise_and_returns_figure():
    fig = missingness_rank_bar_figure([], [], title="vacía")
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    # Must be the "no data" fallback specifically, not the error fallback.
    assert _bar_widths(fig) == []
    assert [t.get_text() for t in fig.axes[0].texts] == ["sin datos faltantes"]
    plt.close(fig)


def test_xlim_is_zero_to_hundred():
    fig = missingness_rank_bar_figure(["a"], [42.0])
    ax = fig.axes[0]
    assert ax.get_xlim() == (0.0, 100.0)
    assert _bar_widths(fig) == [42.0]
    plt.close(fig)


def test_length_mismatch_and_non_numeric_are_handled():
    # More names than pcts plus a None pct: zip trims to two pairs and None
    # is coerced to 0, so exactly two bars (0 and 30) must be drawn.
    names = ["a", "b", "c"]
    pcts = [None, 30.0]
    fig = missingness_rank_bar_figure(names, pcts)
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    assert _bar_widths(fig) == [0.0, 30.0]
    plt.close(fig)
+output: "Dict {n_rows: int (filas totales), n_patterns: int (patrones distintos, incluye el patron vacio = fila completa), complete_rows: int (filas con patron vacio, nada falta), patterns: lista del top_n ordenada por n_rows desc con [{missing_cols: [col,...] (vacio = fila completa), n_rows: int, pct: float 0-100 sobre n_rows total, redondeado a 2 decimales}]}. Para {} devuelve n_rows 0 y patterns []. NUNCA lanza." +uses_functions: [] +uses_types: [] +returns: [] +returns_optional: false +error_type: "" +imports: [] +tested: true +tests: ["test_patron_dominante_completas_singleton", "test_mask_vacio", "test_top_n_trunca_pero_cuenta_todos"] +test_file_path: "python/functions/datascience/missingness_row_patterns_test.py" +file_path: "python/functions/datascience/missingness_row_patterns.py" +--- + +## Ejemplo + +```python +import sys, os +sys.path.insert(0, os.path.join("python", "functions")) +from datascience.missingness_row_patterns import missingness_row_patterns + +# null_mask alineado por fila: 1 = la celda falta en esa fila. +null_mask = { + "A": [1, 1, 1, 1, 0, 0, 0, 0, 0, 0], + "B": [1, 1, 1, 1, 0, 0, 0, 0, 0, 0], + "C": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], +} +out = missingness_row_patterns(null_mask, top_n=10) +print(out["n_rows"], out["n_patterns"], out["complete_rows"]) # 10 3 5 +for p in out["patterns"]: + label = p["missing_cols"] or "(fila completa)" + print(label, p["n_rows"], p["pct"]) +# (fila completa) 5 50.0 +# ['A', 'B'] 4 40.0 +# ['C'] 1 10.0 +``` + +## Cuando usarla + +- Usala en el capitulo de calidad/ausencias de `AutomaticEDA` para mostrar la "matriz de patrones de missingno": en vez de pintar celda a celda, resume que combinaciones de columnas se quedan en blanco juntas y con que frecuencia. +- Cuando ya tengas el null_mask por columna (1=falta) y quieras detectar co-ausencia estructural ("A y B siempre faltan juntas") antes de decidir una imputacion o un drop conjunto de columnas. 
+- Cuando necesites una tabla compacta "patron -> nº filas -> pct" para un report o un grafico de barras de los patrones de ausencia mas comunes, separando ademas cuantas filas estan completas (`complete_rows`). + +## Gotchas + +- Funcion pura, sin I/O y determinista. Lectura defensiva: `{}` o un no-dict devuelven `n_rows` 0 con `patterns` []. NUNCA lanza. +- El patron vacio (fila completa, `missing_cols=[]`) SI cuenta como patron: aparece en `n_patterns` y puede aparecer en `patterns`. El consumidor lo etiqueta como "(fila completa)". +- `pct` es sobre `n_rows` total (0-100), redondeado a 2 decimales. La suma de los `pct` de TODOS los patrones es 100; si `top_n` trunca, los `pct` mostrados sumaran menos. +- Las columnas se ordenan por `str(col)` para canonizar cada patron, asi `{A,B}` y `{B,A}` colapsan al mismo patron `["A", "B"]`. +- Una celda cuenta como ausente solo si vale 1 (`int(cell) == 1`); 0, None y valores no numericos se tratan como presentes. +- Si las listas de columnas tienen longitudes distintas, `n_rows` es la mas larga y las posiciones fuera de rango de una columna corta cuentan como presentes (0). diff --git a/python/functions/datascience/missingness_row_patterns.py b/python/functions/datascience/missingness_row_patterns.py new file mode 100644 index 00000000..7bf034f8 --- /dev/null +++ b/python/functions/datascience/missingness_row_patterns.py @@ -0,0 +1,107 @@ +"""missingness_row_patterns — distinct per-row missingness patterns (missingno matrix style). + +Pure function: no I/O, deterministic, NEVER raises. Given a per-column null mask +aligned by row ({col: [0/1, ...]}, 1 = missing), it groups rows by their missing +"pattern" — the sorted tuple of column names that are missing in that row — and +counts how often each distinct pattern occurs. + +This mirrors the missingno matrix idea: instead of plotting per-cell nullity, it +collapses each row to the SET of columns it lacks, surfacing co-missing structure +(e.g. 
def _is_missing(cell) -> bool:
    """Return ``True`` only when ``cell`` is the mask value 1 (missing).

    The cell is coerced with ``int()``, so ``1``, ``True``, ``1.0`` and
    ``"1"`` all count as missing. Anything ``int()`` rejects — ``None``,
    non-numeric strings, NaN, +/-infinity, containers — is treated as
    present, as are 0 and every other number. Never raises for those inputs.
    """
    try:
        return int(cell) == 1
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / containers; ValueError: "abc" / NaN;
        # OverflowError: +/-infinity. None of them is a valid "1" flag.
        return False


def missingness_row_patterns(null_mask, top_n=10) -> dict:
    """Count distinct per-row missingness patterns from a column null mask.

    For each row, its pattern is the sorted tuple of column names missing in
    that row (the columns whose cell is 1). The frequency of each distinct
    pattern is counted, including the empty pattern (a complete row).

    Args:
        null_mask: Dict ``{col: [0/1, ...]}`` aligned by row, where 1 means
            the cell is missing in that row. Columns whose value is not a
            list/tuple are treated as empty. Columns of differing lengths
            are tolerated: ``n_rows`` is the longest one and out-of-range
            cells count as present. Cells that are not 1 (0, ``None``,
            non-numeric, NaN, infinity) count as present.
        top_n: Maximum number of patterns returned in ``patterns``, ranked
            by row count descending (tiebreak: fewer columns first, then
            column names). Negative values behave as 0; values that cannot
            be converted to ``int`` fall back to 10. The number of distinct
            patterns is always reported in full in ``n_patterns``.

    Returns:
        Dict with keys ``n_rows`` (total rows), ``n_patterns`` (distinct
        patterns, including the empty one), ``complete_rows`` (rows with
        nothing missing) and ``patterns``: a list of
        ``{"missing_cols": [col, ...], "n_rows": int, "pct": float}`` where
        ``pct`` is 0-100 over the total rows, rounded to 2 decimals, and
        ``missing_cols == []`` denotes a complete row. An empty or non-dict
        mask, or a mask with no rows, yields all-zero counts and
        ``patterns == []``.
    """
    empty = {"n_rows": 0, "n_patterns": 0, "complete_rows": 0, "patterns": []}
    if not isinstance(null_mask, dict) or not null_mask:
        return empty

    # Canonical column order (by stringified name) so that every row's
    # pattern tuple comes out already sorted.
    items = sorted(null_mask.items(), key=lambda kv: str(kv[0]))
    names = [str(col) for col, _ in items]
    columns = [
        cells if isinstance(cells, (list, tuple)) else [] for _, cells in items
    ]

    n_rows = max((len(cells) for cells in columns), default=0)
    if n_rows == 0:
        return empty

    # Defensive parsing of top_n; OverflowError covers float("inf").
    try:
        limit = int(top_n)
    except (TypeError, ValueError, OverflowError):
        limit = 10
    limit = max(limit, 0)

    counts: dict = {}
    for row in range(n_rows):
        pattern = tuple(
            name
            for name, cells in zip(names, columns)
            # Short columns: positions past their end count as present.
            if row < len(cells) and _is_missing(cells[row])
        )
        counts[pattern] = counts.get(pattern, 0) + 1

    # Rank: row count desc, then fewer columns first, then column names.
    ordered = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))

    patterns = [
        {
            "missing_cols": list(pattern),
            "n_rows": count,
            "pct": round(100.0 * count / n_rows, 2),
        }
        for pattern, count in ordered[:limit]
    ]

    return {
        "n_rows": n_rows,
        "n_patterns": len(counts),
        "complete_rows": counts.get((), 0),
        "patterns": patterns,
    }
def test_patron_dominante_completas_singleton():
    """Golden case: {A, B} co-missing in 4 rows, 5 complete rows, one {C} singleton."""
    # 10 rows: A and B are missing together in rows 0-3, rows 4-8 are
    # complete and row 9 only lacks C.
    mask = {
        "A": [1] * 4 + [0] * 6,
        "B": [1] * 4 + [0] * 6,
        "C": [0] * 9 + [1],
    }
    result = missingness_row_patterns(mask)

    assert set(result.keys()) == {"n_rows", "n_patterns", "complete_rows", "patterns"}
    assert result["n_rows"] == 10
    # Three distinct patterns: (A, B), () and (C,).
    assert result["n_patterns"] == 3
    assert result["complete_rows"] == 5

    # Ranked by row count desc; ties go to the pattern with fewer columns.
    expected = [
        ([], 5, 50.0),
        (["A", "B"], 4, 40.0),
        (["C"], 1, 10.0),
    ]
    ranked = result["patterns"]
    assert len(ranked) == 3
    for got, (cols, count, pct) in zip(ranked, expected):
        assert got["missing_cols"] == cols
        assert got["n_rows"] == count
        assert got["pct"] == pct

    # Output types.
    assert isinstance(result["n_rows"], int)
    assert isinstance(ranked[0]["pct"], float)


def test_mask_vacio():
    """An empty mask yields zero rows and no patterns, without raising."""
    zeroed = {
        "n_rows": 0,
        "n_patterns": 0,
        "complete_rows": 0,
        "patterns": [],
    }
    assert missingness_row_patterns({}) == zeroed
    # A non-dict input (None) degrades to the empty result as well.
    assert missingness_row_patterns(None)["n_rows"] == 0
    # Columns present but with empty lists -> no rows, no patterns.
    assert missingness_row_patterns({"A": [], "B": []})["patterns"] == []


def test_top_n_trunca_pero_cuenta_todos():
    """top_n caps `patterns`, while n_patterns still reports every distinct one."""
    # Row patterns, in order: () (A,) (A,) (B,) (A, B, C)
    mask = {
        "A": [0, 1, 1, 0, 1],
        "B": [0, 0, 0, 1, 1],
        "C": [0, 0, 0, 0, 1],
    }
    result = missingness_row_patterns(mask, top_n=2)

    assert result["n_rows"] == 5
    assert result["n_patterns"] == 4  # (), (A,), (B,), (A,B,C)
    assert result["complete_rows"] == 1

    # Only two patterns come back although four exist.
    top = result["patterns"]
    assert len(top) == 2
    # (A,) leads with 2 rows; among the 1-row patterns the empty one wins
    # the tiebreak because it has the fewest columns.
    leader, runner_up = top
    assert leader["missing_cols"] == ["A"]
    assert leader["n_rows"] == 2
    assert runner_up["missing_cols"] == []
    assert runner_up["n_rows"] == 1