7fa19d65db
Añade el capítulo `missingness` al motor AutomaticEDA, complemento natural de `calidad`: donde calidad reporta cuánto falta por columna, este capítulo analiza el PATRÓN de los nulos — dónde faltan y si las columnas faltan juntas (co-ocurrencia de ausencias), la señal que distingue MCAR de MAR antes de imputar. Capítulo (`chapters/missingness.py`), registrado en `chapters_registry.py` justo tras `calidad`: - Resumen global: % de celdas faltantes, columnas con nulos, filas completas vs incompletas. - Ranking por columna (tabla + barras horizontales). - Co-ocurrencia: correlación de las máscaras is-null entre columnas (heatmap + tabla de los pares que co-faltan, con co-faltantes y Jaccard). - Patrones de fila más frecuentes (estilo matriz de missingno). - Lectura MCAR/MAR exploratoria (heurística por correlación/solape de ausencias, no confirmatoria), que cita la evidencia concreta. - Términos de glosario clicables: missingness, MCAR, MAR. La máscara is-null por fila de TODAS las columnas (numéricas y categóricas) se construye con un push-down DuckDB sobre ctx['db_path']/table (mismo patrón que el capítulo agregación), con fallback a ctx['raw_numeric'] cuando no hay BD. Activa solo si la tabla tiene nulos; si no, devuelve None. Funciones nuevas del grupo `eda` (dominio datascience): - extract_null_mask (impura): máscara is-null por fila vía query_fn. - missingness_overview (pura): resumen global + filas completas/incompletas. - missingness_correlation (pura): correlación de ausencias + pares + Jaccard, reutiliza pearson. - missingness_row_patterns (pura): patrones de fila más comunes. - missingness_corr_heatmap_figure / missingness_rank_bar_figure (impuras): figuras. Verificado: EDA de titanic genera el capítulo en PDF + PPTX + MD con Cabin 77.1%, Age 19.9% y la co-ocurrencia Age↔Cabin (158 filas). Suite completa de AutomaticEDA + render_automatic_eda en verde (125 passed); tests por función y por capítulo; fn index sin error. 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
147 lines
4.6 KiB
Python
147 lines
4.6 KiB
Python
"""Tests para missingness_overview."""
|
|
|
|
import sys
|
|
import os
|
|
|
|
import pytest
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from missingness_overview import missingness_overview
|
|
|
|
|
|
# Output contract: every call returns exactly these 9 keys.
EXPECTED_KEYS = {
    # Table shape.
    "n_rows",
    "n_cols",
    # Column- and cell-level missingness.
    "n_cols_with_null",
    "n_missing_cells",
    "missing_cell_pct",
    # Row-level completeness (counts and percentages).
    "complete_rows",
    "incomplete_rows",
    "complete_pct",
    "incomplete_pct",
}
|
|
|
|
|
|
def test_cooccurrence_three_cols_exact():
    """Check every overview metric against hand-computed values (3 cols x 5 rows)."""
    # Hand-computed expectations (1 = missing, 0 = present):
    #   col a missing at rows 0, 4 -> 2
    #   col b missing at rows 0, 2 -> 2
    #   col c missing at row 4     -> 1
    #   n_missing_cells = 5, total_cells = 5 * 3 = 15 -> 33.333...%
    #   row 0 (a & b co-occur) -> incomplete
    #   row 1 (all present)    -> complete
    #   row 2 (b only)         -> incomplete
    #   row 3 (all present)    -> complete
    #   row 4 (a & c co-occur) -> incomplete
    mask = {
        "a": [1, 0, 0, 0, 1],
        "b": [1, 0, 1, 0, 0],
        "c": [0, 0, 0, 0, 1],
    }

    out = missingness_overview(mask)

    assert out["n_rows"] == 5
    assert out["n_cols"] == 3
    assert out["n_cols_with_null"] == 3
    assert out["n_missing_cells"] == 5
    assert out["missing_cell_pct"] == pytest.approx(33.33333333, abs=1e-6)
    assert out["complete_rows"] == 2
    assert out["incomplete_rows"] == 3
    assert out["complete_pct"] == pytest.approx(40.0)
    assert out["incomplete_pct"] == pytest.approx(60.0)
|
|
|
|
|
|
def test_empty_dict_all_zero():
    """An empty mask yields the full 9-key result with every metric at zero."""
    out = missingness_overview({})

    assert out == {
        "n_rows": 0,
        "n_cols": 0,
        "n_cols_with_null": 0,
        "n_missing_cells": 0,
        "missing_cell_pct": 0.0,
        "complete_rows": 0,
        "incomplete_rows": 0,
        "complete_pct": 0.0,
        "incomplete_pct": 0.0,
    }
|
|
|
|
|
|
def test_output_keys_contract():
    """The result exposes exactly EXPECTED_KEYS for empty and non-empty masks."""
    # The 9-key contract holds even for the garbage/zero path.
    assert set(missingness_overview({}).keys()) == EXPECTED_KEYS
    assert set(missingness_overview({"a": [1, 0]}).keys()) == EXPECTED_KEYS
|
|
|
|
|
|
def test_not_a_dict_returns_zero():
    """Non-dict inputs produce zeroed shape and missing-cell metrics."""
    for bad in (None, [1, 0, 1], 42, "nope", 3.14):
        out = missingness_overview(bad)
        # The messages name the offending input, since a bare assert inside
        # the loop would not say which value of `bad` failed.
        assert out["n_rows"] == 0, f"n_rows for input {bad!r}"
        assert out["n_cols"] == 0, f"n_cols for input {bad!r}"
        assert out["n_missing_cells"] == 0, f"n_missing_cells for input {bad!r}"
        assert out["missing_cell_pct"] == 0.0, f"missing_cell_pct for input {bad!r}"
|
|
|
|
|
|
def test_no_nulls_all_complete():
    """A mask with no missing cells reports every row as complete."""
    mask = {"a": [0, 0, 0], "b": [0, 0, 0]}

    out = missingness_overview(mask)

    assert out["n_rows"] == 3
    assert out["n_cols"] == 2
    assert out["n_cols_with_null"] == 0
    assert out["n_missing_cells"] == 0
    assert out["missing_cell_pct"] == 0.0
    assert out["complete_rows"] == 3
    assert out["incomplete_rows"] == 0
    assert out["complete_pct"] == pytest.approx(100.0)
    assert out["incomplete_pct"] == pytest.approx(0.0)
|
|
|
|
|
|
def test_none_values_treated_as_present():
    """Only an explicit 1 marks a cell as missing; any other value is present."""
    # None and other non-1 values count as present (0).
    mask = {"a": [None, 1, None, "x", 0]}

    out = missingness_overview(mask)

    assert out["n_rows"] == 5
    assert out["n_cols"] == 1
    assert out["n_missing_cells"] == 1  # only the explicit 1 at row 1
    assert out["n_cols_with_null"] == 1
    assert out["complete_rows"] == 4
    assert out["incomplete_rows"] == 1
|
|
|
|
|
|
def test_unequal_lengths_pad_with_max():
    """Ragged columns use the longest length as n_rows; short ones pad as present."""
    # Ragged lists: n_rows = max length; shorter column padded as present.
    #   a = [1, 1] -> missing at rows 0, 1
    #   b = [0]    -> row 1 padded to present
    #   n_rows = 2, n_cols = 2, total_cells = 4, n_missing_cells = 2 -> 50%
    mask = {"a": [1, 1], "b": [0]}

    out = missingness_overview(mask)

    assert out["n_rows"] == 2
    assert out["n_cols"] == 2
    assert out["n_cols_with_null"] == 1
    assert out["n_missing_cells"] == 2
    assert out["missing_cell_pct"] == pytest.approx(50.0)
    assert out["complete_rows"] == 0
    assert out["incomplete_rows"] == 2
    assert out["incomplete_pct"] == pytest.approx(100.0)
|
|
|
|
|
|
def test_columns_present_but_no_rows():
    """Empty columns keep n_cols but report zero rows and zero percentages."""
    # Columns exist but all empty -> zero metrics, n_cols preserved.
    out = missingness_overview({"a": [], "b": []})

    assert out["n_rows"] == 0
    assert out["n_cols"] == 2
    assert out["n_missing_cells"] == 0
    assert out["missing_cell_pct"] == 0.0
    assert out["complete_pct"] == 0.0
|
|
|
|
|
|
def test_never_raises_on_garbage():
    """Non-list column values must not raise and must not add missing cells."""
    # Non-list column values, mixed junk -> must not raise.
    mask = {"a": "not a list", "b": 123, "c": [1, 0, 1]}

    out = missingness_overview(mask)

    assert set(out.keys()) == EXPECTED_KEYS
    assert out["n_rows"] == 3
    assert out["n_cols"] == 3  # junk columns still count as columns
    assert out["n_missing_cells"] == 2  # only col c contributes
    assert out["n_cols_with_null"] == 1
|