7fa19d65db
Añade el capítulo `missingness` al motor AutomaticEDA, complemento natural de `calidad`: donde calidad reporta cuánto falta por columna, este capítulo analiza el PATRÓN de los nulos — dónde faltan y si las columnas faltan juntas (co-ocurrencia de ausencias), la señal que distingue MCAR de MAR antes de imputar. Capítulo (`chapters/missingness.py`), registrado en `chapters_registry.py` justo tras `calidad`: - Resumen global: % de celdas faltantes, columnas con nulos, filas completas vs incompletas. - Ranking por columna (tabla + barras horizontales). - Co-ocurrencia: correlación de las máscaras is-null entre columnas (heatmap + tabla de los pares que co-faltan, con co-faltantes y Jaccard). - Patrones de fila más frecuentes (estilo matriz de missingno). - Lectura MCAR/MAR exploratoria (heurística por correlación/solape de ausencias, no confirmatoria), que cita la evidencia concreta. - Términos de glosario clicables: missingness, MCAR, MAR. La máscara is-null por fila de TODAS las columnas (numéricas y categóricas) se construye con un push-down DuckDB sobre ctx['db_path']/table (mismo patrón que el capítulo agregación), con fallback a ctx['raw_numeric'] cuando no hay BD. Activa solo si la tabla tiene nulos; si no, devuelve None. Funciones nuevas del grupo `eda` (dominio datascience): - extract_null_mask (impura): máscara is-null por fila vía query_fn. - missingness_overview (pura): resumen global + filas completas/incompletas. - missingness_correlation (pura): correlación de ausencias + pares + Jaccard, reutiliza pearson. - missingness_row_patterns (pura): patrones de fila más comunes. - missingness_corr_heatmap_figure / missingness_rank_bar_figure (impuras): figuras. Verificado: EDA de titanic genera el capítulo en PDF + PPTX + MD con Cabin 77.1%, Age 19.9% y la co-ocurrencia Age↔Cabin (158 filas). Suite completa de AutomaticEDA + render_automatic_eda en verde (125 passed); tests por función y por capítulo; fn index sin error. 
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
156 lines
6.9 KiB
Python
156 lines
6.9 KiB
Python
"""Chapter registry — the canonical order of an AutomaticEDA document.
|
|
|
|
``CHAPTER_ORDER`` declares every chapter the engine will *ever* place, in the
|
|
order they appear in the document. Each id maps by convention to a module
|
|
``automatic_eda/chapters/<id>.py`` exposing ``build_<id>(profile, ctx) ->
|
|
Chapter | None`` and a ``CHAPTER_VERSION`` constant.
|
|
|
|
This pre-declared order is what lets many agents add chapters in parallel
|
|
without contention: an agent only creates its own ``chapters/<id>.py`` module —
|
|
it never edits this file. ``build_document`` imports each chapter lazily; a
|
|
chapter whose module does not exist yet (not implemented) is simply skipped, so
|
|
the document is always renderable with whatever chapters are present today.
|
|
|
|
``build_document`` never raises: a chapter that errors out is dropped with a
|
|
note, and a chapter that returns ``None`` (does not apply to this dataset, e.g.
|
|
time series on a dataset with no date column) is omitted.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
|
|
from . import model
|
|
|
|
# Canonical document order. Every id below is a slot in the document, resolved
# by convention to ``chapters/<id>.py``. A slot whose module cannot be imported
# yet is skipped by ``build_chapter`` and starts appearing at this exact
# position once the module exists, so which slots are filled today is
# deliberately not listed here (it would go stale).
CHAPTER_ORDER = [
    "portada",       # cover — BUILT AFTER THE BODY, PLACED FIRST (see build_document).
    "overview",      # df.head + columns/types/nulls/examples + describe
    "analisis_llm",  # LLM interpretation — sits next to overview (user request)
    "num_distr",     # numeric distributions
    "cat_distr",     # categorical distributions
    "calidad",       # data quality
    "missingness",   # missing-data patterns (co-occurrence of absences; MCAR/MAR)
    "correlacion",   # correlations / associations
    "relaciones",    # key relations: declared/candidate PK + FK (inter/intra-table)
    "modelos",       # cheap models (PCA/KMeans/outliers)
    "timeseries",    # time-series analysis
    "geospatial",    # geospatial
    "agregacion",    # aggregations / pivots
    "glosario",      # glossary — ALWAYS LAST; clickable term destinations.
]
|
|
|
|
# Chapter ids that build_document special-cases instead of building them in the
# body loop:
#   * portada is built after the body (so it can read ctx["document_summary"],
#     which summarizes the body) but is placed first in the returned list;
#   * glosario is built after portada and placed last (it reads the terms the
#     other chapters registered in the shared glossary collector).
_PORTADA = "portada"
_GLOSARIO = "glosario"
|
|
|
|
|
|
def build_chapter(chapter_id: str, profile: dict, ctx: dict):
    """Build a single chapter by id, or None if absent/not-applicable/error.

    Imports ``<this package>.chapters.<chapter_id>`` and calls its
    ``build_<chapter_id>(profile, ctx)``, then normalizes the result with
    ``model.as_chapter``.

    Args:
        chapter_id: chapter identifier; selects both the module name and the
            builder function name.
        profile: profile dict handed to the builder; a falsy value is replaced
            by a fresh ``{}``.
        ctx: context dict handed to the builder; a falsy value is replaced by a
            fresh ``{}``.

    Returns:
        The normalized chapter, or None when the module is missing or fails to
        import, the module exposes no ``build_<chapter_id>``, the builder
        returns None, or the builder / the normalization raises. Failures of
        the chapter are logged and never propagated.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)
    mod_name = f"{__package__}.chapters.{chapter_id}"

    try:
        mod = importlib.import_module(mod_name)
    except ModuleNotFoundError as exc:
        missing = exc.name or ""
        # The chapter module itself (or one of its parent packages) is absent:
        # that is the expected "not implemented yet" case, so stay quiet.
        if missing and (mod_name == missing
                        or mod_name.startswith(missing + ".")):
            log.debug("chapter %r not implemented (%s missing); skipped",
                      chapter_id, missing)
        else:
            # The chapter exists but one of ITS imports is missing: that is a
            # broken chapter, worth a visible note.
            log.warning("chapter %r could not be imported; skipped",
                        chapter_id, exc_info=True)
        return None
    except Exception:  # noqa: BLE001 — a broken module never aborts the doc.
        log.warning("chapter %r could not be imported; skipped",
                    chapter_id, exc_info=True)
        return None

    builder = getattr(mod, f"build_{chapter_id}", None)
    if builder is None:
        return None

    try:
        result = builder(profile or {}, ctx or {})
        if result is None:
            # The chapter does not apply to this dataset.
            return None
        # Normalization stays inside the guard so a malformed builder result
        # cannot break the "never raises" contract either.
        return model.as_chapter(result)
    except Exception:  # noqa: BLE001 — a broken chapter never aborts the doc.
        log.warning("chapter %r failed while building; dropped",
                    chapter_id, exc_info=True)
        return None
|
|
|
|
|
|
def build_document(profile: dict, ctx: dict = None) -> list:
    """Assemble the ordered chapter list for a TableProfile.

    Args:
        profile: the ``eda`` group TableProfile dict; anything that is not a
            dict is treated as an empty profile.
        ctx: optional context dict with presentation metadata that the profile
            does not carry (dataset_name, source_origin, storage, generated_at,
            description, granularity, quality_criteria, head_rows, ...). It is
            shallow-copied, so the keys added here stay out of the caller's
            dict.

    Returns:
        list of chapters: cover first (when it has blocks), then the body in
        ``CHAPTER_ORDER`` order, then the glossary (when it has blocks). Only
        chapters that were built and have blocks are included.
    """
    profile = profile if isinstance(profile, dict) else {}
    # Work on a private copy: the collector and the summary set below must not
    # leak back into the caller's ctx.
    ctx = dict(ctx) if isinstance(ctx, dict) else {}

    # One glossary collector is shared by all chapters through ctx['glossary'].
    # Chapters register terms with ctx['glossary'].add(key, label, definition)
    # and mark in-text occurrences with [[term:key]]…[[/term]]; the glosario
    # chapter renders the registered terms and the renderers wire the links.
    # A caller-supplied collector is reused; anything else is replaced.
    if not isinstance(ctx.get("glossary"), model.GlossaryCollector):
        ctx["glossary"] = model.GlossaryCollector()

    # 1) Body: all chapters except the two special-cased ones, built lazily in
    #    canonical order (this is also what fills the glossary collector).
    built = (
        build_chapter(cid, profile, ctx)
        for cid in CHAPTER_ORDER
        if cid not in (_PORTADA, _GLOSARIO)
    )
    body = [ch for ch in built if ch is not None and ch.blocks]

    # 2) Summary of the body, exposed to the cover through ctx (user decision:
    #    the cover is built after the body so it can reflect the findings).
    ctx["document_summary"] = _summarize_document(profile, body)

    # 3) Cover, built now but placed first.
    portada = build_chapter(_PORTADA, profile, ctx)
    # 4) Glossary, built after everything that registers terms, placed last.
    glosario = build_chapter(_GLOSARIO, profile, ctx)

    front = [portada] if portada is not None and portada.blocks else []
    back = [glosario] if glosario is not None and glosario.blocks else []
    return front + body + back
|
|
|
|
|
|
def _summarize_document(profile: dict, body: list) -> dict:
    """Condense the built body into a small findings dict for the cover.

    Best-effort and non-raising: if reading ``profile`` or ``body`` fails, the
    result collapses to ``{"n_chapters": ...}`` only.

    The full result holds the number and titles of the body chapters, the
    shape / quality fields copied as-is from ``profile``, and two column
    counts: columns whose ``inferred_type`` is ``"numeric"``, and non-numeric
    columns that carry a non-empty ``categorical["top"]`` — enough for the
    cover to show a mini-summary without re-deriving anything.
    """

    def _is_numeric(col):
        return isinstance(col, dict) and col.get("inferred_type") == "numeric"

    def _is_categorical(col):
        if not isinstance(col, dict):
            return False
        cat = col.get("categorical")
        if not isinstance(cat, dict) or not cat.get("top"):
            return False
        return col.get("inferred_type") != "numeric"

    try:
        columns = profile.get("columns") or []
        numeric_total = len([c for c in columns if _is_numeric(c)])
        categorical_total = len([c for c in columns if _is_categorical(c)])

        summary = {
            "n_chapters": len(body),
            "chapter_titles": [getattr(ch, "title", "") for ch in body],
        }
        for key in ("n_rows", "n_cols", "quality_score"):
            summary[key] = profile.get(key)
        summary["n_numeric"] = numeric_total
        summary["n_categorical"] = categorical_total
        for key in ("duplicate_pct", "null_cell_pct"):
            summary[key] = profile.get(key)
        return summary
    except Exception:  # noqa: BLE001 — the summary is best-effort.
        return {"n_chapters": len(body) if isinstance(body, list) else 0}
|