Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6f88f184f1 | |||
| e5abc18211 | |||
| 9da1ee6533 | |||
| 5d4a48ec5e | |||
| 7fa19d65db | |||
| 105e56cf05 | |||
| eaca41a532 | |||
| e815f5b3b9 | |||
| 7ec2bb1b45 | |||
| a1e2e3567c | |||
| 833597c831 | |||
| 7158be8142 | |||
| 9be84a48ea | |||
| fd63261444 | |||
| 4099d88eaf | |||
| 48de3ce3da | |||
| ab21e5d90b | |||
| da60211826 | |||
| aa5aa67d50 | |||
| 68f4ddabce | |||
| 43821ab11d | |||
| 32054ad781 | |||
| a2074a0167 | |||
| d001d90306 | |||
| 7045f37554 | |||
| fa8db01059 | |||
| 048781df3f | |||
| a421f13d2e | |||
| 13c82be780 |
@@ -34,6 +34,7 @@ from .theils_u import theils_u
|
||||
from .correlation_ratio import correlation_ratio
|
||||
from .mutual_info_columns import mutual_info_columns
|
||||
from .infer_fk_containment_duckdb import infer_fk_containment_duckdb
|
||||
from .detect_declared_keys_duckdb import detect_declared_keys_duckdb
|
||||
from .build_join_graph import build_join_graph
|
||||
from .association_matrix import association_matrix
|
||||
from .correlation_matrix_duckdb import correlation_matrix_duckdb
|
||||
@@ -63,14 +64,17 @@ from .exploratory_caveats import exploratory_caveats
|
||||
from .render_eda_pdf import render_eda_pdf, render_eda_pdf_relational
|
||||
from .render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from .render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
from .render_automatic_eda_markdown import render_automatic_eda_markdown
|
||||
from .detect_time_column import detect_time_column
|
||||
from .extract_timeseries_raw import extract_timeseries_raw
|
||||
from .build_eda_render_ctx import build_eda_render_ctx
|
||||
from .profile_datetime import profile_datetime
|
||||
from .resample_timeseries import resample_timeseries
|
||||
from .add_pdf_internal_links import add_pdf_internal_links
|
||||
from .suggest_intratable_fk_candidates import suggest_intratable_fk_candidates
|
||||
|
||||
__all__ = [
|
||||
"suggest_intratable_fk_candidates",
|
||||
"detect_time_column",
|
||||
"extract_timeseries_raw",
|
||||
"build_eda_render_ctx",
|
||||
@@ -79,6 +83,7 @@ __all__ = [
|
||||
"resample_timeseries",
|
||||
"render_automatic_eda_pdf",
|
||||
"render_automatic_eda_pptx",
|
||||
"render_automatic_eda_markdown",
|
||||
"decode_qr_image",
|
||||
"adf_kpss_stationarity",
|
||||
"acf_pacf",
|
||||
@@ -97,6 +102,7 @@ __all__ = [
|
||||
"correlation_ratio",
|
||||
"mutual_info_columns",
|
||||
"infer_fk_containment_duckdb",
|
||||
"detect_declared_keys_duckdb",
|
||||
"build_join_graph",
|
||||
"association_matrix",
|
||||
"correlation_matrix_duckdb",
|
||||
|
||||
@@ -36,6 +36,7 @@ from .model import ( # noqa: F401
|
||||
from .chapters_registry import CHAPTER_ORDER, build_chapter, build_document # noqa: F401
|
||||
from .render_pdf_impl import render_pdf # noqa: F401
|
||||
from .render_pptx_impl import render_pptx # noqa: F401
|
||||
from .render_md_impl import render_md # noqa: F401
|
||||
|
||||
__all__ = [
|
||||
"ENGINE_NAME",
|
||||
@@ -60,4 +61,5 @@ __all__ = [
|
||||
"build_document",
|
||||
"render_pdf",
|
||||
"render_pptx",
|
||||
"render_md",
|
||||
]
|
||||
|
||||
@@ -89,6 +89,35 @@ _DEF_MAX_CARD = 20
|
||||
_DEF_MAX_MEASURES = 4
|
||||
_DEF_TOP_N = 12
|
||||
|
||||
# Glossary terms this chapter explains. Both appear in the always-rendered intro,
|
||||
# so they are registered and marked clickable whenever a collector is in ctx —
|
||||
# the canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
|
||||
# definition)`` + the inline span ``[[term:KEY]]text[[/term]]`` in a Markdown
|
||||
# block. Mapping key -> (label, definition).
|
||||
_TERM_DEFS = {
|
||||
"groupby": (
|
||||
"Agrupación (split-apply-combine)",
|
||||
"Operación de agrupación (group by): parte la tabla en grupos según los "
|
||||
"valores de una columna categórica, aplica un cálculo (conteo, media, "
|
||||
"mediana…) dentro de cada grupo y combina los resultados en una tabla "
|
||||
"resumen. Es el patrón split-apply-combine."),
|
||||
"pivot_table": (
|
||||
"Tabla dinámica (pivot)",
|
||||
"Tabla dinámica que cruza dos variables categóricas — una en las filas y "
|
||||
"otra en las columnas — y rellena cada celda con un agregado (media, "
|
||||
"suma…) de una medida numérica. Resume de un vistazo cómo interactúan las "
|
||||
"dos categóricas sobre esa medida."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
it), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the other chapters' defensive style).
|
||||
@@ -525,15 +554,18 @@ def _sections_live(profile: dict, ctx: dict, candidates: dict) -> list:
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _intro_blocks() -> list:
|
||||
def _intro_blocks(gloss=None, mark_term: bool = False) -> list:
|
||||
if gloss is not None:
|
||||
for key, (label, definition) in _TERM_DEFS.items():
|
||||
gloss.add(key, label, definition)
|
||||
t_groupby = _term(mark_term, "groupby", "**por grupos** (split-apply-combine)")
|
||||
t_pivot = _term(mark_term, "pivot_table", "**tablas dinámicas** (pivot)")
|
||||
text = (
|
||||
"Este capítulo analiza la tabla **por grupos** (split-apply-combine): "
|
||||
"elige las columnas categóricas más informativas — por su cardinalidad "
|
||||
"y relevancia, no todas contra todas, para no inflar comparaciones "
|
||||
"espurias — y resume las variables numéricas dentro de cada grupo "
|
||||
"(conteo, media, mediana, desviación). Las **tablas dinámicas** (pivot) "
|
||||
"cruzan dos categóricas sobre una medida, y los **gráficos de barras** "
|
||||
"(siempre desde cero) comparan los grupos de un vistazo."
|
||||
f"Este capítulo analiza la tabla {t_groupby}: elige las columnas "
|
||||
"categóricas más informativas (por cardinalidad y relevancia, no todas "
|
||||
"contra todas) y resume las variables numéricas dentro de cada grupo "
|
||||
f"(conteo, media, mediana, desviación). Se añaden {t_pivot} y "
|
||||
"**gráficos de barras** (siempre desde cero) para comparar los grupos."
|
||||
)
|
||||
return [model.Heading(text=CHAPTER_TITLE, level=1),
|
||||
model.Markdown(text=text)]
|
||||
@@ -556,13 +588,21 @@ def build_agregacion(profile: dict, ctx: dict):
|
||||
if not isinstance(profile, dict):
|
||||
return None
|
||||
|
||||
# Shared glossary collector: groupby + pivot_table live in the always-present
|
||||
# intro, so they are registered + marked there. Degrades silently (mark_term
|
||||
# False) when no collector is in ctx (standalone render).
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
|
||||
# Pre-computed results take precedence (offline / tests / forward-compat).
|
||||
pre = ctx.get("aggregations")
|
||||
if _is_dict(pre) and (pre.get("groupby") or pre.get("pivots")):
|
||||
sections = _sections_from_precomputed(pre)
|
||||
if not sections:
|
||||
return None
|
||||
blocks = _intro_blocks() + sections + _insights_section(ctx)
|
||||
blocks = (_intro_blocks(gloss, mark_term) + sections
|
||||
+ _insights_section(ctx))
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -583,10 +623,11 @@ def build_agregacion(profile: dict, ctx: dict):
|
||||
"crudos. Pasa ctx['db_path'] + ctx['table'] (para el cálculo "
|
||||
"push-down en DuckDB) o ctx['aggregations'] ya precalculado. "
|
||||
f"Columnas categóricas candidatas: {keys or '—'}.")
|
||||
blocks = _intro_blocks() + [note] + _insights_section(ctx)
|
||||
blocks = (_intro_blocks(gloss, mark_term) + [note]
|
||||
+ _insights_section(ctx))
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
blocks = _intro_blocks() + sections + _insights_section(ctx)
|
||||
blocks = _intro_blocks(gloss, mark_term) + sections + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -254,3 +254,25 @@ def test_anti_corte_muchos_grupos_y_texto_largo():
|
||||
# First, middle and last words of the long paragraph all present.
|
||||
for i in (0, 60, 119):
|
||||
assert f"palabra{i}" in txt
|
||||
|
||||
|
||||
def test_glosario_engancha_groupby_y_pivot():
    """Improvement 4b: group-by and pivot-table terms are hooked to the glossary.

    With a ``GlossaryCollector`` in ``ctx`` the chapter must register the
    ``groupby`` and ``pivot_table`` keys in it and mark both terms as clickable
    spans in the Markdown body. Without a collector the chapter must still
    build and must emit no ``[[term:`` marker at all.
    """
    from datascience.automatic_eda.model import GlossaryCollector

    g = GlossaryCollector()
    ctx = dict(_ctx_precomputed())
    ctx["glossary"] = g
    ch = build_agregacion(_profile(), ctx)
    assert ch is not None
    keys = {t["key"] for t in g.terms()}
    assert {"groupby", "pivot_table"} <= keys
    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
    # Checked one at a time so a failure names the missing marker.
    assert "[[term:groupby]]" in body
    assert "[[term:pivot_table]]" in body

    # No collector in ctx: the chapter degrades cleanly (no marker in the body).
    ch2 = build_agregacion(_profile(), _ctx_precomputed())
    # Guard before dereferencing so a regression fails as an assertion, not as
    # an AttributeError on ``None.blocks``.
    assert ch2 is not None
    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
    assert "[[term:" not in body2
|
||||
|
||||
@@ -42,7 +42,11 @@ from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
# 1.1.0: drop the duplicated section labels — the dictionary and PII DataTables
|
||||
# no longer carry a ``title`` (the section Heading labels them once, per the
|
||||
# OVERVIEW pattern in the contract). The data-dictionary column already reads
|
||||
# "Significado de negocio".
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "analisis_llm"
|
||||
CHAPTER_TITLE = "Análisis LLM"
|
||||
|
||||
@@ -118,6 +122,11 @@ def _dictionary_block(llm: dict):
|
||||
Columns: Columna / Descripción / Significado de negocio / Unidad. The
|
||||
paginator splits this by rows repeating the header and wraps long cells, so a
|
||||
long dictionary (many columns) never gets cut.
|
||||
|
||||
The block carries **no** ``title``: the section is labelled once by the
|
||||
``Heading`` that ``build_analisis_llm`` appends right before it (the canonical
|
||||
OVERVIEW pattern, contract §8). Giving the table its own ``title`` too would
|
||||
print "Diccionario de datos" twice in a row.
|
||||
"""
|
||||
entries = llm.get("dictionary")
|
||||
if not isinstance(entries, (list, tuple)) or not entries:
|
||||
@@ -137,7 +146,7 @@ def _dictionary_block(llm: dict):
|
||||
])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(header=header, rows=rows, title="Diccionario de datos")
|
||||
return model.DataTable(header=header, rows=rows)
|
||||
|
||||
|
||||
def _analyses_blocks(llm: dict) -> list:
|
||||
@@ -159,7 +168,12 @@ def _cleaning_blocks(llm: dict) -> list:
|
||||
|
||||
|
||||
def _pii_block(llm: dict):
|
||||
"""DataTable for PII/GDPR findings, or None if absent/empty."""
|
||||
"""DataTable for PII/GDPR findings, or None if absent/empty.
|
||||
|
||||
Like the dictionary block, it carries **no** ``title`` (the ``Heading`` in
|
||||
``build_analisis_llm`` labels the section once); it keeps its ``note`` with
|
||||
the orientative-detection caveat, which the renderers print under the table.
|
||||
"""
|
||||
entries = llm.get("pii")
|
||||
if not isinstance(entries, (list, tuple)) or not entries:
|
||||
return None
|
||||
@@ -176,7 +190,7 @@ def _pii_block(llm: dict):
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(
|
||||
header=header, rows=rows, title="Datos personales (PII / RGPD)",
|
||||
header=header, rows=rows,
|
||||
note="detección automática orientativa — revisar antes de tratar los datos")
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ from pptx import Presentation
|
||||
from datascience.automatic_eda.chapters.analisis_llm import (
|
||||
build_analisis_llm, CHAPTER_VERSION)
|
||||
from datascience.automatic_eda.chapters_registry import build_document
|
||||
from datascience.automatic_eda.model import Chapter, DataTable
|
||||
from datascience.automatic_eda.model import Chapter, DataTable, Heading
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
@@ -117,6 +117,45 @@ def test_golden_build_y_render_pdf_pptx():
|
||||
assert "DESCTOKEN" in ptx
|
||||
|
||||
|
||||
def test_sin_rotulos_duplicados_y_significado_de_negocio():
    """Each of the dictionary and PII sections carries its label exactly once.

    Regression test for the doubled 'Diccionario de datos' and 'Datos
    personales (PII / RGPD)' labels, which used to come from both a Heading
    and the DataTable's own title. It checks that the tables have no title,
    that each label appears as a single Heading, that the dictionary header
    is exactly ``Columna / Descripción / Significado de negocio / Unidad``,
    that the PII table keeps its note, and that the rendered PDF text contains
    each label once.
    """
    chapter = build_analisis_llm(_profile(), {})
    assert chapter is not None

    # Structure: Headings provide the section labels; tables have no title.
    heading_texts = [blk.text for blk in chapter.blocks if isinstance(blk, Heading)]
    assert heading_texts.count("Diccionario de datos") == 1
    assert heading_texts.count("Datos personales (PII / RGPD)") == 1
    tables = [blk for blk in chapter.blocks if isinstance(blk, DataTable)]
    for b in tables:
        assert not b.title, f"DataTable should not duplicate the label: {b.title!r}"

    # Third column of the data dictionary is exactly 'Significado de negocio'.
    dictionary_tables = [tbl for tbl in tables if "Descripción" in tbl.header]
    assert dictionary_tables, "expected the data-dictionary DataTable"
    expected_header = ["Columna", "Descripción", "Significado de negocio", "Unidad"]
    assert dictionary_tables[0].header == expected_header

    # The PII table still carries the orientative-detection note.
    pii_header = ["Columna", "Tipo", "Severidad"]
    pii_tables = [tbl for tbl in tables if tbl.header == pii_header]
    assert pii_tables and pii_tables[0].note and "orientativa" in pii_tables[0].note

    # Render: each label occurs exactly once in the whole PDF text.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        render_automatic_eda_pdf(_profile(), pdf_path, {"title": "EDA — ventas"})
        rendered = _pdf_text(pdf_path)
    assert rendered.count("Diccionario de datos") == 1
    assert rendered.count("Datos personales") == 1
|
||||
|
||||
|
||||
def test_orden_capitulo_junto_a_overview():
|
||||
chapters = build_document(_profile(), {})
|
||||
ids = [c.id for c in chapters]
|
||||
|
||||
@@ -1,22 +1,27 @@
|
||||
"""Data-quality chapter (CALIDAD) for AutomaticEDA.
|
||||
|
||||
Builds the quality chapter from a ``TableProfile`` of the ``eda`` group. The
|
||||
chapter answers, in Spanish and as tables, the three things the user asked for:
|
||||
chapter implements the quality model of report 2046:
|
||||
|
||||
1. **En qué se basa la calidad** — an intro paragraph explaining the criteria and
|
||||
their weights (completeness, validity, consistency) before any number, plus a
|
||||
table-level summary (global score and aggregates).
|
||||
1. **En qué se basa la calidad** — a concise intro naming the two scored
|
||||
dimensions and their weights (completitud 60%, validez 40%) plus the
|
||||
table-level row uniqueness, BEFORE any number, and stating that outliers are
|
||||
reported as observations and do **not** lower the score. The criteria terms
|
||||
(calidad de datos, completitud, validez, unicidad de registro) are hooked
|
||||
into the shared glossary as clickable jumps; their full definitions live in
|
||||
the GLOSARIO chapter, not inline here.
|
||||
2. **Scores por columna** — a table with, per column, the total quality score and
|
||||
its breakdown into completeness / validity / consistency.
|
||||
3. **Problemas en español** — a second table listing, per column, the readable
|
||||
issues in Spanish (kept separate from the type ``flags``).
|
||||
its breakdown into completeness / validity (no consistency dimension).
|
||||
3. **Problemas de calidad** — a table listing ONLY real quality defects
|
||||
(nulls, empty cells, values not conforming to their type/semantics).
|
||||
4. **Observaciones analíticas** — a SEPARATE table for outliers, constant
|
||||
columns, high-cardinality ids and strong skew, with an explicit note that
|
||||
these do not affect the score.
|
||||
|
||||
The breakdown and the issues are NOT recomputed here: they come from the registry
|
||||
function ``column_quality_score`` (group ``eda``), which already derives
|
||||
``{score, completeness, validity, consistency, issues}`` from the ColumnProfile.
|
||||
This chapter is render-only — it consumes that function and lays the result out
|
||||
as model blocks; the renderers paginate tables (splitting by rows, repeating the
|
||||
header) and wrap long cells so nothing is ever cut.
|
||||
The breakdown, issues and observations are NOT recomputed here: they come from
|
||||
the registry function ``column_quality_score`` (group ``eda``), which derives
|
||||
``{score, completeness, validity, dimensions, applicable, issues,
|
||||
observations}`` from the ColumnProfile. This chapter is render-only.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
@@ -33,28 +38,47 @@ try: # pragma: no cover - import wiring
|
||||
except Exception: # noqa: BLE001 - never let an import error abort the document.
|
||||
_column_quality_score = None
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "2.0.0"
|
||||
CHAPTER_ID = "calidad"
|
||||
CHAPTER_TITLE = "Calidad"
|
||||
|
||||
# Weights mirror column_quality_score: completeness 0.5, validity 0.3,
|
||||
# consistency 0.2. Kept here only to render the human explanation; the actual
|
||||
# numbers always come from the function so the two never drift in computation.
|
||||
_CRITERIA_INTRO = (
|
||||
"La calidad de cada columna es un score de 0 a 100 que combina tres "
|
||||
"criterios, cada uno con un peso:\n\n"
|
||||
"- **Completitud (peso 50%)**: proporción de valores presentes (sin nulos "
|
||||
"ni vacíos). Una columna con muchos nulos baja de score.\n"
|
||||
"- **Validez (peso 30%)**: los valores son coherentes con su tipo y rango "
|
||||
"esperado (penaliza outliers y semánticas declaradas que no coinciden).\n"
|
||||
"- **Consistencia (peso 20%)**: la columna aporta información útil (penaliza "
|
||||
"columnas constantes o identificadores de cardinalidad muy alta).\n\n"
|
||||
"Score = 100 × (0,5·completitud + 0,3·validez + 0,2·consistencia). "
|
||||
"Los problemas detectados por columna se listan en español más abajo."
|
||||
)
|
||||
# Glossary terms this chapter explains (report 2046 §6). Registered in the shared
|
||||
# collector and marked clickable on their first appearance (contract §11.1).
|
||||
_TERMS = {
|
||||
"calidad_datos": (
|
||||
"Calidad de datos (score 0-100)",
|
||||
"Mide hasta qué punto los datos están presentes y son utilizables tal "
|
||||
"cual, no si son «buenos para el análisis». Se compone solo de "
|
||||
"dimensiones medibles automáticamente desde el perfil de la tabla, sin "
|
||||
"fuente externa de verdad: completitud (60%), validez (40%, cuando es "
|
||||
"medible) y, a nivel de tabla, unicidad de registro. Los valores "
|
||||
"atípicos NO bajan la calidad: se listan aparte como observaciones.",
|
||||
),
|
||||
"completitud": (
|
||||
"Completitud",
|
||||
"Proporción de valores realmente presentes en una columna (1 − % de "
|
||||
"nulos; en texto, las celdas vacías también cuentan como faltantes). Los "
|
||||
"nulos y vacíos bajan el score porque falta información que debería "
|
||||
"estar. Pesa el 60% del score de columna.",
|
||||
),
|
||||
"validez": (
|
||||
"Validez",
|
||||
"Proporción de valores que encajan con su tipo o formato esperado: un "
|
||||
"número que parsea, una fecha legible, un email con forma de email. Los "
|
||||
"valores que no parsean a su tipo bajan el score. Si la columna es texto "
|
||||
"libre sin formato esperado, la validez no se puede medir y el score se "
|
||||
"basa solo en la completitud. Pesa el 40% del score cuando es medible.",
|
||||
),
|
||||
"unicidad_registro": (
|
||||
"Unicidad de registro",
|
||||
"A nivel de tabla, las filas duplicadas restan calidad al conjunto "
|
||||
"(1 − % de filas duplicadas). Es distinta de que una columna no-clave "
|
||||
"repita valores, que no es un defecto de calidad.",
|
||||
),
|
||||
}
|
||||
|
||||
# Cap for the joined issues cell so a single row never grows taller than a page;
|
||||
# the remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||
# Cap for the joined cell so a single row never grows taller than a page; the
|
||||
# remainder is summarized as "(+N más)" instead of being silently dropped.
|
||||
_ISSUES_MAXLEN = 160
|
||||
|
||||
|
||||
@@ -82,12 +106,19 @@ def _fmt_unit_pct(value) -> str:
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_validity(value) -> str:
|
||||
"""Validity is ``None`` when not applicable: show ``n/a`` not a fake 0%."""
|
||||
if value is None:
|
||||
return "n/a"
|
||||
return _fmt_unit_pct(value)
|
||||
|
||||
|
||||
def _quality_of(col: dict) -> dict:
|
||||
"""Return ``{score, completeness, validity, consistency, issues}`` for a column.
|
||||
"""Return the quality dict for a column.
|
||||
|
||||
Uses the registry ``column_quality_score`` when available; otherwise falls
|
||||
back to the per-column ``quality_score`` already in the profile (number only,
|
||||
empty breakdown/issues). Never raises.
|
||||
empty breakdown/issues/observations). Never raises.
|
||||
"""
|
||||
if not isinstance(col, dict):
|
||||
col = {}
|
||||
@@ -98,26 +129,25 @@ def _quality_of(col: dict) -> dict:
|
||||
return res
|
||||
except Exception: # noqa: BLE001 - degrade instead of aborting.
|
||||
pass
|
||||
# Fallback: only the final score is available pre-computed in the profile.
|
||||
return {
|
||||
"score": col.get("quality_score"),
|
||||
"completeness": None,
|
||||
"validity": None,
|
||||
"consistency": None,
|
||||
"issues": [],
|
||||
"observations": [],
|
||||
}
|
||||
|
||||
|
||||
def _join_issues(issues) -> str:
|
||||
"""Join Spanish issue strings into one cell, truncating overly long lists.
|
||||
def _join_cells(items) -> str:
|
||||
"""Join Spanish strings into one cell, truncating overly long lists.
|
||||
|
||||
The renderer wraps cell text, but a column with many long issues could make a
|
||||
single row taller than a whole page; cap the length and append ``(+N más)``
|
||||
so the count of hidden issues is honest rather than silently lost.
|
||||
The renderer wraps cell text, but a column with many long entries could make
|
||||
a single row taller than a whole page; cap the length and append ``(+N más)``
|
||||
so the count of hidden entries is honest rather than silently lost.
|
||||
"""
|
||||
if not isinstance(issues, (list, tuple)) or not issues:
|
||||
if not isinstance(items, (list, tuple)) or not items:
|
||||
return ""
|
||||
parts = [model._safe_str(i).strip() for i in issues]
|
||||
parts = [model._safe_str(i).strip() for i in items]
|
||||
parts = [p for p in parts if p]
|
||||
if not parts:
|
||||
return ""
|
||||
@@ -142,6 +172,33 @@ def _columns_with_quality(profile: dict):
|
||||
yield c, _quality_of(c)
|
||||
|
||||
|
||||
def _fmt_unit_pct_or_pct(value) -> str:
|
||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
pct = num * 100 if num <= 1.0 else num
|
||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text}%"
|
||||
|
||||
|
||||
def _row_uniqueness(profile: dict):
|
||||
"""Return row uniqueness (1 - duplicate_pct) in [0,1], or None if unknown."""
|
||||
dup = profile.get("duplicate_pct")
|
||||
if dup is None:
|
||||
return None
|
||||
try:
|
||||
d = float(dup)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if d > 1.0: # tolerate a 0-100 scale
|
||||
d = d / 100.0
|
||||
return max(0.0, min(1.0, 1.0 - d))
|
||||
|
||||
|
||||
def _summary_block(profile: dict, evaluated: list):
|
||||
"""Table-level KVTable: global score and quality aggregates."""
|
||||
rows = []
|
||||
@@ -153,14 +210,15 @@ def _summary_block(profile: dict, evaluated: list):
|
||||
if isinstance(q.get("completeness"), (int, float))]
|
||||
vals = [q.get("validity") for _, q in evaluated
|
||||
if isinstance(q.get("validity"), (int, float))]
|
||||
cons = [q.get("consistency") for _, q in evaluated
|
||||
if isinstance(q.get("consistency"), (int, float))]
|
||||
if comps:
|
||||
rows.append(("Completitud media", _fmt_unit_pct(sum(comps) / len(comps))))
|
||||
if vals:
|
||||
rows.append(("Validez media", _fmt_unit_pct(sum(vals) / len(vals))))
|
||||
if cons:
|
||||
rows.append(("Consistencia media", _fmt_unit_pct(sum(cons) / len(cons))))
|
||||
rows.append(("Validez media (donde aplica)",
|
||||
_fmt_unit_pct(sum(vals) / len(vals))))
|
||||
|
||||
ru = _row_uniqueness(profile)
|
||||
if ru is not None:
|
||||
rows.append(("Unicidad de registro", _fmt_unit_pct(ru)))
|
||||
|
||||
n_problem = sum(1 for _, q in evaluated if q.get("issues"))
|
||||
rows.append(("Columnas con problemas", str(n_problem)))
|
||||
@@ -182,22 +240,9 @@ def _summary_block(profile: dict, evaluated: list):
|
||||
return model.KVTable(rows=rows, title="Resumen de calidad")
|
||||
|
||||
|
||||
def _fmt_unit_pct_or_pct(value) -> str:
|
||||
"""Format a value that may be a 0-1 fraction or an already-0-100 percentage."""
|
||||
try:
|
||||
num = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if num != num: # NaN
|
||||
return "—"
|
||||
pct = num * 100 if num <= 1.0 else num
|
||||
text = f"{pct:.1f}".rstrip("0").rstrip(".")
|
||||
return f"{text}%"
|
||||
|
||||
|
||||
def _scores_block(evaluated: list):
|
||||
"""DataTable with per-column score and its three-criteria breakdown."""
|
||||
header = ["Columna", "Calidad", "Completitud", "Validez", "Consistencia"]
|
||||
"""DataTable with per-column score and its completeness/validity breakdown."""
|
||||
header = ["Columna", "Calidad", "Completitud", "Validez"]
|
||||
rows = []
|
||||
# Worst columns first so the reader sees the problems at the top.
|
||||
ordered = sorted(
|
||||
@@ -210,22 +255,22 @@ def _scores_block(evaluated: list):
|
||||
col.get("name") or "(col)",
|
||||
_fmt_score(q.get("score")),
|
||||
_fmt_unit_pct(q.get("completeness")),
|
||||
_fmt_unit_pct(q.get("validity")),
|
||||
_fmt_unit_pct(q.get("consistency")),
|
||||
_fmt_validity(q.get("validity")),
|
||||
])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(header=header, rows=rows,
|
||||
title="Scores de calidad por columna",
|
||||
note="0 = peor, 100 = mejor; ordenado de peor a mejor")
|
||||
note="0 = peor, 100 = mejor; «n/a» = dimensión no "
|
||||
"medible; ordenado de peor a mejor")
|
||||
|
||||
|
||||
def _issues_block(evaluated: list):
|
||||
"""DataTable listing Spanish issues per column, or a Note when there are none."""
|
||||
header = ["Columna", "Problemas detectados (español)"]
|
||||
"""DataTable listing ONLY real quality defects per column, or a Note."""
|
||||
header = ["Columna", "Problemas de calidad (español)"]
|
||||
rows = []
|
||||
for col, q in evaluated:
|
||||
joined = _join_issues(q.get("issues"))
|
||||
joined = _join_cells(q.get("issues"))
|
||||
if joined:
|
||||
rows.append([col.get("name") or "(col)", joined])
|
||||
if not rows:
|
||||
@@ -235,6 +280,55 @@ def _issues_block(evaluated: list):
|
||||
title="Problemas de calidad por columna")
|
||||
|
||||
|
||||
def _observations_block(evaluated: list):
|
||||
"""DataTable listing analytical observations per column, or None.
|
||||
|
||||
Observations (outliers, constant columns, ids, strong skew) are NOT quality
|
||||
defects: they do not affect the score. Returned as a separate table from the
|
||||
issues so the report never presents a legitimate outlier as a problem.
|
||||
"""
|
||||
header = ["Columna", "Observaciones analíticas"]
|
||||
rows = []
|
||||
for col, q in evaluated:
|
||||
joined = _join_cells(q.get("observations"))
|
||||
if joined:
|
||||
rows.append([col.get("name") or "(col)", joined])
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(
|
||||
header=header, rows=rows,
|
||||
title="Observaciones analíticas por columna",
|
||||
note="No son defectos de calidad y NO afectan al score; orientan el "
|
||||
"análisis (atípicos, columnas constantes, identificadores).")
|
||||
|
||||
|
||||
def _term(key: str, label: str, mark: bool) -> str:
|
||||
"""Render a term as a clickable glossary span when marking is enabled."""
|
||||
if mark:
|
||||
return f"[[term:{key}]]**{label}**[[/term]]"
|
||||
return f"**{label}**"
|
||||
|
||||
|
||||
def _criteria_intro(mark: bool) -> str:
    """Return the intro paragraph stating how the quality score is composed.

    The four criteria terms (calidad de datos, completitud, validez, unicidad
    de registro) are each passed through ``_term`` with ``mark``, so they are
    emitted as clickable glossary spans when marking is enabled. The paragraph
    only states the weights and that outliers are listed separately; it does
    not define the terms.
    """
    labelled_terms = (
        ("calidad_datos", "calidad de datos"),
        ("completitud", "completitud"),
        ("validez", "validez"),
        ("unicidad_registro", "unicidad de registro"),
    )
    calidad, completitud, validez, unicidad = (
        _term(key, label, mark) for key, label in labelled_terms)

    composition = (
        f"La {calidad} de cada columna es un score de 0 a 100 que combina "
        f"{completitud} (peso 60%) y {validez} (peso 40%, cuando es medible); "
        f"a nivel de tabla se añade la {unicidad}."
    )
    outliers = ("Los valores atípicos no bajan el score: se listan aparte "
                "como **observaciones analíticas**.")
    return " ".join((composition, outliers))
|
||||
|
||||
|
||||
def build_calidad(profile: dict, ctx: dict):
|
||||
"""Build the data-quality Chapter, or None if the profile has no columns.
|
||||
|
||||
@@ -250,17 +344,35 @@ def build_calidad(profile: dict, ctx: dict):
|
||||
if not evaluated:
|
||||
return None # no columns to score -> chapter does not apply.
|
||||
|
||||
# Register the criteria terms in the shared glossary (if present) and mark
|
||||
# their first appearance clickable. Contract §11.1.
|
||||
glossary = ctx.get("glossary")
|
||||
mark = False
|
||||
if isinstance(glossary, model.GlossaryCollector):
|
||||
for key, (label, definition) in _TERMS.items():
|
||||
glossary.add(key, label, definition)
|
||||
mark = True
|
||||
|
||||
blocks = [
|
||||
model.Heading(text="Cómo se calcula la calidad", level=2),
|
||||
model.Markdown(text=_CRITERIA_INTRO),
|
||||
model.Markdown(text=_criteria_intro(mark)),
|
||||
_summary_block(profile, evaluated),
|
||||
model.Heading(text="Scores por columna", level=2),
|
||||
]
|
||||
scores = _scores_block(evaluated)
|
||||
if scores is not None:
|
||||
blocks.append(scores)
|
||||
blocks.append(model.Heading(text="Problemas detectados", level=2))
|
||||
|
||||
blocks.append(model.Heading(text="Problemas de calidad", level=2))
|
||||
blocks.append(_issues_block(evaluated))
|
||||
|
||||
observations = _observations_block(evaluated)
|
||||
if observations is not None:
|
||||
blocks.append(model.Heading(text="Observaciones analíticas", level=2))
|
||||
blocks.append(model.Note(
|
||||
"Las observaciones siguientes NO son defectos de calidad y no "
|
||||
"afectan al score: son señales para orientar el análisis."))
|
||||
blocks.append(observations)
|
||||
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut.
|
||||
"""Tests for the CALIDAD chapter — DoD: golden + edges + anti-cut + glossary.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that the chapter explains the quality criteria, shows
|
||||
per-column scores with the completeness/validity/consistency breakdown, lists the
|
||||
issues in Spanish (separate from the type flags), returns None when it does not
|
||||
apply, and that a wide profile with long names renders to PDF and PPTX without
|
||||
cutting any cell text (long content wraps, it is never truncated).
|
||||
and deterministic. Verifies the report-2046 quality model: the chapter explains
|
||||
the two scored dimensions (completitud 60% / validez 40%), shows per-column
|
||||
scores without a consistency column, keeps quality DEFECTS (issues) separate
|
||||
from analytical OBSERVATIONS (outliers, constant, ids), hooks the criteria terms
|
||||
into the glossary, returns None when it does not apply, and renders a wide
|
||||
profile to PDF and PPTX without cutting any cell text.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -20,28 +21,30 @@ from datascience.automatic_eda.chapters.calidad import (
|
||||
CHAPTER_VERSION,
|
||||
)
|
||||
from datascience.automatic_eda import build_document, render_pdf, render_pptx
|
||||
from datascience.automatic_eda import model
|
||||
|
||||
|
||||
def _profile() -> dict:
|
||||
"""A small profile with one column per quality problem (nulls, outliers,
|
||||
constant, high-cardinality id) plus one clean column."""
|
||||
constant, high-cardinality id) plus one clean column. ``outlier_pct`` is in
|
||||
the 0-100 scale that describe_numeric actually emits."""
|
||||
return {
|
||||
"table": "demo",
|
||||
"quality_score": 72.5,
|
||||
"quality_score": 82.0,
|
||||
"duplicate_pct": 0.04,
|
||||
"null_cell_pct": 0.11,
|
||||
"constant_cols": ["flag_const"],
|
||||
"all_null_cols": [],
|
||||
"columns": [
|
||||
{"name": "edad", "inferred_type": "integer", "null_pct": 0.2,
|
||||
"numeric": {"outlier_pct": 0.15, "min": 0, "max": 99},
|
||||
"quality_score": 60},
|
||||
{"name": "edad", "inferred_type": "numeric", "null_pct": 0.2,
|
||||
"n_rows": 100, "unique_pct": 0.5,
|
||||
"numeric": {"outlier_pct": 15.0, "min": 0, "max": 99}},
|
||||
{"name": "nombre", "inferred_type": "text", "null_pct": 0.0,
|
||||
"unique_pct": 0.98, "quality_score": 80},
|
||||
"unique_pct": 0.98, "flags": ["possible_id"]},
|
||||
{"name": "flag_const", "inferred_type": "text", "null_pct": 0.0,
|
||||
"flags": ["constant"], "quality_score": 50},
|
||||
{"name": "limpia", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}, "quality_score": 100},
|
||||
"unique_pct": 0.01, "flags": ["constant"]},
|
||||
{"name": "limpia", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||
],
|
||||
}
|
||||
|
||||
@@ -50,16 +53,9 @@ def _tables(chapter):
|
||||
return [b for b in chapter.blocks if getattr(b, "kind", None) == "data_table"]
|
||||
|
||||
|
||||
def _scores_table(chapter):
|
||||
def _table_by_title(chapter, needle):
|
||||
for t in _tables(chapter):
|
||||
if "Scores" in (t.title or ""):
|
||||
return t
|
||||
return None
|
||||
|
||||
|
||||
def _issues_table(chapter):
|
||||
for t in _tables(chapter):
|
||||
if "Problemas" in (t.title or ""):
|
||||
if needle in (t.title or ""):
|
||||
return t
|
||||
return None
|
||||
|
||||
@@ -73,41 +69,86 @@ def test_golden_chapter_estructura_y_version():
|
||||
assert ch.id == "calidad"
|
||||
assert ch.version == CHAPTER_VERSION
|
||||
kinds = [b.kind for b in ch.blocks]
|
||||
# intro heading + markdown criteria + summary kv + scores table + issues table
|
||||
assert "markdown" in kinds and "kv_table" in kinds and "data_table" in kinds
|
||||
|
||||
|
||||
def test_golden_intro_explica_criterios_y_pesos():
|
||||
def test_golden_intro_nombra_dos_dimensiones_y_pesos():
|
||||
# La intro nombra las dos dimensiones, sus pesos y la unicidad, pero ya NO
|
||||
# repite sus definiciones largas: estas viven ahora en el capítulo GLOSARIO.
|
||||
ch = build_calidad(_profile(), {})
|
||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||
for needle in ("Completitud", "Validez", "Consistencia",
|
||||
"50%", "30%", "20%"):
|
||||
for needle in ("completitud", "validez", "60%", "40%",
|
||||
"unicidad de registro"):
|
||||
assert needle in intro, f"falta {needle!r} en la intro de criterios"
|
||||
# El principio: los outliers NO bajan la calidad.
|
||||
assert "atípicos" in intro and "no bajan" in intro
|
||||
# Ya no se menciona la dimensión consistencia eliminada.
|
||||
assert "20%" not in intro
|
||||
|
||||
|
||||
def test_golden_scores_incluyen_desglose_por_criterio():
|
||||
def test_golden_scores_sin_columna_consistencia():
|
||||
ch = build_calidad(_profile(), {})
|
||||
scores = _scores_table(ch)
|
||||
scores = _table_by_title(ch, "Scores")
|
||||
assert scores is not None
|
||||
assert scores.header == ["Columna", "Calidad", "Completitud",
|
||||
"Validez", "Consistencia"]
|
||||
# 4 columns scored, none dropped.
|
||||
assert scores.header == ["Columna", "Calidad", "Completitud", "Validez"]
|
||||
assert "Consistencia" not in scores.header
|
||||
assert len(scores.rows) == 4
|
||||
names = {r[0] for r in scores.rows}
|
||||
assert names == {"edad", "nombre", "flag_const", "limpia"}
|
||||
|
||||
|
||||
def test_golden_issues_en_espanol_separados_de_flags():
|
||||
def test_golden_outliers_en_observaciones_no_en_problemas():
|
||||
ch = build_calidad(_profile(), {})
|
||||
issues = _issues_table(ch)
|
||||
assert issues is not None
|
||||
flat = " | ".join(" ".join(r) for r in issues.rows)
|
||||
assert "nulos" in flat # completeness issue (ES)
|
||||
assert "outliers" in flat # validity issue (ES)
|
||||
assert "columna constante" in flat
|
||||
assert "posible id de alta cardinalidad" in flat
|
||||
# The raw type flag string must NOT leak as a "problem".
|
||||
assert "constant" not in flat or "columna constante" in flat
|
||||
problemas = _table_by_title(ch, "Problemas de calidad")
|
||||
observaciones = _table_by_title(ch, "Observaciones")
|
||||
assert problemas is not None
|
||||
assert observaciones is not None
|
||||
|
||||
problemas_txt = " | ".join(" ".join(r) for r in problemas.rows)
|
||||
observaciones_txt = " | ".join(" ".join(r) for r in observaciones.rows)
|
||||
|
||||
# Los nulos SÍ son problema de calidad.
|
||||
assert "nulos" in problemas_txt
|
||||
# Los outliers NO aparecen como problema...
|
||||
assert "atípic" not in problemas_txt and "outlier" not in problemas_txt
|
||||
# ...sino como observación analítica.
|
||||
assert "atípic" in observaciones_txt
|
||||
# Constante e id: observaciones, no problemas.
|
||||
assert "constante" in observaciones_txt
|
||||
assert "identificador" in observaciones_txt
|
||||
assert "constante" not in problemas_txt
|
||||
|
||||
|
||||
def test_golden_score_columna_limpia_es_100():
|
||||
"""Columna sin nulos, numérica nativa: score 100 aunque tenga (o no) outliers."""
|
||||
ch = build_calidad(_profile(), {})
|
||||
scores = _table_by_title(ch, "Scores")
|
||||
by_name = {r[0]: r for r in scores.rows}
|
||||
assert by_name["limpia"][1] == "100 / 100"
|
||||
# edad: 20% nulos -> 100*(0.6*0.8 + 0.4*1.0) = 88; los outliers no bajan nada.
|
||||
assert by_name["edad"][1] == "88 / 100"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Glosario (contrato §11.1)
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_glosario_registra_los_cuatro_terminos_y_marca_clicable():
|
||||
glossary = model.GlossaryCollector()
|
||||
ch = build_calidad(_profile(), {"glossary": glossary})
|
||||
for key in ("calidad_datos", "completitud", "validez", "unicidad_registro"):
|
||||
assert glossary.has(key), f"término {key!r} no registrado en el glosario"
|
||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||
# Con colector presente, la primera aparición se marca clicable.
|
||||
assert "[[term:completitud]]" in intro
|
||||
assert "[[term:validez]]" in intro
|
||||
assert "[[term:calidad_datos]]" in intro
|
||||
assert "[[term:unicidad_registro]]" in intro
|
||||
|
||||
|
||||
def test_sin_glosario_no_marca_terminos():
|
||||
ch = build_calidad(_profile(), {}) # ctx sin glossary
|
||||
intro = [b for b in ch.blocks if b.kind == "markdown"][0].text
|
||||
assert "[[term:" not in intro
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
@@ -124,17 +165,17 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
|
||||
prof = {
|
||||
"quality_score": 100,
|
||||
"columns": [
|
||||
{"name": "a", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}},
|
||||
{"name": "b", "inferred_type": "float", "null_pct": 0.0,
|
||||
"numeric": {"outlier_pct": 0.0}},
|
||||
{"name": "a", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||
{"name": "b", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"unique_pct": 0.5, "numeric": {"outlier_pct": 0.0}},
|
||||
],
|
||||
}
|
||||
ch = build_calidad(prof, {})
|
||||
assert ch is not None
|
||||
assert _issues_table(ch) is None # no issues table
|
||||
assert _table_by_title(ch, "Problemas de calidad") is None # no issues table
|
||||
notes = [b for b in ch.blocks if b.kind == "note"]
|
||||
assert notes and "No se detectaron problemas" in notes[0].text
|
||||
assert any("No se detectaron problemas" in n.text for n in notes)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
@@ -143,44 +184,42 @@ def test_edge_perfil_limpio_sin_problemas_usa_nota():
|
||||
def _wide_profile(ncols: int = 22) -> dict:
|
||||
cols = [
|
||||
{"name": "identificador_unico_de_transaccion_con_nombre_muy_largo",
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99},
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.99,
|
||||
"flags": ["possible_id"]},
|
||||
{"name": "columna_constante_sin_ninguna_variacion_de_valor",
|
||||
"inferred_type": "text", "null_pct": 0.0, "flags": ["constant"]},
|
||||
"inferred_type": "text", "null_pct": 0.0, "unique_pct": 0.01,
|
||||
"flags": ["constant"]},
|
||||
]
|
||||
for k in range(ncols - 2):
|
||||
cols.append({
|
||||
"name": f"metrica_numerica_de_negocio_{k:02d}_con_nombre_largo",
|
||||
"inferred_type": "float", "null_pct": 0.1 + (k % 3) * 0.05,
|
||||
"numeric": {"outlier_pct": 0.08, "min": 0, "max": 1000},
|
||||
"inferred_type": "numeric", "null_pct": 0.1 + (k % 3) * 0.05,
|
||||
"unique_pct": 0.5,
|
||||
"numeric": {"outlier_pct": 8.0, "min": 0, "max": 1000},
|
||||
})
|
||||
return {"table": "ancha", "quality_score": 70.0, "columns": cols}
|
||||
return {"table": "ancha", "quality_score": 70.0, "duplicate_pct": 0.0,
|
||||
"columns": cols}
|
||||
|
||||
|
||||
def test_anticut_pdf_y_pptx_no_truncan_nombres_largos():
|
||||
prof = _wide_profile(22)
|
||||
full = build_document(prof, {"dataset_name": "ancha"})
|
||||
assert any(c.id == "calidad" for c in full)
|
||||
# Render ONLY the calidad chapter so the anti-cut assertions are scoped to
|
||||
# this chapter (other chapters, e.g. portada, legitimately contain '…').
|
||||
chapters = [c for c in full if c.id == "calidad"]
|
||||
long_name = "metrica_numerica_de_negocio_00_con_nombre_largo"
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
pdf = os.path.join(d, "q.pdf")
|
||||
pptx = os.path.join(d, "q.pptx")
|
||||
rp = render_pdf(chapters, pdf, {"title": "EDA"})
|
||||
rx = render_pptx(chapters, pptx, {"title": "EDA"})
|
||||
render_pptx(chapters, pptx, {"title": "EDA"})
|
||||
assert os.path.exists(pdf) and os.path.exists(pptx)
|
||||
# The wide table forces pagination across several pages/slides.
|
||||
assert (rp or {}).get("n_pages", 0) >= 2
|
||||
|
||||
# PDF: the long name survives whole once wraps (spaces/newlines) removed,
|
||||
# and there is no truncation marker.
|
||||
pdf_txt = "".join((pg.extract_text() or "") for pg in PdfReader(pdf).pages)
|
||||
assert "…" not in pdf_txt and "..." not in pdf_txt
|
||||
norm = re.sub(r"\s+", "", pdf_txt)
|
||||
assert long_name in norm, "el nombre largo se cortó en el PDF"
|
||||
|
||||
# PPTX: long name present in some cell, untruncated.
|
||||
allt = []
|
||||
for s in Presentation(pptx).slides:
|
||||
for sh in s.shapes:
|
||||
|
||||
@@ -1,19 +1,25 @@
|
||||
"""Categorical distributions chapter (CAT DISTR).
|
||||
|
||||
Third reference chapter for AutomaticEDA. For every categorical column it shows,
|
||||
fulfilling the user's request:
|
||||
Third reference chapter for AutomaticEDA. Each categorical column gets **its own
|
||||
page (PDF) / slide (PPTX)**: every column is wrapped in a keep-together
|
||||
``model.Group`` with ``page_break_before=True`` (except the first, which may share
|
||||
the intro's page), so its chart sits next to its tables and no column is split.
|
||||
|
||||
1. A short opening explanation of **Shannon entropy** (what it measures, its 0
|
||||
and log2(k) bounds, the normalized 0–1 version) and the dataset row total used
|
||||
as a comparison baseline.
|
||||
2. Per column, a cardinality key/value table: distinct values, ``% distinct``
|
||||
(distinct / total rows), total dataset rows, singleton values (frequency 1),
|
||||
entropy with its theoretical maximum and the normalized ratio, mode, imbalance
|
||||
and string-length stats.
|
||||
3. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
A short intro names the clickable **[[term:entropia]]entropía[[/term]]** term —
|
||||
the full definition lives in the GLOSARIO chapter, so it is NOT repeated inline
|
||||
here (one click jumps to the glossary entry). The intro also carries the dataset
|
||||
row total used as a comparison baseline.
|
||||
|
||||
Per column the Group contains, in order:
|
||||
|
||||
1. A cardinality key/value table: distinct values, ``% distinct`` (distinct /
|
||||
total rows), total dataset rows, singleton values (frequency 1), entropy with
|
||||
its theoretical maximum and the normalized ratio, mode, imbalance and
|
||||
string-length stats.
|
||||
2. A short note flagging problematic cardinality (id-like ≈100% distinct, or a
|
||||
single dominating category).
|
||||
4. A ``top-k`` table (value / count / %).
|
||||
5. A **donut pie chart** of the most common categories (top-k + an "Otros"
|
||||
3. A ``top-k`` table (value / count / %).
|
||||
4. A **donut pie chart** of the most common categories (top-k + an "Otros"
|
||||
bucket), drawn lazily so the renderers scale it to fit entirely.
|
||||
|
||||
Data comes from the ``eda`` group: each ``columns[i]['categorical']`` is the
|
||||
@@ -33,7 +39,7 @@ import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "cat_distr"
|
||||
CHAPTER_TITLE = "Distribuciones categóricas"
|
||||
|
||||
@@ -53,11 +59,17 @@ _TERM_ENTROPIA_DEF = (
|
||||
# Cap the number of categorical columns rendered to keep the document bounded;
|
||||
# the rest are summarized in a closing note (no silent truncation).
|
||||
MAX_COLS = 40
|
||||
# Rows shown in each top-k table and explicit slices in the pie.
|
||||
TOP_TABLE_ROWS = 15
|
||||
# Rows shown in each top-k table and explicit slices in the pie. Kept moderate so
|
||||
# the whole column — cardinality table + top-k table + donut — fits on ONE
|
||||
# page/slide with the chart next to its tables; the table note still reports
|
||||
# "top N of M" so nothing is silently hidden. For id-like columns (≈100%
|
||||
# distinct) the top-k table is dropped entirely (it would be a list of unique
|
||||
# values — pure noise), which also frees the room the donut needs (see build).
|
||||
TOP_TABLE_ROWS = 8
|
||||
PIE_TOP_K = 6
|
||||
# Truncate very long category labels in tables (the renderer also wraps).
|
||||
LABEL_MAX = 48
|
||||
# Truncate very long category labels in tables (the renderer also wraps). Kept
|
||||
# tight so a column with long id-like values (names, tickets) still fits its page.
|
||||
LABEL_MAX = 28
|
||||
|
||||
|
||||
def _fmt_int(value) -> str:
|
||||
@@ -267,45 +279,55 @@ def _normalize_card(card: dict) -> dict:
|
||||
|
||||
|
||||
def _cardinality_block(card: dict):
|
||||
"""KVTable with the cardinality / entropy metrics for one column."""
|
||||
"""KVTable with the cardinality / entropy metrics for one column.
|
||||
|
||||
Related metrics are grouped onto a single row each (distinct/%/unique;
|
||||
entropy bits/max/normalized; length min/mean/max) so the whole column —
|
||||
table + chart — fits one page/slide without dropping any datum; the short
|
||||
16:9 PPTX slide does not fit one metric per row plus a chart otherwise."""
|
||||
n_singletons = card.get("n_singletons")
|
||||
if n_singletons is not None and card.get("n_singletons_partial"):
|
||||
singletons = f"≥{_fmt_int(n_singletons)} (en top mostrado)"
|
||||
singletons = f"≥{_fmt_int(n_singletons)}"
|
||||
elif n_singletons is not None:
|
||||
singletons = _fmt_int(n_singletons)
|
||||
else:
|
||||
singletons = "—"
|
||||
|
||||
entropy_ref = _fmt_num(card.get("entropy"))
|
||||
emax = card.get("entropy_max")
|
||||
if emax is not None:
|
||||
entropy_ref = f"{entropy_ref} (máx {_fmt_num(emax)})"
|
||||
# Distinct count · % distinct · unique (frequency 1) on one row.
|
||||
distinct_combo = (f"{_fmt_int(card.get('n_distinct'))} · "
|
||||
f"{_fmt_pct_value(card.get('pct_distinct'))} · "
|
||||
f"{singletons} únicos")
|
||||
|
||||
# Entropy bits · theoretical max · normalized 0–1 on one row.
|
||||
entropy_combo = (f"{_fmt_num(card.get('entropy'))} bits · "
|
||||
f"máx {_fmt_num(card.get('entropy_max'))} · "
|
||||
f"norm {_fmt_num(card.get('entropy_norm'))}")
|
||||
|
||||
mode = card.get("mode")
|
||||
mode_pct = card.get("mode_pct")
|
||||
mode_str = "—" if mode is None else model._safe_str(mode)
|
||||
mode_str = "—" if mode is None else _truncate(mode, 32)
|
||||
if mode is not None and mode_pct is not None:
|
||||
mode_str = f"{mode_str} ({_fmt_pct_value(mode_pct)})"
|
||||
|
||||
rows = [
|
||||
("Valores distintos", _fmt_int(card.get("n_distinct"))),
|
||||
("% distintos", _fmt_pct_value(card.get("pct_distinct"))),
|
||||
("Distintos · % · únicos", distinct_combo),
|
||||
("Total filas (dataset)", _fmt_int(card.get("n_rows"))),
|
||||
("Valores únicos (frecuencia 1)", singletons),
|
||||
("Entropía (bits)", entropy_ref),
|
||||
("Entropía normalizada (0–1)", _fmt_num(card.get("entropy_norm"))),
|
||||
("Entropía (bits · máx · norm)", entropy_combo),
|
||||
("Moda", mode_str),
|
||||
]
|
||||
imbalance = card.get("imbalance")
|
||||
if imbalance is not None:
|
||||
rows.append(("Desbalance", _fmt_num(imbalance)))
|
||||
lm = card.get("len_min")
|
||||
lmean = card.get("len_mean")
|
||||
lmax = card.get("len_max")
|
||||
# Imbalance and string length (both secondary) share one closing row.
|
||||
extras = []
|
||||
if imbalance is not None:
|
||||
extras.append(f"desbalance {_fmt_num(imbalance)}")
|
||||
if any(v is not None for v in (lm, lmean, lmax)):
|
||||
rows.append((
|
||||
"Longitud (mín/media/máx)",
|
||||
f"{_fmt_num(lm)} / {_fmt_num(lmean)} / {_fmt_num(lmax)}"))
|
||||
extras.append(
|
||||
f"long. {_fmt_num(lm)}/{_fmt_num(lmean)}/{_fmt_num(lmax)}")
|
||||
if extras:
|
||||
rows.append(("Desbalance · longitud", " · ".join(extras)))
|
||||
return model.KVTable(rows=rows, title="Cardinalidad")
|
||||
|
||||
|
||||
@@ -315,7 +337,8 @@ def _flag_note(card: dict):
|
||||
return model.Note(
|
||||
"Casi todos los valores son distintos (≈100% distintos): la columna "
|
||||
"se comporta como un identificador y aporta poco para agrupar o "
|
||||
"comparar categorías.")
|
||||
"comparar categorías. No se lista el top de categorías (serían "
|
||||
"valores casi todos únicos).")
|
||||
if card.get("dominated"):
|
||||
mp = card.get("mode_pct")
|
||||
mp_str = _fmt_pct_value(mp) if mp is not None else "muy alta"
|
||||
@@ -335,7 +358,7 @@ def _topk_table(cat: dict):
|
||||
if not isinstance(t, dict):
|
||||
continue
|
||||
rows.append([
|
||||
model._safe_str(t.get("value")),
|
||||
_truncate(t.get("value")),
|
||||
_fmt_int(t.get("count")),
|
||||
_pct_from_maybe_fraction(t.get("pct")),
|
||||
])
|
||||
@@ -353,20 +376,16 @@ def _topk_table(cat: dict):
|
||||
def _intro_blocks(n_rows, mark_term: bool = False):
|
||||
total = _fmt_int(n_rows)
|
||||
# Mark the first appearance of the term as a clickable glossary jump when the
|
||||
# term was registered (mark_term). The visible text is identical either way.
|
||||
entropia = ("[[term:entropia]]**entropía de Shannon**[[/term]]" if mark_term
|
||||
else "**entropía de Shannon**")
|
||||
# term was registered (mark_term). The full definition of entropy lives in the
|
||||
# GLOSARIO chapter, so the intro only names the clickable term here instead of
|
||||
# repeating the long explanation (avoids the redundancy with the glossary).
|
||||
entropia = ("[[term:entropia]]entropía[[/term]]" if mark_term
|
||||
else "entropía")
|
||||
text = (
|
||||
f"La {entropia} mide cómo de repartidos están los valores de "
|
||||
"una columna categórica, en bits. Vale 0 cuando una sola categoría "
|
||||
"concentra todas las filas (máxima previsibilidad) y alcanza su máximo, "
|
||||
"log2(k) para k categorías distintas, cuando todas aparecen por igual "
|
||||
"(máxima diversidad). La **entropía normalizada** (entropía dividida por "
|
||||
"su máximo) la lleva al rango 0–1 para comparar columnas con distinto "
|
||||
"número de categorías. Para cada columna se muestran los valores "
|
||||
"distintos, el porcentaje que representan sobre el total de filas, los "
|
||||
"valores únicos (que aparecen una sola vez), la tabla de las categorías "
|
||||
"más frecuentes y un gráfico de tarta (donut) de las más comunes."
|
||||
f"Cada columna categórica ocupa su propia página: sus métricas de "
|
||||
f"cardinalidad —incluida la {entropia}—, una nota que señala cardinalidad "
|
||||
"problemática, la tabla de las categorías más frecuentes y un gráfico de "
|
||||
"tarta (donut) de las más comunes, todo junto."
|
||||
)
|
||||
if n_rows is not None:
|
||||
text += f" El dataset tiene {total} filas en total como referencia."
|
||||
@@ -398,24 +417,37 @@ def build_cat_distr(profile: dict, ctx: dict):
|
||||
blocks = list(_intro_blocks(n_rows, mark_term=mark_term))
|
||||
|
||||
rendered = cat_cols[:MAX_COLS]
|
||||
for col in rendered:
|
||||
for idx, col in enumerate(rendered):
|
||||
name = col.get("name") or "(columna)"
|
||||
cat = col.get("categorical") or {}
|
||||
card = _normalize_card(_cardinality(cat, n_rows))
|
||||
|
||||
blocks.append(model.Heading(text=str(name), level=2))
|
||||
blocks.append(_cardinality_block(card))
|
||||
# One Group per categorical column: heading + cardinality table + flag
|
||||
# note + top-k table + donut figure are kept together and the renderer
|
||||
# starts each on a fresh page/slide (page_break_before) so every column
|
||||
# gets its own page with its chart next to its tables. The first column
|
||||
# may share the intro's page (no forced break) to avoid a near-empty page.
|
||||
col_blocks = [
|
||||
model.Heading(text=str(name), level=2),
|
||||
_cardinality_block(card),
|
||||
]
|
||||
note = _flag_note(card)
|
||||
if note is not None:
|
||||
blocks.append(note)
|
||||
topk = _topk_table(cat)
|
||||
if topk is not None:
|
||||
blocks.append(topk)
|
||||
blocks.append(model.Figure(
|
||||
col_blocks.append(note)
|
||||
# For id-like columns (≈100% distinct) the top-k is a list of unique
|
||||
# values — pure noise; skip it (the flag note already explains why) and
|
||||
# let the donut take that room so the whole column fits one page/slide.
|
||||
if not card.get("id_like"):
|
||||
topk = _topk_table(cat)
|
||||
if topk is not None:
|
||||
col_blocks.append(topk)
|
||||
col_blocks.append(model.Figure(
|
||||
make=_pie_make(cat.get("top") or [], card.get("n_distinct"),
|
||||
str(name), n_rows),
|
||||
caption=(f"Categorías más comunes de «{_truncate(name, 32)}» "
|
||||
"(donut: top-k + «Otros»)")))
|
||||
blocks.append(model.Group(blocks=col_blocks,
|
||||
page_break_before=(idx > 0)))
|
||||
|
||||
if len(cat_cols) > len(rendered):
|
||||
omitted = len(cat_cols) - len(rendered)
|
||||
|
||||
@@ -2,11 +2,14 @@
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies that ``build_cat_distr`` emits the blocks the user
|
||||
asked for (entropy intro, distinct/total/%-distinct/unique metrics, top-k table
|
||||
and a donut figure), that the chapter renders inside the full document to both
|
||||
PDF and PPTX showing that content, that a profile with no categorical columns
|
||||
yields ``None`` without raising, and that long labels / many columns are never
|
||||
cut in either output.
|
||||
asked for (distinct/total/%-distinct/unique metrics, top-k table and a donut
|
||||
figure), that EACH categorical column is wrapped in its own keep-together
|
||||
``Group`` that starts on a fresh page/slide (one column per page, chart next to
|
||||
its tables), that the long entropy explanation is NOT repeated inline (it lives
|
||||
in the glossary — only the clickable term is kept), that the chapter renders
|
||||
inside the full document to both PDF and PPTX showing that content, that a
|
||||
profile with no categorical columns yields ``None`` without raising, and that
|
||||
long labels / many columns are never cut in either output.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -17,7 +20,8 @@ from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import (
|
||||
DataTable, Figure, Heading, KVTable, Note,
|
||||
DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown,
|
||||
Note,
|
||||
)
|
||||
from datascience.automatic_eda.chapters.cat_distr import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_cat_distr,
|
||||
@@ -81,8 +85,20 @@ def _pptx_text(path: str) -> str:
|
||||
return re.sub(r"\s+", " ", " ".join(parts))
|
||||
|
||||
|
||||
def _kinds(chapter):
|
||||
return [b.kind for b in chapter.blocks]
|
||||
def _flatten(blocks):
|
||||
"""Expand keep-together Groups so the per-column heading/table/figure are
|
||||
inspectable as a flat block list (the chapter wraps each column in a Group)."""
|
||||
out = []
|
||||
for b in blocks:
|
||||
if getattr(b, "kind", "") == "group":
|
||||
out.extend(_flatten(getattr(b, "blocks", []) or []))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
def _column_groups(chapter):
|
||||
return [b for b in chapter.blocks if isinstance(b, Group)]
|
||||
|
||||
|
||||
def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
@@ -90,36 +106,101 @@ def test_golden_build_cat_distr_emite_bloques_pedidos():
|
||||
assert ch is not None
|
||||
assert ch.id == CHAPTER_ID
|
||||
assert ch.version == CHAPTER_VERSION
|
||||
kinds = _kinds(ch)
|
||||
# Entropy intro present.
|
||||
|
||||
# Entropy intro present, but the long explanation is gone (it lives in the
|
||||
# glossary now): only the term is named, no log2/normalizada walkthrough.
|
||||
headings = [b.text for b in ch.blocks if isinstance(b, Heading)]
|
||||
assert any("Entrop" in h for h in headings)
|
||||
md = next(b for b in ch.blocks if b.kind == "markdown")
|
||||
assert "entropía" in md.text.lower() and "log2" in md.text
|
||||
# Cardinality metrics: distinct, total rows, %-distinct, unique values.
|
||||
kv = next(b for b in ch.blocks if isinstance(b, KVTable))
|
||||
md = next(b for b in ch.blocks if isinstance(b, Markdown))
|
||||
assert "entropía" in md.text.lower()
|
||||
assert "log2" not in md.text # redundant explanation removed.
|
||||
assert "máxima diversidad" not in md.text
|
||||
|
||||
# Per-column blocks are wrapped in keep-together Groups: flatten to inspect.
|
||||
flat = _flatten(ch.blocks)
|
||||
kv = next(b for b in flat if isinstance(b, KVTable))
|
||||
labels = [r[0] for r in kv.rows]
|
||||
assert "Valores distintos" in labels
|
||||
assert "% distintos" in labels
|
||||
values = " ".join(str(r[1]) for r in kv.rows)
|
||||
# Cardinality metrics: distinct count, %-distinct, unique values and total
|
||||
# rows are present (grouped onto compact rows so the chart fits the page).
|
||||
assert "Distintos · % · únicos" in labels
|
||||
assert "Total filas (dataset)" in labels
|
||||
assert "Valores únicos (frecuencia 1)" in labels
|
||||
assert any("Entropía" in lbl for lbl in labels)
|
||||
assert "únicos" in values and "%" in values
|
||||
assert "bits" in values and "norm" in values # entropy + max + normalized.
|
||||
# Top-k table + pie figure.
|
||||
dt = next(b for b in ch.blocks if isinstance(b, DataTable))
|
||||
dt = next(b for b in flat if isinstance(b, DataTable))
|
||||
assert dt.header == ["Valor", "Conteo", "%"]
|
||||
assert any("neumaticos" in str(cell) for row in dt.rows for cell in row)
|
||||
assert any(isinstance(b, Figure) for b in ch.blocks)
|
||||
# id-like column flagged with a Note.
|
||||
assert any(isinstance(b, Note) and "identificador" in b.text
|
||||
for b in ch.blocks)
|
||||
assert any(isinstance(b, Figure) for b in flat)
|
||||
# id-like column flagged with a Note that also explains the top-k is dropped.
|
||||
idnote = next((b for b in flat
|
||||
if isinstance(b, Note) and "identificador" in b.text), None)
|
||||
assert idnote is not None
|
||||
assert "No se lista el top" in idnote.text
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_categoricas():
|
||||
def test_golden_idlike_omite_topk_y_conserva_donut():
|
||||
# The id-like column (uuid, 100% distinct) must NOT carry a top-k DataTable
|
||||
# (it would be a list of unique values), but must still keep its donut Figure
|
||||
# and its cardinality table so it stays a full per-column page.
|
||||
ch = build_cat_distr(_profile(), {})
|
||||
groups = _column_groups(ch)
|
||||
uuid_group = next(g for g in groups
|
||||
if any(getattr(b, "text", "") == "uuid" for b in g.blocks))
|
||||
kinds = [b.kind for b in uuid_group.blocks]
|
||||
assert "data_table" not in kinds # top-k of unique values dropped.
|
||||
assert "kv_table" in kinds # cardinality kept.
|
||||
assert "figure" in kinds # donut kept (chart per column).
|
||||
# A non-id-like column keeps its top-k table.
|
||||
cat_group = next(g for g in groups
|
||||
if any(getattr(b, "text", "") == "categoria"
|
||||
for b in g.blocks))
|
||||
assert "data_table" in [b.kind for b in cat_group.blocks]
|
||||
|
||||
|
||||
def test_golden_una_pagina_por_columna_groups():
|
||||
ch = build_cat_distr(_profile(), {})
|
||||
groups = _column_groups(ch)
|
||||
# Two categorical columns -> two column Groups (numeric column excluded).
|
||||
assert len(groups) == 2
|
||||
# Each Group carries one column: a heading + its cardinality table + figure.
|
||||
for g in groups:
|
||||
kinds = [b.kind for b in g.blocks]
|
||||
assert kinds[0] == "heading"
|
||||
assert "kv_table" in kinds
|
||||
assert "figure" in kinds
|
||||
# The first column may share the intro page (no forced break); every later
|
||||
# column starts on a fresh page/slide so each column gets its own page.
|
||||
assert groups[0].page_break_before is False
|
||||
assert all(g.page_break_before is True for g in groups[1:])
|
||||
|
||||
|
||||
def test_golden_entropia_clicable_y_definicion_en_glosario():
    """The entropy term is marked clickable and its definition moves to the glossary.

    With a glossary collector the intro marks the clickable term and the FULL
    definition (the long explanation removed from the intro) lands in the
    glossary, not inline — no data lost, just relocated.
    """
    gc = GlossaryCollector()
    ch = build_cat_distr(_profile(), {"glossary": gc})

    md = next(b for b in ch.blocks if isinstance(b, Markdown))
    assert "[[term:entropia]]entropía[[/term]]" in md.text
    assert gc.has("entropia")

    entry = gc.get("entropia")
    assert entry is not None
    # The definition kept in the glossary still carries the detail removed inline.
    assert "log2" in entry["definition"]
    assert "normalizada" in entry["definition"].lower()
|
||||
|
||||
|
||||
def test_golden_render_pdf_una_pagina_por_columna():
    """The rendered PDF gives the categorical chapter at least one page per column."""
    with tempfile.TemporaryDirectory() as d:
        out = os.path.join(d, "eda.pdf")
        res = render_automatic_eda_pdf(_profile(), out, {"title": "EDA"})
        # Split so a failure says which half broke (path vs. file on disk).
        assert res["path"] == out
        assert os.path.exists(out)
        assert CHAPTER_ID in [c["id"] for c in res["chapters"]]

        cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID)
        # Two categorical columns, each on its own page -> >= 2 pages for the
        # chapter (intro shares the first column's page).
        assert cat_meta["n_pages"] >= 2

        txt = _pdf_text(out)
        assert "Entrop" in txt
        assert "distintos" in txt
|
||||
@@ -133,13 +214,91 @@ def test_golden_render_pptx_muestra_categoricas():
|
||||
out = os.path.join(d, "eda.pptx")
|
||||
res = render_automatic_eda_pptx(_profile(), out, {"title": "EDA"})
|
||||
assert res["path"] == out and os.path.exists(out)
|
||||
assert CHAPTER_ID in [c["id"] for c in res["chapters"]]
|
||||
cat_meta = next(c for c in res["chapters"] if c["id"] == CHAPTER_ID)
|
||||
assert cat_meta["n_slides"] >= 2 # one slide per categorical column.
|
||||
txt = _pptx_text(out)
|
||||
assert "Entrop" in txt
|
||||
assert "categoria" in txt and "neumaticos" in txt
|
||||
assert "distintos" in txt
|
||||
|
||||
|
||||
def _profile_high_card() -> dict:
|
||||
"""Profile with a high-cardinality NON-id-like categorical column whose top-k
|
||||
of long values would split from its donut on a short 16:9 slide unless the
|
||||
renderer trims the table — the exact case the adversarial check flagged
|
||||
(Ticket / Cabin)."""
|
||||
long_vals = [f"Valor largo de categoria numero {i:02d} con texto extra"
|
||||
for i in range(40)]
|
||||
top = [{"value": v, "count": 60 - i, "pct": (60 - i) / 5000.0}
|
||||
for i, v in enumerate(long_vals)]
|
||||
return {
|
||||
"table": "t", "source": "t.csv", "n_rows": 5000, "n_cols": 3,
|
||||
"quality_score": 80.0,
|
||||
"columns": [
|
||||
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.0,
|
||||
"numeric": {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
|
||||
"std": 0.5}},
|
||||
# 40 distinct over 5000 rows = 0.8% distinct -> NOT id-like, keeps
|
||||
# its (long) top-k table; the tall table must not push the donut off.
|
||||
{"name": "alta_card_col", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "distinct_count": 40,
|
||||
"categorical": {"top": top, "mode": long_vals[0], "n_distinct": 40,
|
||||
"entropy": 5.2, "imbalance": 1.2, "len_min": 40,
|
||||
"len_mean": 45, "len_max": 50}},
|
||||
{"name": "baja_card_col", "inferred_type": "categorical",
|
||||
"null_pct": 0.0, "distinct_count": 4,
|
||||
"categorical": {
|
||||
"top": [{"value": "norte", "count": 2000, "pct": 0.4},
|
||||
{"value": "sur", "count": 1500, "pct": 0.3},
|
||||
{"value": "este", "count": 1000, "pct": 0.2},
|
||||
{"value": "oeste", "count": 500, "pct": 0.1}],
|
||||
"mode": "norte", "n_distinct": 4, "entropy": 1.8}},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def test_golden_pptx_una_slide_por_columna_con_su_grafico():
    """Each categorical column occupies EXACTLY ONE cat_distr slide.

    That slide carries BOTH its cardinality table and its donut figure
    (picture) — i.e. the chart is never separated from its table, even for a
    high-cardinality column.
    """
    from pptx.enum.shapes import MSO_SHAPE_TYPE

    prof = _profile_high_card()
    cat_names = ["alta_card_col", "baja_card_col"]
    with tempfile.TemporaryDirectory() as d:
        out = os.path.join(d, "eda.pptx")
        res = render_automatic_eda_pptx(prof, out, {"title": "EDA"})
        # Split so a failure says which half broke (path vs. file on disk).
        assert res["path"] == out
        assert os.path.exists(out)
        prs = Presentation(out)

        # Per column: the cat_distr slides whose text mentions it, and whether
        # the owning slide also has the donut caption + an actual picture shape.
        slides_with_col = {n: [] for n in cat_names}
        owner_has_chart = {n: False for n in cat_names}
        for i, sl in enumerate(prs.slides):
            texts, has_pic = [], False
            for sh in sl.shapes:
                if sh.has_text_frame:
                    texts.append(sh.text_frame.text)
                if sh.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    has_pic = True
            txt = re.sub(r"\s+", " ", " ".join(texts))
            if "Distribuciones categ" not in txt:  # footer stamp of the chapter.
                continue
            for n in cat_names:
                if n in txt:
                    slides_with_col[n].append(i)
                    has_table = "Cardinalidad" in txt or "distintos" in txt
                    if has_pic and "donut" in txt and has_table:
                        owner_has_chart[n] = True

        for n in cat_names:
            # Exactly one slide carries the column (not split across slides).
            assert len(slides_with_col[n]) == 1, (n, slides_with_col[n])
            # That single slide also holds its table AND its donut picture.
            assert owner_has_chart[n], (n, "tabla y donut no están en el mismo slide")
|
||||
|
||||
|
||||
def test_edge_sin_categoricas_devuelve_none():
|
||||
only_numeric = {
|
||||
"n_rows": 10, "columns": [
|
||||
@@ -170,11 +329,15 @@ def test_anti_corte_label_largo_y_muchas_columnas():
|
||||
|
||||
ch = build_cat_distr(profile, {})
|
||||
assert ch is not None
|
||||
# One Group per column, each forcing its own page (except the first).
|
||||
groups = _column_groups(ch)
|
||||
assert len(groups) == 30
|
||||
assert sum(1 for g in groups if g.page_break_before) == 29
|
||||
with tempfile.TemporaryDirectory() as d:
|
||||
pdf = os.path.join(d, "anti.pdf")
|
||||
res = render_automatic_eda_pdf(profile, pdf, {"write_manifest": False})
|
||||
assert res["path"] == pdf
|
||||
assert res["n_pages"] > 1 # many columns spilled across pages, OK.
|
||||
assert res["n_pages"] > 1 # one page per column, OK.
|
||||
txt = _pdf_text(pdf)
|
||||
# Long label wrapped (not truncated): every word survives.
|
||||
for word in ("Lorem", "incididunt", "reprehenderit", "voluptate"):
|
||||
|
||||
@@ -31,7 +31,7 @@ import math
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_ID = "correlacion"
|
||||
CHAPTER_TITLE = "Correlación"
|
||||
|
||||
@@ -47,6 +47,60 @@ _MAX_MATRIX_LABELS = 16
|
||||
# How many pairs to show in each of the top-positive / top-negative tables.
|
||||
_TOP_N = 10
|
||||
|
||||
# How many of the strongest numeric-numeric pairs to draw as scatter plots on
|
||||
# each sign (positive / negative). A scatter per pair carries a fitted line/curve
|
||||
# and a relationship-type label; keeping the count small keeps the chapter
|
||||
# readable on a phone / a slide. Only signed (Pearson/Spearman) pairs qualify —
|
||||
# Cramér's V / correlation ratio pairs are not numeric-numeric, so no scatter.
|
||||
_SCATTER_TOP_N = 3
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance in the
|
||||
# body — the canonical two-step pattern (see ``cat_distr`` for the reference
|
||||
# implementation): ``glossary.add(key, label, definition)`` + the inline span
|
||||
# ``[[term:KEY]]texto visible[[/term]]`` in a Markdown block. Mapping key ->
|
||||
# (label, definition). ``fdr`` is only registered when the FDR summary is present.
|
||||
_TERM_DEFS = {
|
||||
"pearson": (
|
||||
"Pearson (coeficiente r)",
|
||||
"Coeficiente de correlación lineal de Pearson (r) entre dos variables "
|
||||
"numéricas. Va de −1 (relación lineal inversa perfecta) a +1 (directa "
|
||||
"perfecta); 0 indica ausencia de relación lineal. Sólo capta relaciones "
|
||||
"lineales, por eso lleva signo."),
|
||||
"spearman": (
|
||||
"Spearman (correlación de rangos)",
|
||||
"Correlación de rangos de Spearman: el coeficiente de Pearson calculado "
|
||||
"sobre los puestos (rangos) de los valores en vez de sus magnitudes. Mide "
|
||||
"relaciones monótonas (no necesariamente lineales), va de −1 a +1 y es "
|
||||
"robusta frente a valores atípicos."),
|
||||
"cramers_v": (
|
||||
"Cramér's V",
|
||||
"Medida de asociación entre dos variables categóricas, derivada del "
|
||||
"estadístico chi-cuadrado y normalizada al rango 0–1 (0 = independientes, "
|
||||
"1 = asociación total). No tiene signo: sólo mide la intensidad."),
|
||||
"correlation_ratio": (
|
||||
"Razón de correlación (η)",
|
||||
"Razón de correlación (eta) entre una variable numérica y una "
|
||||
"categórica: la fracción de la varianza de la numérica explicada por los "
|
||||
"grupos de la categórica. Va de 0 (los grupos no explican nada) a 1 (la "
|
||||
"explican toda); no tiene signo."),
|
||||
"fdr": (
|
||||
"Comparaciones múltiples (FDR)",
|
||||
"Al evaluar muchos pares a la vez, algunos parecen significativos por "
|
||||
"puro azar. La corrección por tasa de falsos descubrimientos (FDR, "
|
||||
"Benjamini-Hochberg) ajusta los p-valores para controlar la proporción "
|
||||
"esperada de falsos positivos entre los pares declarados significativos."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
the marker), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _is_num(v) -> bool:
|
||||
"""True for a real, finite int/float (not bool, not NaN/inf)."""
|
||||
@@ -245,7 +299,7 @@ def _methods_block(corr: dict):
|
||||
return model.KVTable(rows=rows, title="Métodos de asociación")
|
||||
|
||||
|
||||
def _fdr_text(corr: dict) -> str | None:
|
||||
def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
|
||||
"""One-line summary of the multiple-testing (FDR) correction, or None."""
|
||||
mt = corr.get("multiple_testing")
|
||||
if not isinstance(mt, dict) or not mt:
|
||||
@@ -254,7 +308,8 @@ def _fdr_text(corr: dict) -> str | None:
|
||||
alpha = mt.get("alpha")
|
||||
n_tests = mt.get("n_tests")
|
||||
n_rej = mt.get("n_rejected")
|
||||
parts = [f"Corrección por comparaciones múltiples ({method}"]
|
||||
multi = _term(mark_term, "fdr", "comparaciones múltiples")
|
||||
parts = [f"Corrección por {multi} ({method}"]
|
||||
if _is_num(alpha):
|
||||
parts[0] += f", α={float(alpha):g}"
|
||||
parts[0] += ")."
|
||||
@@ -266,6 +321,139 @@ def _fdr_text(corr: dict) -> str | None:
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
def _is_seq(values) -> bool:
|
||||
"""True for a non-empty list/tuple of values (a raw numeric column)."""
|
||||
return isinstance(values, (list, tuple)) and len(values) > 0
|
||||
|
||||
|
||||
def _select_scatter_pairs(pairs: list, top_n: int = _SCATTER_TOP_N):
    """Pick the strongest numeric-numeric pairs to draw as scatters.

    Only signed (Pearson/Spearman) pairs are numeric-numeric and thus eligible
    for a scatter with a fitted curve. Mixed-type metrics (Cramér's V,
    correlation ratio, mutual information) are excluded — they have no x/y
    scatter interpretation.

    Args:
        pairs: Iterable of pair dicts; entries that are not dicts, are not
            signed (per ``_is_signed``) or whose ``value`` is not a finite
            number are ignored. ``None`` is treated as empty.
        top_n: Maximum number of pairs kept per sign. ``None`` keeps all;
            negative values are clamped to 0.

    Returns:
        Up to ``top_n`` of the strongest positive pairs followed by up to
        ``top_n`` of the strongest negative ones, each ranked by magnitude.
        Pairs with a value of exactly 0 are never selected.
    """
    # A negative cap would otherwise slice ``[:-k]`` and silently drop the
    # weakest pairs instead of returning none.
    limit = None if top_n is None else max(top_n, 0)

    positive = []
    negative = []
    for pair in pairs or ():
        if not isinstance(pair, dict) or not _is_signed(pair):
            continue
        value = pair.get("value")
        if not _is_num(value):
            continue
        if value > 0:
            positive.append(pair)
        elif value < 0:
            negative.append(pair)

    def _magnitude(pair) -> float:
        return abs(float(pair.get("value", 0.0)))

    # list.sort is stable, so equal magnitudes keep their input order.
    positive.sort(key=_magnitude, reverse=True)
    negative.sort(key=_magnitude, reverse=True)
    return positive[:limit] + negative[:limit]
|
||||
|
||||
|
||||
def _classification_note(a: str, b: str, cls: dict) -> str:
    """Build the human-readable sentence describing the relationship of a pair.

    Plain text (not baked into the figure image) so the type label is selectable
    in the PDF / extractable by pdftotext, and sits right next to its scatter
    inside the keep-together Group.

    Args:
        a: Label of the first column.
        b: Label of the second column.
        cls: Classification dict; the keys read are ``tipo``, ``pearson``,
            ``spearman``, ``r2_linear``, ``r2_poly2`` and ``r2_poly3``. Anything
            that is not a dict is treated as empty.

    Returns:
        A Markdown sentence naming the relationship type, followed by whichever
        of the numeric metrics are present.
    """
    if not isinstance(cls, dict):
        cls = {}
    tipo = model._safe_str(cls.get("tipo")) or "sin forma clara"

    # Report the better of the two polynomial fits, if any is numeric.
    poly_scores = [
        float(cls.get(key))
        for key in ("r2_poly2", "r2_poly3")
        if _is_num(cls.get(key))
    ]
    r2_poly = max(poly_scores) if poly_scores else None

    bits = []
    pearson = cls.get("pearson")
    if _is_num(pearson):
        bits.append(f"Pearson r={float(pearson):+.2f}")
    spearman = cls.get("spearman")
    if _is_num(spearman):
        bits.append(f"Spearman ρ={float(spearman):+.2f}")
    r2_lin = cls.get("r2_linear")
    if _is_num(r2_lin):
        bits.append(f"R² lineal={float(r2_lin):.2f}")
    if r2_poly is not None:
        bits.append(f"R² polinómico={r2_poly:.2f}")

    metrics = "; ".join(bits)
    text = f"Relación **{tipo}** entre «{a}» y «{b}»."
    if metrics:
        text += f" {metrics}."
    return text
|
||||
|
||||
|
||||
def _scatter_blocks(pairs: list, raw_numeric):
    """Build keep-together scatter Groups for the strongest num-num pairs.

    Args:
        pairs: Pair dicts; the strongest signed ones are chosen through
            ``_select_scatter_pairs``.
        raw_numeric: Mapping of column name -> raw values (list/tuple). Anything
            that is not a non-empty dict disables the scatters.

    Returns:
        A list of blocks (a Heading and an intro Markdown, then one Group per
        pair), or an empty list when there is no raw numeric data (e.g. the lite
        profile drops ``ctx['raw_numeric']`` to skip live recomputation), no
        pair qualifies, or the relationship helpers are unavailable. Failures
        while resolving or classifying a pair skip that pair only, leaving the
        matrix + tables intact.
    """
    if not isinstance(raw_numeric, dict) or not raw_numeric:
        return []
    selected = _select_scatter_pairs(pairs)
    if not selected:
        return []

    # The relationship helpers live in the datascience package. Import lazily so
    # the chapter still builds (matrix + tables) when they are absent.
    try:
        from datascience.classify_relationship_type import (
            classify_relationship_type,
        )
        from datascience.relationship_scatter_figure import (
            relationship_scatter_figure,
        )
    except Exception:  # noqa: BLE001 — degrade, never break the chapter.
        return []

    groups = []
    for pair in selected:
        a = pair.get("a")
        b = pair.get("b")
        try:
            xs = raw_numeric.get(a)
            ys = raw_numeric.get(b)
        except TypeError:
            # Unhashable column name (e.g. a list): it cannot be a key of
            # raw_numeric, so there is no raw column to plot for this pair.
            continue
        # Edge: a selected pair has no raw column (aggregated profile, renamed
        # column, …) — skip just that pair, keep the rest.
        if not _is_seq(xs) or not _is_seq(ys):
            continue
        try:
            cls = classify_relationship_type(list(xs), list(ys)) or {}
        except Exception:  # noqa: BLE001
            continue
        a_lbl = model._safe_str(a)
        b_lbl = model._safe_str(b)

        # Default arguments bind this iteration's values; a plain closure would
        # see the last pair by the time the figure is rendered lazily.
        def _make(xs=xs, ys=ys, a_lbl=a_lbl, b_lbl=b_lbl, cls=cls):
            return relationship_scatter_figure(
                list(xs), list(ys), x_label=a_lbl, y_label=b_lbl,
                classification=cls)

        groups.append(model.Group(blocks=[
            model.Heading(text=f"{a_lbl} ↔ {b_lbl}", level=2),
            model.Figure(
                make=_make,
                caption=(f"Dispersión de «{a_lbl}» frente a «{b_lbl}» con la "
                         "curva de ajuste del mejor modelo.")),
            model.Markdown(text=_classification_note(a_lbl, b_lbl, cls)),
        ]))

    if not groups:
        return []
    intro = model.Markdown(text=(
        "Para los pares numéricos más fuertes (positivos y negativos) se dibuja "
        "la nube de puntos con su ajuste y se clasifica el **tipo de relación**: "
        "**lineal** (una recta basta), **polinómica** (curva de grado 2/3 que "
        "mejora claramente el ajuste lineal), **monótona no-lineal** (crece o "
        "decrece siempre pero no en línea recta; Spearman ≫ Pearson) o "
        "**débil/sin forma**."))
    return [model.Heading(text="Relaciones más fuertes (scatter)", level=2),
            intro] + groups
|
||||
|
||||
|
||||
def build_correlacion(profile: dict, ctx: dict):
|
||||
"""Build the Correlation Chapter, or None if there are no pairs to show.
|
||||
|
||||
@@ -289,13 +477,30 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
|
||||
blocks: list = []
|
||||
|
||||
# Intro: what this chapter shows and how to read the sign.
|
||||
# Register the always-present method terms in the shared glossary and mark
|
||||
# their first appearance clickable (the FDR term is registered lazily below,
|
||||
# only when the FDR summary is actually emitted). Degrades silently when no
|
||||
# collector is in ctx (standalone render) — mark_term stays False.
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
if gloss is not None:
|
||||
for key in ("pearson", "spearman", "cramers_v", "correlation_ratio"):
|
||||
label, definition = _TERM_DEFS[key]
|
||||
gloss.add(key, label, definition)
|
||||
|
||||
# Intro: what this chapter shows and how to read the sign. Build the marked
|
||||
# method names as locals first (avoids backslash-in-f-string for "Cramér's V").
|
||||
t_pearson = _term(mark_term, "pearson", "Pearson")
|
||||
t_spearman = _term(mark_term, "spearman", "Spearman")
|
||||
t_cramers = _term(mark_term, "cramers_v", "Cramér's V")
|
||||
t_corr_ratio = _term(mark_term, "correlation_ratio", "razón de correlación")
|
||||
blocks.append(model.Markdown(text=(
|
||||
"Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
|
||||
"sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
|
||||
"entre categóricas; razón de correlación num-categórica; información mutua "
|
||||
"como medida común no lineal). Sólo las correlaciones **num-num** tienen "
|
||||
"dirección: por eso los pares **negativos** son siempre num-num.")))
|
||||
"Asociación entre columnas. Cada par se evalúa con la métrica adecuada "
|
||||
f"a sus tipos: {t_pearson}/{t_spearman} (numéricas), {t_cramers} "
|
||||
f"(categóricas), {t_corr_ratio} (num-categórica) e información mutua. "
|
||||
"Sólo las correlaciones **num-num** llevan **signo** (dirección): por "
|
||||
"eso los pares **negativos** son siempre num-num.")))
|
||||
|
||||
# 1) Association matrix (heatmap).
|
||||
labels, trimmed = _ordered_labels(pairs)
|
||||
@@ -327,6 +532,18 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
"No se han hallado correlaciones negativas significativas entre "
|
||||
"columnas numéricas.")))
|
||||
|
||||
# 2.5) Scatter plots of the strongest numeric-numeric pairs, each with its
|
||||
# fitted curve and a relationship-type label (lineal / polinómica / monótona
|
||||
# / débil). Needs the raw numeric sample (ctx['raw_numeric'], row-aligned);
|
||||
# when it is absent (aggregated/lite profile) the scatters are simply omitted
|
||||
# and the matrix + tables above stand on their own.
|
||||
raw_numeric = None
|
||||
if isinstance(ctx, dict):
|
||||
raw_numeric = ctx.get("raw_numeric") or profile.get("raw_numeric")
|
||||
else:
|
||||
raw_numeric = profile.get("raw_numeric")
|
||||
blocks.extend(_scatter_blocks(pairs, raw_numeric))
|
||||
|
||||
# 3) Spuriousness caveat for level-based correlations (Granger–Newbold).
|
||||
caveat = corr.get("levels_caveat")
|
||||
if isinstance(caveat, str) and caveat.strip():
|
||||
@@ -337,9 +554,13 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
"no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
|
||||
"sobre los retornos/diferencias antes de interpretarlas.")))
|
||||
|
||||
# 4) FDR summary + methods legend.
|
||||
fdr_text = _fdr_text(corr)
|
||||
# 4) FDR summary + methods legend. Register the FDR term only when its
|
||||
# summary is emitted, so the glossary never lists an unreferenced entry.
|
||||
fdr_text = _fdr_text(corr, mark_term=mark_term)
|
||||
if fdr_text:
|
||||
if gloss is not None:
|
||||
label, definition = _TERM_DEFS["fdr"]
|
||||
gloss.add("fdr", label, definition)
|
||||
blocks.append(model.Markdown(text=fdr_text))
|
||||
methods = _methods_block(corr)
|
||||
if methods is not None:
|
||||
|
||||
@@ -173,3 +173,124 @@ def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
|
||||
assert rx["path"] == pptx and os.path.exists(pptx) and rx["n_slides"] >= 1
|
||||
# A short, unbreakable fragment of the long label survives the wrap.
|
||||
assert "azufre" in _pdf_text(pdf)
|
||||
|
||||
|
||||
def _raw_numeric_for_profile(n: int = 80) -> dict:
|
||||
"""Row-aligned raw numeric sample matching the signed pairs of _profile().
|
||||
|
||||
Builds columns with a clear, deterministic shape so the relationship-type
|
||||
classifier has something unambiguous to label:
|
||||
- density vs alcohol: strong negative linear (the top-negative pair).
|
||||
- alcohol vs quality: positive linear.
|
||||
- ph, fixed_acidity, sulphates: filler columns for the remaining pairs.
|
||||
"""
|
||||
import math as _m
|
||||
|
||||
alcohol = [8.0 + 0.05 * i for i in range(n)]
|
||||
density = [1.0 - 0.002 * a for a in alcohol] # neg linear vs alcohol
|
||||
quality = [3.0 + 0.4 * a + (0.1 if i % 2 else -0.1) # pos linear vs alcohol
|
||||
for i, a in enumerate(alcohol)]
|
||||
ph = [3.0 + 0.3 * _m.sin(i / 5.0) for i in range(n)]
|
||||
fixed_acidity = [7.0 - 0.5 * p for p in ph] # neg linear vs ph
|
||||
sulphates = [0.5 + 0.01 * (i % 7) for i in range(n)]
|
||||
return {
|
||||
"alcohol": alcohol, "density": density, "quality": quality,
|
||||
"ph": ph, "fixed_acidity": fixed_acidity, "sulphates": sulphates,
|
||||
}
|
||||
|
||||
|
||||
def test_golden_scatters_de_pares_num_num_con_tipo_de_relacion():
    """With ctx['raw_numeric'] the chapter adds scatters for the strongest pairs.

    Each scatter is a Figure inside a Group for a num-num pair, accompanied by
    its relationship-type label as text.
    """
    from datascience.automatic_eda.model import Group

    ctx = {"raw_numeric": _raw_numeric_for_profile()}
    ch = build_correlacion(_profile(), ctx)
    assert ch is not None
    groups = [b for b in ch.blocks if isinstance(b, Group)]
    assert groups, "debe emitir al menos un Group con scatter"

    # Every Group carries its (lazy) figure and a text note with the type.
    for g in groups:
        gkinds = [b.kind for b in g.blocks]
        assert "figure" in gkinds
        assert "markdown" in gkinds

    # The section title and the type label appear as plain (extractable) text.
    headings = " ".join(b.text for b in ch.blocks if b.kind == "heading")
    assert "Relaciones más fuertes" in headings
    body = " ".join(
        b.text for g in groups for b in g.blocks if b.kind == "markdown")
    assert any(t in body for t in
               ("lineal", "polinómica", "monótona", "sin forma"))

    # The strongest num-num pair (density ↔ alcohol) has a scatter; the cat-cat
    # pair (region ↔ type) does NOT — it is not numeric.
    assert "density" in body or "alcohol" in body
    assert "region" not in body
    assert "type" not in body
|
||||
|
||||
|
||||
def test_golden_pdf_muestra_scatters_con_etiqueta_de_tipo():
    """The PDF's Correlation chapter includes the scatters and their type label.

    The label must be selectable text (found by the PDF text extraction).
    """
    prof = _profile()
    ctx = {"raw_numeric": _raw_numeric_for_profile()}
    with tempfile.TemporaryDirectory() as d:
        pdf = os.path.join(d, "corr_scatter.pdf")
        rp = render_automatic_eda_pdf(
            prof, pdf, {"title": "EDA — wine", "ctx": ctx})
        # Split so a failure says which half broke.
        assert rp["path"] == pdf
        assert rp["n_pages"] >= 1

        txt = _pdf_text(pdf)
        assert "Relaciones" in txt
        assert "scatter" in txt.lower()
        # Some relationship-type label, as text.
        assert any(t in txt for t in
                   ("lineal", "polin", "monóton", "monoton", "sin forma"))
|
||||
|
||||
|
||||
def test_edge_sin_raw_numeric_omite_scatters_sin_lanzar():
    """Without raw_numeric the chapter omits the scatters and does not raise.

    Covers a lite profile / ``ctx`` of None: the matrix and the tables must
    still be emitted.
    """
    from datascience.automatic_eda.model import Group

    for ctx in (None, {}, {"raw_numeric": None}, {"raw_numeric": {}}):
        ch = build_correlacion(_profile(), ctx)
        assert ch is not None
        assert not [b for b in ch.blocks if isinstance(b, Group)]
        # The matrix and at least one top table are still present.
        assert any(b.kind == "figure" for b in ch.blocks)
        assert any(b.kind == "data_table" for b in ch.blocks)
|
||||
|
||||
|
||||
def test_edge_par_sin_columna_cruda_se_omite_sin_lanzar():
    """A selected pair whose column is missing from raw_numeric is skipped.

    Building must not raise; the remaining scatters are built as usual.
    """
    from datascience.automatic_eda.model import Group

    raw = _raw_numeric_for_profile()
    raw.pop("density", None)  # breaks the density ↔ alcohol pair
    ch = build_correlacion(_profile(), {"raw_numeric": raw})
    assert ch is not None

    groups = [b for b in ch.blocks if isinstance(b, Group)]
    body = " ".join(
        b.text for g in groups for b in g.blocks if b.kind == "markdown")
    # density disappears from the scatters; other pairs (e.g. ph↔fixed_acidity,
    # alcohol↔quality) may still be present without error.
    assert "density" not in body
|
||||
|
||||
|
||||
def test_glosario_engancha_metodos_y_fdr():
    """Correlation methods and FDR are registered in the glossary and marked.

    Improvement 4b: the correlation methods (Pearson, Spearman, Cramér's V,
    correlation ratio) and the multiple-comparisons correction (FDR) are
    registered in the shared collector and marked clickable in the body.
    Without a collector in ctx, the chapter degrades and marks nothing.
    """
    from datascience.automatic_eda.model import GlossaryCollector

    expected = ("pearson", "spearman", "cramers_v", "correlation_ratio", "fdr")

    g = GlossaryCollector()
    ch = build_correlacion(_profile(), {"glossary": g})
    assert ch is not None
    keys = {t["key"] for t in g.terms()}
    assert set(expected) <= keys
    body = " ".join(b.text for b in ch.blocks if b.kind == "markdown")
    for k in expected:
        assert f"[[term:{k}]]" in body, k

    # Without a collector: clean degradation (no marker in the body).
    ch2 = build_correlacion(_profile(), {})
    body2 = " ".join(b.text for b in ch2.blocks if b.kind == "markdown")
    assert "[[term:" not in body2
|
||||
|
||||
@@ -0,0 +1,594 @@
|
||||
"""Missingness chapter (MISSINGNESS) — patterns of missing data.
|
||||
|
||||
Complements the CALIDAD chapter: where CALIDAD reports *how much* is missing per
|
||||
column (the null percentage that lowers the completeness score), this chapter
|
||||
reports the **pattern** of the missing data — whether columns tend to be missing
|
||||
*together* (co-occurrence of absences) or independently. That distinction is what
|
||||
separates data that is missing completely at random ([[term:mcar]]MCAR[[/term]])
|
||||
from data missing as a function of another variable ([[term:mar]]MAR[[/term]]),
|
||||
which is the key question to settle before imputing or modelling.
|
||||
|
||||
The chapter activates only when the table actually has missing data (at least one
|
||||
column with a null in the aggregated profile); otherwise it returns ``None`` and
|
||||
disappears from the document.
|
||||
|
||||
Sections, in order:
|
||||
|
||||
1. **Resumen global** — % of missing cells in the dataset, number of columns with
|
||||
nulls, and complete rows (no missing) vs incomplete rows (≥1 missing).
|
||||
2. **Ranking por columna** — columns sorted by their null percentage, with a
|
||||
horizontal bar figure.
|
||||
3. **Co-ocurrencia de ausencias** — the correlation of the binary is-null masks
|
||||
between columns (which columns tend to be missing together): a heatmap plus a
|
||||
table of the top column pairs that co-miss.
|
||||
4. **Patrones de fila** — the most frequent "which columns are missing together"
|
||||
row patterns, in the style of missingno's pattern matrix.
|
||||
5. **Lectura MCAR/MAR** — an interpretive, *exploratory* note (not a confirmatory
|
||||
test such as Little's) reading the absence correlations as a hint of MCAR
|
||||
(independent absences) vs MAR (co-occurring absences).
|
||||
|
||||
The aggregate per-column null counts come from the ``eda`` group ``TableProfile``
|
||||
(``columns[i]['null_count'] / 'null_pct'`` and the table-level ``null_cell_pct``).
|
||||
The per-row is-null mask needed for co-occurrence is built from raw data: a single
|
||||
DuckDB push-down over ``ctx['db_path'] / ctx['table']`` (same pattern as the
|
||||
AGREGACION chapter) covering ALL columns, with a fallback to the numeric-only
|
||||
``ctx['raw_numeric']`` when no database is reachable. All the heavy lifting is
|
||||
delegated to pure registry functions (``missingness_overview``,
|
||||
``missingness_correlation``, ``missingness_row_patterns``) and two figure helpers
|
||||
(``missingness_rank_bar_figure``, ``missingness_corr_heatmap_figure``); every one
|
||||
is imported lazily and degrades to an honest note so this chapter never raises.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "missingness"
|
||||
CHAPTER_TITLE = "Datos faltantes"
|
||||
|
||||
# Sample cap for the per-row is-null mask push-down. Co-occurrence and row
|
||||
# patterns are computed on this sample; the global % of missing cells and the
|
||||
# per-column ranking come from the (exact) aggregated profile instead.
|
||||
MASK_SAMPLE = 5000
|
||||
# Thresholds for the MCAR/MAR heuristic note. A pair counts as a *strong*
|
||||
# co-occurrence when the absence correlation alone is high; as a *partial*
|
||||
# co-occurrence when the absences overlap materially (high Jaccard) even if the
|
||||
# Pearson correlation is modest — the usual case when one column is missing far
|
||||
# more often than the other (e.g. Cabin 77% vs Age 20% in Titanic), which dilutes
|
||||
# the correlation while the rows still co-miss in absolute terms.
|
||||
_CORR_STRONG = 0.30
|
||||
_JACCARD_NOTABLE = 0.20
|
||||
# Rows shown in the top-pairs and row-patterns tables (bounded, never silently
|
||||
# truncated: the table note reports the full count).
|
||||
_TOP_PAIRS = 12
|
||||
_TOP_PATTERNS = 12
|
||||
# Truncate long column names in tables (the renderer also wraps).
|
||||
_LABEL_MAX = 28
|
||||
|
||||
# Glossary terms this chapter explains (contract §11.1). Registered in the shared
|
||||
# collector and marked clickable on their first appearance.
|
||||
_TERMS = {
|
||||
"missingness": (
|
||||
"Patrón de datos faltantes (missingness)",
|
||||
"El patrón con el que faltan los datos: cuánto falta, en qué columnas y "
|
||||
"si las ausencias de unas columnas coinciden (co-ocurren) con las de "
|
||||
"otras. Analizarlo —no solo contar nulos— distingue datos que faltan al "
|
||||
"azar (MCAR) de los que faltan en función de otra variable (MAR), lo que "
|
||||
"decide cómo imputar o si descartar filas sin sesgar el análisis.",
|
||||
),
|
||||
"mcar": (
|
||||
"MCAR (Missing Completely At Random)",
|
||||
"Los valores faltan de forma independiente de cualquier dato, observado o "
|
||||
"no: las ausencias de unas columnas no se relacionan entre sí ni con los "
|
||||
"valores. Es el caso más benigno —descartar filas o imputar la media no "
|
||||
"introduce sesgo—, pero rara vez se cumple del todo en datos reales.",
|
||||
),
|
||||
"mar": (
|
||||
"MAR (Missing At Random)",
|
||||
"La probabilidad de que un valor falte depende de OTRAS variables "
|
||||
"observadas (p. ej. una medición que falta más en cierto grupo). Las "
|
||||
"ausencias co-ocurren entre columnas o se relacionan con los valores de "
|
||||
"otras; imputar exige condicionar en esas variables para no sesgar. La "
|
||||
"co-ocurrencia fuerte de ausencias es un indicio (exploratorio) de MAR.",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Small defensive formatters (own copy: the chapter never imports siblings).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _fmt_int(value) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{int(round(float(value))):,}".replace(",", ".")
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _fmt_pct(value, decimals: int = 1) -> str:
|
||||
"""Format an already-0-100 value as a percentage. None -> placeholder."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value):.{decimals}f}%"
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
f = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if f != f: # NaN
|
||||
return "—"
|
||||
text = f"{f:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
|
||||
|
||||
def _truncate(text, limit: int = _LABEL_MAX) -> str:
    """Return ``text`` as a string, shortened with a trailing ``…`` if needed.

    The value is first converted with ``model._safe_str``. Strings no longer
    than ``limit`` are returned as is; longer ones keep their first
    ``max(1, limit - 1)`` characters (right-stripped) followed by ``…``.
    """
    label = model._safe_str(text)
    if len(label) > limit:
        keep = max(1, limit - 1)
        label = label[:keep].rstrip() + "…"
    return label
|
||||
|
||||
|
||||
def _term(key: str, label: str, mark: bool) -> str:
|
||||
if mark:
|
||||
return f"[[term:{key}]]**{label}**[[/term]]"
|
||||
return f"**{label}**"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Profile reads (exact, all rows).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _null_count_of(col: dict):
|
||||
"""Best-effort null count of a column: ``null_count`` or null_pct*n_rows."""
|
||||
nc = col.get("null_count")
|
||||
if isinstance(nc, (int, float)) and not isinstance(nc, bool):
|
||||
return int(nc)
|
||||
np_ = col.get("null_pct")
|
||||
nr = col.get("n_rows")
|
||||
if isinstance(np_, (int, float)) and isinstance(nr, (int, float)):
|
||||
return int(round(float(np_) * float(nr)))
|
||||
return 0
|
||||
|
||||
|
||||
def _columns_with_nulls(profile: dict):
    """List the columns that have nulls as ``(name, null_count, pct_0_100)``.

    Non-dict column entries and columns whose null count is not positive are
    skipped. The percentage comes from ``null_pct`` (values <= 1.0 are scaled
    by 100), else from the count over the column's or the profile's
    ``n_rows``, else it is ``None``. The result is sorted by percentage,
    highest first, with ``None`` percentages last.
    """
    def _percent(column, count):
        raw = column.get("null_pct")
        rows = column.get("n_rows") or profile.get("n_rows")
        if isinstance(raw, (int, float)) and not isinstance(raw, bool):
            if raw <= 1.0:
                return float(raw) * 100.0
            return float(raw)
        if rows:
            return count / float(rows) * 100.0
        return None

    def _sort_key(entry):
        pct = entry[2]
        return -1.0 if pct is None else pct

    found = []
    for column in profile.get("columns") or []:
        if not isinstance(column, dict):
            continue
        count = _null_count_of(column)
        if count > 0:
            found.append((column.get("name") or "(col)", count,
                          _percent(column, count)))
    return sorted(found, key=_sort_key, reverse=True)
|
||||
|
||||
|
||||
def _global_missing_pct(profile: dict):
|
||||
"""Table-level % of missing cells (0-100), exact, from the profile."""
|
||||
v = profile.get("null_cell_pct")
|
||||
if isinstance(v, (int, float)) and not isinstance(v, bool):
|
||||
return float(v) * 100.0 if v <= 1.0 else float(v)
|
||||
return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Per-row is-null mask (sample): DuckDB push-down, fallback to raw_numeric.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _build_query_fn(ctx: dict):
|
||||
"""Return ``(query_fn, table)`` for a DuckDB-backed ctx, or ``(None, None)``.
|
||||
|
||||
Mirrors build_eda_render_ctx: a read-only closure over the registry wrapper.
|
||||
Only DuckDB is supported here; any other backend degrades to raw_numeric."""
|
||||
db_path = ctx.get("db_path")
|
||||
table = ctx.get("table")
|
||||
if not db_path or not table:
|
||||
return None, None
|
||||
try:
|
||||
from infra import duckdb_query_readonly
|
||||
except Exception: # noqa: BLE001 — wrapper unavailable -> degrade.
|
||||
return None, None
|
||||
|
||||
def query_fn(sql):
|
||||
return duckdb_query_readonly(db_path, sql)
|
||||
|
||||
return query_fn, table
|
||||
|
||||
|
||||
def _null_mask(profile: dict, ctx: dict):
    """Build the per-row is-null mask ``{col: [0/1, ...]}``.

    First tries ``extract_null_mask`` over every named profile column through
    the DuckDB query function (capped at ``MASK_SAMPLE`` rows). If that is
    unavailable, fails, or yields an empty mask, falls back to
    ``ctx['raw_numeric']``, where a ``None`` value counts as missing.

    Returns ``(mask, n_sampled, source)`` with source in {"db", "raw_numeric"},
    or ``(None, 0, None)`` when neither route produces a mask.
    """
    names = []
    for column in profile.get("columns") or []:
        if isinstance(column, dict) and column.get("name"):
            names.append(column.get("name"))

    # Route 1: push-down over all columns.
    query_fn, table = _build_query_fn(ctx)
    if names and query_fn is not None:
        try:
            from datascience.extract_null_mask import extract_null_mask

            result = extract_null_mask(query_fn, table, names,
                                       max_rows=MASK_SAMPLE)
            succeeded = isinstance(result, dict) and result.get("status") == "ok"
            db_mask = (result.get("mask") or {}) if succeeded else {}
            if db_mask:
                return db_mask, int(result.get("n") or 0), "db"
        except Exception:  # noqa: BLE001 — degrade to raw_numeric.
            pass

    # Route 2: mask derived from the raw numeric values already in ctx.
    raw = ctx.get("raw_numeric")
    if isinstance(raw, dict) and raw:
        derived = {
            name: [int(value is None) for value in values]
            for name, values in raw.items()
            if isinstance(values, (list, tuple))
        }
        if derived:
            longest = max((len(flags) for flags in derived.values()), default=0)
            return derived, longest, "raw_numeric"
    return None, 0, None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Lazy registry delegations (each degrades to None on any failure).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _overview(mask: dict):
|
||||
try:
|
||||
from datascience.missingness_overview import missingness_overview
|
||||
|
||||
out = missingness_overview(mask)
|
||||
return out if isinstance(out, dict) else None
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
def _correlation(mask: dict, top_k: int):
|
||||
try:
|
||||
from datascience.missingness_correlation import missingness_correlation
|
||||
|
||||
out = missingness_correlation(mask, top_k=top_k)
|
||||
return out if isinstance(out, dict) else None
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
def _row_patterns(mask: dict, top_n: int):
|
||||
try:
|
||||
from datascience.missingness_row_patterns import missingness_row_patterns
|
||||
|
||||
out = missingness_row_patterns(mask, top_n=top_n)
|
||||
return out if isinstance(out, dict) else None
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
def _rank_bar_make(names, pcts, title):
|
||||
def make():
|
||||
try:
|
||||
from datascience.missingness_rank_bar_figure import (
|
||||
missingness_rank_bar_figure,
|
||||
)
|
||||
|
||||
return missingness_rank_bar_figure(names, pcts, title=title)
|
||||
except Exception: # noqa: BLE001 — minimal fallback figure.
|
||||
return _fallback_fig("ranking de nulos no disponible")
|
||||
|
||||
return make
|
||||
|
||||
|
||||
def _heatmap_make(matrix, labels, title):
|
||||
def make():
|
||||
try:
|
||||
from datascience.missingness_corr_heatmap_figure import (
|
||||
missingness_corr_heatmap_figure,
|
||||
)
|
||||
|
||||
return missingness_corr_heatmap_figure(matrix, labels, title=title)
|
||||
except Exception: # noqa: BLE001 — minimal fallback figure.
|
||||
return _fallback_fig("heatmap de co-ocurrencia no disponible")
|
||||
|
||||
return make
|
||||
|
||||
|
||||
def _fallback_fig(message: str):
    """Return a small axis-less matplotlib ``Figure`` showing ``message`` centred.

    Selects the ``Agg`` backend before creating the figure.
    """
    import matplotlib

    matplotlib.use("Agg")
    from matplotlib.figure import Figure

    figure = Figure(figsize=(5.0, 2.2))
    axes = figure.add_subplot(111)
    axes.text(0.5, 0.5, message, ha="center", va="center")
    axes.axis("off")
    return figure
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Block builders.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _summary_block(profile: dict, with_nulls: list, overview, sampled, n_total):
    """Build the summary key/value table of the chapter.

    Always lists the global missing-cell percentage and the number of columns
    with nulls; adds the count of fully-null columns when the profile reports
    any, and the complete/incomplete row counts when ``overview`` is a dict.
    Row counts carry a "sample" suffix when ``sampled`` is smaller than
    ``n_total``.
    """
    rows = [
        ("Celdas faltantes (global)", _fmt_pct(_global_missing_pct(profile))),
        ("Columnas con faltantes", str(len(with_nulls))),
    ]

    fully_missing = profile.get("all_null_cols")
    if isinstance(fully_missing, (list, tuple)) and fully_missing:
        rows.append(("Columnas 100% faltantes", str(len(fully_missing))))

    if isinstance(overview, dict):
        is_sample = (isinstance(sampled, int)
                     and isinstance(n_total, (int, float))
                     and sampled and n_total and sampled < n_total)
        suffix = (f" (sobre muestra de {_fmt_int(sampled)} filas)"
                  if is_sample else "")
        row_specs = (
            ("Filas completas (sin faltantes)", "complete_rows", "complete_pct"),
            ("Filas con ≥1 faltante", "incomplete_rows", "incomplete_pct"),
        )
        for label, count_key, pct_key in row_specs:
            count = overview.get(count_key)
            if count is None:
                continue
            rows.append((
                label,
                f"{_fmt_int(count)} ({_fmt_pct(overview.get(pct_key))})" + suffix,
            ))

    return model.KVTable(rows=rows, title="Resumen de datos faltantes")
|
||||
|
||||
|
||||
def _ranking_block(with_nulls: list):
|
||||
header = ["Columna", "Faltantes", "% faltante"]
|
||||
rows = [[_truncate(n), _fmt_int(c), _fmt_pct(p)] for (n, c, p) in with_nulls]
|
||||
if not rows:
|
||||
return None
|
||||
return model.DataTable(
|
||||
header=header, rows=rows, title="Faltantes por columna",
|
||||
note="ordenado de más a menos faltante")
|
||||
|
||||
|
||||
def _ranking_figure(with_nulls: list):
|
||||
names = [n for (n, _, p) in with_nulls if p is not None]
|
||||
pcts = [p for (_, _, p) in with_nulls if p is not None]
|
||||
if not names:
|
||||
return None
|
||||
return model.Figure(
|
||||
make=_rank_bar_make(names, pcts, "% de valores faltantes por columna"),
|
||||
caption="Porcentaje de valores faltantes por columna (barras).")
|
||||
|
||||
|
||||
def _pairs_block(corr: dict):
    """Build the table of column pairs whose absences co-occur, or ``None``.

    Reads ``corr['pairs']``, keeps at most the first ``_TOP_PAIRS`` entries
    (skipping non-dict ones) and returns ``None`` when no row results. The
    note mentions the total when more pairs exist than are shown.
    """
    pairs = (corr or {}).get("pairs") or []
    rows = [
        [
            _truncate(pair.get("a")),
            _truncate(pair.get("b")),
            _fmt_num(pair.get("corr")),
            _fmt_int(pair.get("co_missing")),
            _fmt_num(pair.get("jaccard")),
        ]
        for pair in pairs[:_TOP_PAIRS]
        if isinstance(pair, dict)
    ]
    if not rows:
        return None

    note = ("correlación de las máscaras is-null entre columnas; "
            "«Co-faltan» = nº de filas en que ambas faltan a la vez")
    shown, total = len(rows), len(pairs)
    if shown < total:
        note += f" — top {shown} de {total} pares"
    return model.DataTable(
        header=["Columna A", "Columna B", "Corr. ausencia", "Co-faltan",
                "Jaccard"],
        rows=rows,
        title="Pares de columnas que co-faltan",
        note=note,
    )
|
||||
|
||||
|
||||
def _heatmap_block(corr: dict):
|
||||
cols = (corr or {}).get("columns") or []
|
||||
matrix = (corr or {}).get("matrix") or []
|
||||
if len(cols) < 2 or not matrix:
|
||||
return None
|
||||
labels = [_truncate(c, 16) for c in cols]
|
||||
return model.Figure(
|
||||
make=_heatmap_make(matrix, labels, "Co-ocurrencia de ausencias"),
|
||||
caption=("Correlación de las ausencias entre columnas (azul = faltan "
|
||||
"juntas; rojo = cuando una falta la otra tiende a estar)."))
|
||||
|
||||
|
||||
def _patterns_block(patterns_res: dict):
    """Build the table of the most common row patterns, or ``None``.

    Reads ``patterns_res['patterns']``, keeps at most the first
    ``_TOP_PATTERNS`` entries (skipping non-dict ones) and labels each with its
    comma-joined missing columns, or with a "complete row" text when it has
    none. The note mentions ``n_patterns`` when it exceeds the rows shown.
    """
    result = patterns_res or {}
    rows = []
    for pattern in (result.get("patterns") or [])[:_TOP_PATTERNS]:
        if not isinstance(pattern, dict):
            continue
        missing = pattern.get("missing_cols") or []
        label = (", ".join(_truncate(column, 18) for column in missing)
                 if missing else "(fila completa — sin faltantes)")
        rows.append([label, _fmt_int(pattern.get("n_rows")),
                     _fmt_pct(pattern.get("pct"))])
    if not rows:
        return None

    note = "cada fila es un patrón de «qué columnas faltan juntas»"
    total, shown = result.get("n_patterns"), len(rows)
    if isinstance(total, int) and total > shown:
        note += f" — top {shown} de {total} patrones distintos"
    return model.DataTable(
        header=["Columnas que faltan juntas", "Filas", "%"],
        rows=rows,
        title="Patrones de fila más comunes",
        note=note,
    )
|
||||
|
||||
|
||||
def _mcar_mar_note(corr: dict, mark: bool):
    """Build the exploratory MCAR/MAR reading as a Markdown block.

    Three outcomes, checked in order over ``corr['pairs']``:

    * a pair with ``|corr| >= _CORR_STRONG`` -> strong co-occurrence (MAR),
      citing the first such pair;
    * a pair with positive ``corr`` and ``jaccard >= _JACCARD_NOTABLE`` ->
      partial co-occurrence (possible MAR), citing the highest-Jaccard pair;
    * otherwise -> absences read as compatible with MCAR.

    Values that cannot be converted to float count as ``0.0``.
    """
    def _as_float(raw):
        try:
            return float(raw)
        except (TypeError, ValueError):
            return 0.0

    def _select(predicate):
        candidates = (corr or {}).get("pairs") or []
        return [pair for pair in candidates
                if isinstance(pair, dict) and predicate(pair)]

    strong = _select(
        lambda pair: abs(_as_float(pair.get("corr"))) >= _CORR_STRONG)
    partial = _select(
        lambda pair: (_as_float(pair.get("corr")) > 0
                      and _as_float(pair.get("jaccard")) >= _JACCARD_NOTABLE))

    mcar_term = _term("mcar", "MCAR", mark)
    mar_term = _term("mar", "MAR", mark)
    head = (
        "**Lectura exploratoria MCAR/MAR.** Esta es una heurística basada en la "
        "correlación de las ausencias entre columnas, NO un test confirmatorio "
        "(como el de Little); orienta, no demuestra. ")

    if strong:
        lead = strong[0]
        evidence = (f"«{model._safe_str(lead.get('a'))}» y "
                    f"«{model._safe_str(lead.get('b'))}» "
                    f"(corr {_fmt_num(lead.get('corr'))})")
        body = (
            f"Hay ausencias que co-ocurren con fuerza —{evidence}—: las columnas no "
            f"faltan de forma independiente, lo que es un indicio de un patrón no "
            f"aleatorio ({mar_term}). Antes de imputar o descartar filas conviene "
            f"comprobar si la ausencia depende de otra variable observada; en ese "
            f"caso la imputación debería condicionar en ella para no sesgar.")
    elif partial:
        lead = max(partial, key=lambda pair: _as_float(pair.get("jaccard")))
        evidence = (f"«{model._safe_str(lead.get('a'))}» y "
                    f"«{model._safe_str(lead.get('b'))}» faltan a la vez en "
                    f"{_fmt_int(lead.get('co_missing'))} filas "
                    f"(Jaccard {_fmt_num(lead.get('jaccard'))})")
        body = (
            f"Hay co-ocurrencia parcial de ausencias —{evidence}—: algunas columnas "
            f"tienden a faltar juntas aunque la correlación global sea modesta "
            f"(habitual cuando una columna falta mucho más que la otra). Es un "
            f"indicio de un posible patrón localizado no aleatorio ({mar_term}); "
            f"conviene revisar si esa ausencia depende de otra variable observada "
            f"antes de imputar, en lugar de asumir que faltan al azar.")
    else:
        body = (
            f"Las ausencias entre columnas no muestran correlación ni solape "
            f"relevante: parecen independientes, lo que es compatible con que "
            f"falten al azar ({mcar_term}). Aun así, la ausencia podría depender de "
            f"variables no observadas (la heurística no lo descarta).")
    return model.Markdown(text=head + body)
|
||||
|
||||
|
||||
def _intro_block(mark: bool, source):
    """Build the chapter's introductory Markdown paragraph.

    The word "missingness" is rendered through ``_term`` (clickable when
    ``mark`` is set). When ``source`` is ``"raw_numeric"`` a sentence is added
    saying the co-occurrence is limited to the available numeric columns.
    """
    term = _term("missingness", "missingness", mark)
    pieces = [
        f"Este capítulo analiza el {term} de la tabla: no solo cuánto "
        "falta (eso lo cubre la calidad), sino DÓNDE falta y si las columnas "
        "faltan juntas. La co-ocurrencia de ausencias se calcula sobre la matriz "
        "binaria «is-null» por fila."
    ]
    if source == "raw_numeric":
        pieces.append(
            " Nota: no se pudo leer la tabla cruda completa, así que la "
            "co-ocurrencia se limita a las columnas numéricas disponibles.")
    return model.Markdown(text="".join(pieces))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def build_missingness(profile: dict, ctx: dict):
    """Build the missingness Chapter, or None if the table has no missing data.

    Args:
        profile: Aggregated table profile; a non-dict is treated as empty.
        ctx: Render context; ``None`` is treated as empty. Reads ``glossary``
            here and is passed on to the null-mask builder.

    Returns:
        ``None`` when no column has nulls. Otherwise a ``model.Chapter`` with
        the summary and per-column ranking and, when a per-row null mask could
        be built, the co-occurrence group, the row patterns and the MCAR/MAR
        note; without a mask, a note explaining their absence instead.
    """
    profile = profile if isinstance(profile, dict) else {}
    ctx = ctx or {}

    with_nulls = _columns_with_nulls(profile)
    if not with_nulls:
        return None  # no missing data anywhere -> chapter does not apply.

    # Terms are registered (and marked clickable) only with a real collector.
    glossary = ctx.get("glossary")
    mark = isinstance(glossary, model.GlossaryCollector)
    if mark:
        for key, (label, definition) in _TERMS.items():
            glossary.add(key, label, definition)

    # Per-row is-null mask (sample) for co-occurrence and row patterns.
    mask, sampled, source = _null_mask(profile, ctx)
    overview = _overview(mask) if mask else None
    n_total = profile.get("n_rows")

    blocks = [
        model.Heading(text="Cuánto y dónde faltan datos", level=2),
        _intro_block(mark, source),
        _summary_block(profile, with_nulls, overview, sampled, n_total),
        model.Heading(text="Faltantes por columna", level=2),
    ]
    per_column = (_ranking_block(with_nulls), _ranking_figure(with_nulls))
    blocks.extend(block for block in per_column if block is not None)

    def _chapter():
        return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                             version=CHAPTER_VERSION, blocks=blocks)

    # Co-occurrence + row patterns need the per-row mask. Without it, say so.
    if not mask:
        blocks.append(model.Note(
            "No se pudo construir la matriz «is-null» por fila (sin acceso a los "
            "datos crudos), así que no se analiza la co-ocurrencia de ausencias "
            "ni los patrones de fila en este informe."))
        return _chapter()

    corr = _correlation(mask, _TOP_PAIRS) or {}
    co_blocks = [model.Heading(text="Co-ocurrencia de ausencias", level=2)]
    heatmap = _heatmap_block(corr)
    pairs = _pairs_block(corr)
    co_blocks.extend(block for block in (heatmap, pairs) if block is not None)
    if len(co_blocks) == 1:
        co_blocks.append(model.Note(
            "Ninguna pareja de columnas comparte ausencias con variación "
            "suficiente para correlacionarlas (p. ej. una sola columna con "
            "faltantes), así que no hay co-ocurrencia que mostrar."))
    # Keep the co-occurrence heading next to its heatmap and table.
    blocks.append(model.Group(blocks=co_blocks))

    patterns = _patterns_block(_row_patterns(mask, _TOP_PATTERNS) or {})
    if patterns is not None:
        blocks.extend([model.Heading(text="Patrones de fila", level=2),
                       patterns])

    blocks.extend([model.Heading(text="Lectura MCAR / MAR", level=2),
                   _mcar_mar_note(corr, mark)])
    return _chapter()
|
||||
@@ -0,0 +1,162 @@
|
||||
"""Tests for the MISSINGNESS chapter.
|
||||
|
||||
Covers the Definition of Done for this chapter:
|
||||
* Activates (non-None Chapter with the expected sections) when the profile has
|
||||
missing data, building the co-occurrence from the per-row is-null mask.
|
||||
* Returns None when the table has no missing data at all (edge case).
|
||||
* Registers the MCAR/MAR/missingness glossary terms.
|
||||
* The DuckDB push-down path covers categorical columns (not only numeric),
|
||||
so a categorical column that co-misses with a numeric one is detected.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", "..")) # python/functions
|
||||
if _FUNCTIONS not in sys.path:
|
||||
sys.path.insert(0, _FUNCTIONS)
|
||||
|
||||
from datascience.automatic_eda import model # noqa: E402
|
||||
from datascience.automatic_eda.chapters.missingness import ( # noqa: E402
|
||||
build_missingness,
|
||||
)
|
||||
|
||||
|
||||
def _titles(chapter):
|
||||
"""Collect heading texts and table/figure titles for assertions."""
|
||||
out = []
|
||||
for b in chapter.blocks:
|
||||
kind = getattr(b, "kind", None)
|
||||
if kind == "heading":
|
||||
out.append(("heading", getattr(b, "text", "")))
|
||||
elif kind in ("data_table", "kv_table"):
|
||||
out.append((kind, getattr(b, "title", "")))
|
||||
elif kind == "group":
|
||||
for inner in getattr(b, "blocks", []):
|
||||
ik = getattr(inner, "kind", None)
|
||||
if ik == "heading":
|
||||
out.append(("heading", getattr(inner, "text", "")))
|
||||
elif ik in ("data_table", "kv_table"):
|
||||
out.append((ik, getattr(inner, "title", "")))
|
||||
elif ik == "figure":
|
||||
out.append(("figure", getattr(inner, "caption", "")))
|
||||
elif kind == "figure":
|
||||
out.append(("figure", getattr(b, "caption", "")))
|
||||
return out
|
||||
|
||||
|
||||
def _all_text(chapter):
|
||||
parts = []
|
||||
def walk(blocks):
|
||||
for b in blocks:
|
||||
for attr in ("text", "title", "note", "caption"):
|
||||
v = getattr(b, attr, None)
|
||||
if v:
|
||||
parts.append(str(v))
|
||||
if getattr(b, "kind", None) == "group":
|
||||
walk(getattr(b, "blocks", []))
|
||||
walk(chapter.blocks)
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def test_returns_none_when_no_missing_data():
    """A profile whose columns all report zero nulls yields no chapter."""
    clean_columns = [
        {"name": name, "null_count": 0, "null_pct": 0.0, "n_rows": 4}
        for name in ("a", "b")
    ]
    profile = {"n_rows": 4, "null_cell_pct": 0.0, "columns": clean_columns}
    chapter = build_missingness(profile, {})
    assert chapter is None
|
||||
|
||||
|
||||
def test_activates_with_cooccurrence_via_raw_numeric():
    """Without ``db_path`` the chapter is built from the ``raw_numeric`` mask.

    Columns a and b are missing in exactly the same rows (0, 1, 2), so their
    absences overlap perfectly; c has no nulls.
    """
    columns = [
        {"name": "a", "null_count": 3, "null_pct": 0.5, "n_rows": 6},
        {"name": "b", "null_count": 3, "null_pct": 0.5, "n_rows": 6},
        {"name": "c", "null_count": 0, "null_pct": 0.0, "n_rows": 6},
    ]
    profile = {
        "n_rows": 6,
        "null_cell_pct": (0.5 + 0.5 + 0.0) / 3.0,
        "columns": columns,
    }
    glossary = model.GlossaryCollector()
    raw_numeric = {
        "a": [None, None, None, 1.0, 2.0, 3.0],
        "b": [None, None, None, 4.0, 5.0, 6.0],
    }

    ch = build_missingness(profile, {"raw_numeric": raw_numeric,
                                     "glossary": glossary})
    assert ch is not None
    assert ch.id == "missingness"
    assert ch.blocks

    titles = _titles(ch)
    headings = {text for (kind, text) in titles if kind == "heading"}
    # Core sections present.
    for fragment in ("Cuánto y dónde", "Faltantes por columna",
                     "Co-ocurrencia", "MCAR"):
        assert any(fragment in heading for heading in headings)

    # A summary KVTable, a ranking/pairs DataTable and a figure all exist.
    kinds = {kind for (kind, _) in titles}
    for expected_kind in ("kv_table", "data_table", "figure"):
        assert expected_kind in kinds

    # Glossary terms registered.
    registered = {term["key"] for term in glossary.terms()}
    assert {"missingness", "mcar", "mar"} <= registered

    # With a perfect overlap the MCAR/MAR note must mention the MAR reading.
    assert "MAR" in _all_text(ch)
|
||||
|
||||
|
||||
def test_db_pushdown_covers_categorical_column(tmp_path):
    """The is-null mask push-down must cover a categorical column, so a
    categorical that co-misses with a numeric one shows up in the pairs."""
    import duckdb

    db = str(tmp_path / "miss.duckdb")
    # num1 and cat are NULL together in the first 4 of 10 rows; num2 never null.
    rows = [
        (None, float(i), None) if i < 4 else (float(i), float(i), f"c{i}")
        for i in range(10)
    ]
    con = duckdb.connect(db)
    con.execute("CREATE TABLE t (num1 DOUBLE, num2 DOUBLE, cat VARCHAR)")
    con.executemany("INSERT INTO t VALUES (?,?,?)", rows)
    con.close()

    columns = [
        {"name": "num1", "null_count": 4, "null_pct": 0.4, "n_rows": 10},
        {"name": "num2", "null_count": 0, "null_pct": 0.0, "n_rows": 10},
        {"name": "cat", "null_count": 4, "null_pct": 0.4, "n_rows": 10},
    ]
    profile = {
        "n_rows": 10,
        "null_cell_pct": (0.4 + 0.0 + 0.4) / 3.0,
        "columns": columns,
    }
    ctx = {"db_path": db, "table": "t", "glossary": model.GlossaryCollector()}

    ch = build_missingness(profile, ctx)
    assert ch is not None

    # Both num1 and cat must appear in the chapter text (they co-miss
    # perfectly), which requires the mask to cover the categorical column.
    text = _all_text(ch)
    assert "num1" in text and "cat" in text

    # Co-occurrence section + a pairs data table exist.
    found_titles = [title for (_kind, title) in _titles(ch)]
    assert any("co-faltan" in (title or "").lower() for title in found_titles)
|
||||
@@ -6,15 +6,16 @@ normality}``). It renders, as structured markdown/tables/figures that the core
|
||||
paginator never cuts:
|
||||
|
||||
1. **Normalization note** — every multivariate model below standardizes the
|
||||
columns with z-score first; the chapter explains why (different scales would
|
||||
otherwise dominate distance/variance).
|
||||
columns with z-score first (the term is marked clickable; its definition
|
||||
lives in the GLOSARIO chapter, not inline).
|
||||
2. **PCA** — a scree plot (explained + cumulative variance, single Y axis) plus
|
||||
variance and top-loadings tables.
|
||||
3. **KMeans segments** — a PCA scatter **coloured by cluster** (its own
|
||||
page/slide), the cluster-size table, and a per-cluster LLM micro-analysis
|
||||
with a title for each segment.
|
||||
4. **Isolation Forest outliers** — a short explanation of how anomalous rows are
|
||||
isolated multivariately and how the threshold is chosen, plus the counts.
|
||||
4. **Isolation Forest outliers** — the multivariate anomaly counts and decision
|
||||
threshold (the method is marked clickable; its definition lives in the
|
||||
GLOSARIO chapter, not inline).
|
||||
5. **Normality** — per-column Jarque-Bera / D'Agostino / Shapiro verdicts.
|
||||
|
||||
The raw numeric data needed to colour the cluster scatter is **not** in the
|
||||
@@ -55,6 +56,62 @@ _CLUSTER_COLORS = [
|
||||
"#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac",
|
||||
]
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance — the
|
||||
# canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
|
||||
# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
|
||||
# block. A term is registered only when its section is actually rendered, so the
|
||||
# glossary never lists an entry no in-text appearance points to.
|
||||
# Maps a glossary key to a ``(label, definition)`` pair; ``_register`` looks a
# key up here and passes the triple to the collector's ``add``.
_TERM_DEFS = {
    "zscore": (
        "Estandarización z-score",
        "Transformación que lleva cada columna numérica a media 0 y desviación "
        "típica 1: a cada valor le resta la media de su columna y lo divide por "
        "la desviación típica. Así variables con escalas muy distintas (euros "
        "frente a un ratio 0–1) pesan por igual en las distancias y la varianza."),
    "pca": (
        "PCA (componentes principales)",
        "El análisis de componentes principales resume muchas variables "
        "numéricas correlacionadas en pocos ejes nuevos (componentes), "
        "ortogonales entre sí y ordenados por la cantidad de varianza que "
        "capturan. Permite ver la estructura de los datos en 2D y saber cuántas "
        "dimensiones bastan para explicarlos."),
    "kmeans": (
        "KMeans (segmentación)",
        "Algoritmo de agrupamiento no supervisado que reparte las filas en k "
        "segmentos: asigna cada fila al centro (centroide) más cercano y recoloca "
        "los centroides de forma iterativa hasta minimizar la distancia interna "
        "de cada grupo. Aquí k se elige automáticamente."),
    "silhouette": (
        "Coeficiente de silueta (silhouette)",
        "Métrica de calidad de un agrupamiento, en el rango −1 a 1: para cada "
        "fila compara cómo de cerca está de su propio segmento frente al segmento "
        "vecino más próximo. Cuanto más alto el promedio, más compactos y "
        "separados están los segmentos."),
    "isolation_forest": (
        "Isolation Forest (anomalías)",
        "Algoritmo de detección de anomalías multivariante: construye árboles que "
        "parten el espacio con cortes aleatorios y mide cuántos cortes hacen "
        "falta para aislar cada fila. Las filas raras se aíslan con muy pocos "
        "cortes y se marcan como outliers según un umbral de contaminación."),
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
it), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _register(gloss, key: str) -> None:
|
||||
"""Register term ``key`` in the collector (idempotent); no-op if gloss None."""
|
||||
if gloss is not None:
|
||||
label, definition = _TERM_DEFS[key]
|
||||
gloss.add(key, label, definition)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the overview chapter's defensive style).
|
||||
@@ -252,34 +309,33 @@ def _make_cluster_scatter(projection: dict):
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Section builders. Each returns a list of blocks (possibly empty).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _normalization_intro() -> list:
|
||||
def _normalization_intro(gloss=None, mark_term: bool = False) -> list:
|
||||
_register(gloss, "zscore")
|
||||
zscore = _term(mark_term, "zscore", "**estandarizan con z-score**")
|
||||
text = (
|
||||
"Estos modelos son **no supervisados**: buscan estructura latente sin "
|
||||
"una variable objetivo. Antes de aplicarlos, todas las columnas "
|
||||
"numéricas se **estandarizan con z-score** (cada valor menos la media, "
|
||||
"dividido por la desviación típica). Sin esta normalización, una "
|
||||
"variable con escala grande (p.ej. ingresos en euros) dominaría las "
|
||||
"distancias y la varianza frente a otra de escala pequeña (p.ej. un "
|
||||
"ratio entre 0 y 1), sesgando tanto el PCA como el KMeans. Tras la "
|
||||
"estandarización todas las variables pesan por igual."
|
||||
f"numéricas se {zscore}, para que todas pesen por igual con "
|
||||
"independencia de su escala."
|
||||
)
|
||||
return [model.Heading(text="Modelos no supervisados", level=1),
|
||||
model.Markdown(text=text)]
|
||||
|
||||
|
||||
def _pca_section(pca: dict) -> list:
|
||||
def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
if not _is_dict(pca) or not pca.get("explained_variance_ratio"):
|
||||
return []
|
||||
_register(gloss, "pca")
|
||||
blocks = [model.Heading(text="PCA — varianza explicada", level=2)]
|
||||
|
||||
n_used = pca.get("n_rows_used")
|
||||
n_feat = pca.get("n_features")
|
||||
intro = (
|
||||
f"El PCA resume {_fmt_num(n_feat)} variables numéricas en componentes "
|
||||
f"ortogonales ordenados por la varianza que capturan "
|
||||
f"({_fmt_num(n_used)} filas usadas tras eliminar nulos). El gráfico de "
|
||||
"sedimentación (scree) muestra cuánta varianza aporta cada componente y "
|
||||
"su acumulado: un codo marca cuántos componentes bastan."
|
||||
f"El {_term(mark_term, 'pca', 'PCA')} se aplica sobre "
|
||||
f"{_fmt_num(n_feat)} variables numéricas ({_fmt_num(n_used)} filas "
|
||||
"usadas tras eliminar nulos). El gráfico de sedimentación (scree) "
|
||||
"muestra cuánta varianza aporta cada componente y su acumulado: un "
|
||||
"codo marca cuántos componentes bastan."
|
||||
)
|
||||
blocks.append(model.Markdown(text=intro))
|
||||
|
||||
@@ -325,11 +381,14 @@ def _pca_section(pca: dict) -> list:
|
||||
return blocks
|
||||
|
||||
|
||||
def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
|
||||
def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
gloss=None, mark_term: bool = False) -> list:
|
||||
has_km = _is_dict(kmeans) and kmeans.get("best_k")
|
||||
has_proj = _is_dict(projection) and projection.get("points")
|
||||
if not has_km and not has_proj:
|
||||
return []
|
||||
_register(gloss, "kmeans")
|
||||
_register(gloss, "silhouette")
|
||||
|
||||
blocks = [model.Heading(text="Segmentación (KMeans)", level=2)]
|
||||
|
||||
@@ -337,11 +396,12 @@ def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
|
||||
sil = (projection or {}).get("silhouette")
|
||||
if sil is None:
|
||||
sil = (kmeans or {}).get("silhouette")
|
||||
t_kmeans = _term(mark_term, "kmeans", "KMeans")
|
||||
t_sil = _term(mark_term, "silhouette", "*silhouette*")
|
||||
intro = (
|
||||
f"KMeans agrupa las filas en **{_fmt_num(best_k)} segmentos** elegidos "
|
||||
"automáticamente maximizando el coeficiente de *silhouette* "
|
||||
f"(**{_fmt_num(sil)}**, rango −1 a 1: cuanto más alto, segmentos más "
|
||||
"compactos y separados). Los segmentos se proyectan sobre el plano de "
|
||||
f"{t_kmeans} agrupa las filas en **{_fmt_num(best_k)} segmentos** "
|
||||
f"elegidos automáticamente por el coeficiente de {t_sil} "
|
||||
f"(**{_fmt_num(sil)}**). Los segmentos se proyectan sobre el plano de "
|
||||
"los dos primeros componentes principales para visualizarlos."
|
||||
)
|
||||
blocks.append(model.Markdown(text=intro))
|
||||
@@ -394,23 +454,21 @@ def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
|
||||
return blocks
|
||||
|
||||
|
||||
def _outliers_section(outliers: dict) -> list:
|
||||
def _outliers_section(outliers: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
if not _is_dict(outliers) or outliers.get("n_outliers") is None:
|
||||
return []
|
||||
if outliers.get("note") and not outliers.get("n_rows_used"):
|
||||
# insufficient data — nothing meaningful to show.
|
||||
return []
|
||||
_register(gloss, "isolation_forest")
|
||||
blocks = [model.Heading(text="Detección de anomalías (Isolation Forest)",
|
||||
level=2)]
|
||||
isof = _term(mark_term, "isolation_forest", "**Isolation Forest**")
|
||||
explain = (
|
||||
"**Isolation Forest** detecta filas anómalas de forma *multivariante*: "
|
||||
"construye árboles que parten el espacio con cortes aleatorios y mide "
|
||||
"cuántos cortes hacen falta para aislar cada fila. Las filas raras "
|
||||
"(combinaciones de valores poco frecuentes considerando **todas las "
|
||||
"columnas a la vez**, no una sola) se aíslan con muy pocos cortes y "
|
||||
"obtienen un score bajo. El **umbral** de decisión separa las filas "
|
||||
"normales de las anómalas según la contaminación esperada del modelo: "
|
||||
"una fila es outlier cuando su score queda por debajo de ese umbral."
|
||||
f"{isof} marca filas anómalas de forma *multivariante*: combinaciones "
|
||||
"de valores poco frecuentes considerando **todas las columnas a la "
|
||||
"vez**, no una sola. La tabla resume cuántas se detectaron y el umbral "
|
||||
"de decisión empleado."
|
||||
)
|
||||
blocks.append(model.Markdown(text=explain))
|
||||
blocks.append(model.KVTable(rows=[
|
||||
@@ -484,15 +542,21 @@ def build_modelos(profile: dict, ctx: dict):
|
||||
(kmeans and kmeans.get("best_k")) or (projection and projection.get("points"))
|
||||
) else None
|
||||
|
||||
# Shared glossary collector: terms are registered + marked clickable inside
|
||||
# each section, only when that section actually renders (no orphan entries).
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
|
||||
sections = []
|
||||
sections += _pca_section(pca) if pca else []
|
||||
sections += _kmeans_section(kmeans, projection, titles)
|
||||
sections += _outliers_section(outliers) if outliers else []
|
||||
sections += _pca_section(pca, gloss, mark_term) if pca else []
|
||||
sections += _kmeans_section(kmeans, projection, titles, gloss, mark_term)
|
||||
sections += _outliers_section(outliers, gloss, mark_term) if outliers else []
|
||||
sections += _normality_section(normality) if normality else []
|
||||
|
||||
if not sections:
|
||||
return None # models block present but nothing renderable.
|
||||
|
||||
blocks = _normalization_intro() + sections
|
||||
blocks = _normalization_intro(gloss, mark_term) + sections
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -257,3 +257,26 @@ def test_anticortes_tabla_normalidad_larga_no_corta():
|
||||
# Every column name survives (wrapped/split, never truncated).
|
||||
for i in (0, 19, 39):
|
||||
assert f"col_{i}" in txt
|
||||
|
||||
|
||||
def test_glosario_engancha_terminos_modelos():
    """Improvement 4b: PCA, KMeans, silhouette, Isolation Forest and z-score
    standardization are registered in the shared collector and marked as
    clickable in the body. Without a collector in ctx the chapter degrades
    and marks nothing."""
    from datascience.automatic_eda.model import GlossaryCollector

    expected_terms = ("zscore", "pca", "kmeans", "silhouette", "isolation_forest")

    def _markdown_text(chapter):
        # Concatenate the text of every markdown block of the chapter.
        return " ".join(b.text for b in chapter.blocks if b.kind == "markdown")

    collector = GlossaryCollector()
    ctx = dict(_ctx_full())
    ctx["glossary"] = collector
    chapter = build_modelos(_profile(), ctx)
    assert chapter is not None
    registered = {t["key"] for t in collector.terms()}
    assert {"zscore", "pca", "kmeans", "silhouette", "isolation_forest"} <= registered
    body = _markdown_text(chapter)
    for k in expected_terms:
        assert f"[[term:{k}]]" in body, k

    # No collector: clean degradation (no marker anywhere in the body).
    plain_chapter = build_modelos(_profile(), _ctx_full())
    assert "[[term:" not in _markdown_text(plain_chapter)
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
"""Numeric distributions chapter (NUM DISTR) for AutomaticEDA.
|
||||
|
||||
For every numeric column the chapter draws, as a single indivisible figure, a
|
||||
histogram with the **mean, median and ±1σ band drawn as reference lines** and a
|
||||
**Tukey boxplot right below it** sharing the same X axis — exactly the user
|
||||
requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
|
||||
histogram with the **mean, median and ±1σ band drawn as reference lines** (the
|
||||
legend reports the numeric value of the mean, the median **and the standard
|
||||
deviation σ**) and a **Tukey boxplot right below it** sharing the same X axis —
|
||||
exactly the user requirement for this chapter. Each figure is emitted as a lazy ``Figure`` block
|
||||
so the renderers rasterize and scale it to fit a whole page/slide and nothing is
|
||||
ever cut; tables with many numeric columns simply flow across pages as small
|
||||
multiples.
|
||||
@@ -34,7 +35,7 @@ try:
|
||||
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
|
||||
build_boxplot_stats = None # type: ignore[assignment]
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "num_distr"
|
||||
CHAPTER_TITLE = "Distribuciones numéricas"
|
||||
|
||||
@@ -140,9 +141,11 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
|
||||
std = numeric.get("std")
|
||||
|
||||
# ±1σ band first (behind the lines), then median (solid) and mean (dashed).
|
||||
# The band's legend entry also reports the numeric value of the standard
|
||||
# deviation, so the reader sees mean, median AND σ at a glance.
|
||||
if mean is not None and std is not None and std > 0:
|
||||
ax_h.axvspan(mean - std, mean + std, color="#f0c27b", alpha=0.22,
|
||||
zorder=1, label="±1σ")
|
||||
zorder=1, label=f"±1σ (σ = {_fmt_num(std)})")
|
||||
if median is not None:
|
||||
ax_h.axvline(median, color="#2e8b57", linestyle="-", linewidth=1.6,
|
||||
zorder=4, label=f"mediana = {_fmt_num(median)}")
|
||||
@@ -152,7 +155,19 @@ def _make_hist_box(name: str, numeric: dict, box: dict):
|
||||
|
||||
ax_h.set_ylabel("frecuencia", fontsize=8)
|
||||
ax_h.tick_params(labelsize=7)
|
||||
ax_h.legend(fontsize=6.5, loc="upper right", framealpha=0.85)
|
||||
# Always surface σ in the legend: if the ±1σ band could not be drawn (no mean
|
||||
# or std<=0) but σ is still known, add a label-only proxy handle so the value
|
||||
# of the standard deviation is reported regardless of the band.
|
||||
handles, labels = ax_h.get_legend_handles_labels()
|
||||
if std is not None and not any("σ =" in lbl for lbl in labels):
|
||||
from matplotlib.lines import Line2D
|
||||
proxy = Line2D([], [], linestyle="none", marker="",
|
||||
label=f"σ = {_fmt_num(std)}")
|
||||
handles.append(proxy)
|
||||
labels.append(f"σ = {_fmt_num(std)}")
|
||||
if handles:
|
||||
ax_h.legend(handles, labels, fontsize=6.5, loc="upper right",
|
||||
framealpha=0.85)
|
||||
for spine in ("top", "right"):
|
||||
ax_h.spines[spine].set_visible(False)
|
||||
|
||||
|
||||
@@ -159,6 +159,50 @@ def test_anti_corte_muchas_columnas_pdf_y_pptx():
|
||||
assert res_pptx["n_slides"] >= 8 # at least one slide per column figure.
|
||||
|
||||
|
||||
def _hist_legend_texts(numeric, box=None):
    """Build the per-column figure and return its histogram-legend label texts.

    Args:
        numeric: Numeric stats block passed to ``_make_hist_box``.
        box: Optional boxplot stats; an empty dict is used when falsy.

    Returns:
        The legend label strings of the first axis, or ``[]`` when that axis
        has no legend.
    """
    from datascience.automatic_eda.chapters.num_distr import _make_hist_box
    import matplotlib.pyplot as plt

    fig = _make_hist_box("col", numeric, box or {})
    try:
        ax_h = fig.axes[0]  # the histogram is the top axis.
        leg = ax_h.get_legend()
        return [t.get_text() for t in leg.get_texts()] if leg else []
    finally:
        # Close the figure even if inspecting the legend raises, so a failing
        # test does not leave open figures behind.
        plt.close(fig)
|
||||
|
||||
|
||||
def test_golden_leyenda_histograma_reporta_valor_std():
    """The histogram legend reports the numeric value of σ next to mean and median."""
    texts = _hist_legend_texts(
        _numeric_block(42.5, 40.0, 12.3, 1.0, 100.0, "right-skewed", 5))

    def _some_label_has(fragment):
        return any(fragment in t for t in texts)

    assert _some_label_has("σ ="), f"σ value missing in legend: {texts}"
    assert "12.3" in " ".join(texts), f"std value 12.3 not in legend: {texts}"
    assert _some_label_has("media =")
    assert _some_label_has("mediana =")
|
||||
|
||||
|
||||
def test_edge_std_en_leyenda_aunque_no_haya_banda():
    """With no mean the ±1σ band cannot be drawn, yet the legend must still
    show the σ value through the label-only proxy handle."""
    numeric = _numeric_block(42.5, 40.0, 7.5, 1.0, 100.0, "right-skewed", 0)
    numeric["mean"] = None  # forces the band off; σ must still appear.
    texts = _hist_legend_texts(numeric)
    sigma_labels = [t for t in texts if "σ = 7.5" in t]
    assert sigma_labels, f"σ proxy missing: {texts}"
|
||||
|
||||
|
||||
def test_edge_sin_std_no_revienta_la_figura():
    """A numeric block without σ must not raise; the σ legend entry is simply omitted."""
    # The unused local matplotlib import was dropped: the helper below imports
    # and closes the figure on its own.
    numeric = _numeric_block(42.5, 40.0, 0.0, 1.0, 100.0, "discrete", 0)
    numeric["std"] = None
    texts = _hist_legend_texts(numeric)
    assert not any("σ =" in t for t in texts)
    # mean/median lines still produce their own legend entries.
    assert any("media =" in t for t in texts)
|
||||
|
||||
|
||||
def test_distribution_gloss_cubre_todas_las_etiquetas():
|
||||
# Every label detect_distribution_type can emit has a Spanish gloss.
|
||||
for label in ("normal-ish", "right-skewed", "left-skewed", "heavy-tail",
|
||||
|
||||
@@ -0,0 +1,593 @@
|
||||
"""Outliers chapter (OUTLIERS) — univariate + multivariate atypical values.
|
||||
|
||||
Today the analysis of atypical values is scattered across the document: the
|
||||
NUM DISTR chapter mentions the per-column outlier count inside each distribution
|
||||
figure, and the MODELOS chapter runs Isolation Forest as one of several cheap
|
||||
models. This chapter gathers and deepens the whole outlier story in a single
|
||||
place, with its interpretation: an [[term:outlier]]outlier[[/term]] is **not
|
||||
necessarily an error** — it can be a legitimate, extreme but real observation —
|
||||
so the reading is exploratory (what to look at), never confirmatory (what to
|
||||
delete).
|
||||
|
||||
Sections, in order:
|
||||
|
||||
1. **Resumen univariante por columna** — for every numeric column, the number
|
||||
and percentage of atypical values by two complementary criteria: Tukey's
|
||||
1.5·IQR rule ([[term:tukey_fence]]vallas de Tukey[[/term]]) and the
|
||||
[[term:zscore]]z-score[[/term]] rule (|z| > 3). The most contaminated columns
|
||||
are flagged. The fences come from the pure registry function
|
||||
``build_boxplot_stats`` (derived from the profile percentiles); the per-column
|
||||
counts use the raw sample in ``ctx['raw_numeric']`` when available (the exact
|
||||
count), degrading to the profile's own z-score counts otherwise.
|
||||
2. **Boxplots** — a single figure with the Tukey boxplots of the most
|
||||
contaminated columns (box, whiskers and atypical points), delegated to the
|
||||
reusable registry helper ``build_boxplots_figure``.
|
||||
3. **Multivariante (filas anómalas)** — rows that are atypical considering ALL
|
||||
columns at once, via the registry function ``isolation_forest_outliers``: the
|
||||
count and percentage of anomalous rows, the most anomalous rows with their
|
||||
score, and the dimensions that make each one rare (top columns by |z|, via
|
||||
``summarize_outlier_dims``). Run live on ``ctx['raw_numeric']`` (the same
|
||||
numeric columns ``summarize_outlier_dims`` uses, so the row indexing stays
|
||||
coherent and the dimension breakdown is correct); falls back to the
|
||||
precomputed ``profile['models']['outliers']`` only when no raw sample is
|
||||
available (e.g. the lite preset), where no per-row breakdown is shown.
|
||||
4. **Interpretación** — outlier ≠ error: how to tell a data-entry error from a
|
||||
genuine extreme value, and what to do (inspect, winsorize, or re-express —
|
||||
linking to the Tukey re-expression the profile already computes).
|
||||
|
||||
The chapter activates whenever the table has at least one numeric column; with
|
||||
no numeric column it returns ``None`` and disappears from the document.
|
||||
|
||||
Reads everything defensively (``.get``) and never raises: every registry
|
||||
delegation is imported lazily and degraded to an honest note on any failure.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.0.0"
|
||||
CHAPTER_ID = "outliers"
|
||||
CHAPTER_TITLE = "Valores atípicos"
|
||||
|
||||
# z-score threshold for the univariate z rule: |z| > 3 flags a value ~3 standard
|
||||
# deviations from the mean (≈99.7% of a normal distribution lies within ±3σ).
|
||||
_Z_THRESH = 3.0
|
||||
# How many columns to draw in the boxplots figure (most contaminated first) and
|
||||
# how many anomalous rows to list in the multivariate table.
|
||||
_TOP_BOX = 12
|
||||
_TOP_ROWS = 12
|
||||
# Cap on the raw atypical values passed as boxplot fliers, so a heavy-tailed
|
||||
# column does not flood the figure with thousands of points.
|
||||
_MAX_FLIERS = 200
|
||||
# How many columns flagged as "most contaminated" in the summary note.
|
||||
_TOP_FLAGGED = 3
|
||||
|
||||
# Glossary terms this chapter explains (contract §11.1). Registered in the shared
|
||||
# collector and marked clickable on first appearance. ``isolation_forest`` and
|
||||
# ``zscore`` may also be registered by the MODELOS chapter — ``add`` is
|
||||
# idempotent (first definition wins), so registering them here is harmless and
|
||||
# keeps this chapter self-contained when MODELOS does not render.
|
||||
_TERM_DEFS = {
|
||||
"outlier": (
|
||||
"Valor atípico (outlier)",
|
||||
"Una observación que se aparta mucho del grueso de los datos. Un atípico "
|
||||
"NO es necesariamente un error: puede ser un fallo de medida o de "
|
||||
"registro, pero también un dato real extremo (un cliente que gasta diez "
|
||||
"veces la media, un día de ventas excepcional). Por eso se señalan para "
|
||||
"revisarlos, no para borrarlos automáticamente.",
|
||||
),
|
||||
"tukey_fence": (
|
||||
"Vallas de Tukey (1,5·IQR)",
|
||||
"Regla clásica para marcar atípicos a partir de los cuartiles: se calcula "
|
||||
"el rango intercuartílico IQR = P75 − P25 y se trazan dos vallas, una "
|
||||
"inferior en P25 − 1,5·IQR y otra superior en P75 + 1,5·IQR. Los valores "
|
||||
"que caen fuera de esas vallas se consideran atípicos. Es robusta porque "
|
||||
"se apoya en la mediana y los cuartiles, no en la media.",
|
||||
),
|
||||
"zscore": (
|
||||
"z-score (puntuación típica)",
|
||||
"Mide a cuántas desviaciones típicas está un valor de la media de su "
|
||||
"columna: z = (valor − media) / desviación típica. Un |z| grande (aquí > "
|
||||
"3) señala un valor alejado del centro. A diferencia de las vallas de "
|
||||
"Tukey, el z-score usa media y desviación, así que es más sensible a la "
|
||||
"presencia de los propios atípicos.",
|
||||
),
|
||||
"isolation_forest": (
|
||||
"Isolation Forest (anomalías multivariantes)",
|
||||
"Algoritmo de detección de anomalías que considera TODAS las columnas a "
|
||||
"la vez: construye árboles que parten el espacio con cortes aleatorios y "
|
||||
"mide cuántos cortes hacen falta para aislar cada fila. Las filas raras "
|
||||
"se aíslan con muy pocos cortes y se marcan como atípicas según un umbral "
|
||||
"de contaminación. Detecta combinaciones de valores poco frecuentes que "
|
||||
"ninguna columna por separado revelaría.",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Lazy registry delegations (each degrades to None / no-op on any failure).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _load_build_boxplot_stats():
|
||||
try:
|
||||
from datascience.build_boxplot_stats import build_boxplot_stats
|
||||
return build_boxplot_stats
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
def _load_detect_outliers():
|
||||
# detect_outliers lives in the monolithic ``datascience.datascience`` module
|
||||
# (file_path datascience.py), not in its own submodule — try both shapes.
|
||||
try:
|
||||
from datascience.datascience import detect_outliers
|
||||
return detect_outliers
|
||||
except Exception: # noqa: BLE001
|
||||
try:
|
||||
from datascience import detect_outliers
|
||||
return detect_outliers
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
def _load_isolation_forest():
|
||||
try:
|
||||
from datascience.isolation_forest_outliers import isolation_forest_outliers
|
||||
return isolation_forest_outliers
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
def _load_summarize_dims():
|
||||
try:
|
||||
from datascience.summarize_outlier_dims import summarize_outlier_dims
|
||||
return summarize_outlier_dims
|
||||
except Exception: # noqa: BLE001
|
||||
return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Defensive formatters (own copy: the chapter never imports siblings).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _fmt_num(value, decimals: int = 3) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
if isinstance(value, bool):
|
||||
return "sí" if value else "no"
|
||||
if isinstance(value, int):
|
||||
return f"{value:,}".replace(",", ".")
|
||||
if isinstance(value, float):
|
||||
if value != value: # NaN
|
||||
return "—"
|
||||
if value in (float("inf"), float("-inf")):
|
||||
return str(value)
|
||||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _fmt_int(value) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{int(round(float(value))):,}".replace(",", ".")
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _fmt_pct(value, decimals: int = 2) -> str:
|
||||
"""Format an already-0-100 value as a percentage. None -> placeholder."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value):.{decimals}f}%"
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _is_dict(v) -> bool:
|
||||
return isinstance(v, dict)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Profile reads.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _numeric_columns(profile: dict) -> list:
|
||||
"""Return [(name, numeric_dict)] for numeric columns with usable stats."""
|
||||
out = []
|
||||
for col in profile.get("columns") or []:
|
||||
if not isinstance(col, dict):
|
||||
continue
|
||||
if col.get("inferred_type") != "numeric":
|
||||
continue
|
||||
num = col.get("numeric")
|
||||
if not isinstance(num, dict) or not num:
|
||||
continue
|
||||
if num.get("mean") is None and num.get("median") is None:
|
||||
continue
|
||||
out.append((col.get("name") or "(columna)", num))
|
||||
return out
|
||||
|
||||
|
||||
def _clean_values(raw):
|
||||
"""Return the finite float values of a raw column list (drop None/NaN/inf)."""
|
||||
if not isinstance(raw, (list, tuple)):
|
||||
return None
|
||||
vals = []
|
||||
for v in raw:
|
||||
if v is None or isinstance(v, bool):
|
||||
continue
|
||||
try:
|
||||
f = float(v)
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if f != f or f in (float("inf"), float("-inf")):
|
||||
continue
|
||||
vals.append(f)
|
||||
return vals
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Per-column univariate summary.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _univariate_row(name, numeric, raw_vals, box_fn, detect_fn):
    """Build the univariate summary of one column plus its boxplot inputs.

    With raw values available, the Tukey count is taken over the cleaned sample
    against the fences returned by ``box_fn`` and the z count comes from
    ``detect_fn``. Without raw values the row degrades to the profile's own
    ``n_outliers`` / ``outlier_pct`` and to the fence flags of the box stats.

    Returns:
        A dict with the table cells (``name``, counts, percentages, fences,
        ``extremes``), the ``box`` stats, the ``fliers`` list, ``has_raw`` and
        the ``contamination`` ranking metric (``-1.0`` when unknown).
    """
    # Fences come from the boxplot helper; any failure leaves them unknown.
    try:
        box = (box_fn(numeric) or {}) if box_fn is not None else {}
    except Exception:  # noqa: BLE001
        box = {}
    lf, uf = box.get("lower_fence"), box.get("upper_fence")

    def _beyond_fences(v):
        too_low = lf is not None and v < lf
        too_high = uf is not None and v > uf
        return too_low or too_high

    n_tukey = pct_tukey = n_z = pct_z = None
    low_extreme = high_extreme = None
    fliers = []

    vals = _clean_values(raw_vals)
    if vals:
        n = len(vals)
        outside = [v for v in vals if _beyond_fences(v)]
        n_tukey = len(outside)
        pct_tukey = 100.0 * n_tukey / n
        if outside:
            low_extreme, high_extreme = min(outside), max(outside)
        fliers = outside[:_MAX_FLIERS]
        # z-score rule via the registry function (returns parallel bools).
        if detect_fn is not None:
            try:
                flagged = int(sum(
                    1 for flag in (detect_fn(vals, _Z_THRESH) or []) if flag))
                n_z, pct_z = flagged, 100.0 * flagged / n
            except Exception:  # noqa: BLE001
                n_z = pct_z = None
        contamination = pct_tukey  # ranking prefers the Tukey percentage.
    else:
        # No raw sample for this column: only the profile's own outlier
        # count/pct are available (the original notes they come from the
        # z-score block); the Tukey count stays unknown, only the fence flags
        # are usable.
        n_z = numeric.get("n_outliers")
        pct_z = numeric.get("outlier_pct")
        if box.get("has_low_outliers") and box.get("min") is not None:
            low_extreme = box.get("min")
        if box.get("has_high_outliers") and box.get("max") is not None:
            high_extreme = box.get("max")
        contamination = pct_z if isinstance(pct_z, (int, float)) else None

    # Compact "extremos atípicos" cell: down/up arrows for the low/high tail.
    arrows = []
    if low_extreme is not None:
        arrows.append(f"↓ {_fmt_num(low_extreme)}")
    if high_extreme is not None:
        arrows.append(f"↑ {_fmt_num(high_extreme)}")

    rank_key = contamination if isinstance(contamination, (int, float)) else -1.0
    return {
        "name": model._safe_str(name),
        "n_tukey": n_tukey,
        "pct_tukey": pct_tukey,
        "n_z": n_z,
        "pct_z": pct_z,
        "lower_fence": lf,
        "upper_fence": uf,
        "extremes": " ".join(arrows) if arrows else "—",
        "box": box,
        "fliers": fliers,
        "has_raw": bool(vals),
        "contamination": rank_key,
    }
|
||||
|
||||
|
||||
def _univariate_table(rows: list) -> model.DataTable:
    """Turn the per-column summary rows into the chapter's ``DataTable``.

    Each row dict contributes one table line; counts and percentages that are
    ``None`` are shown as ``"—"``.
    """
    def _cell(formatter, value):
        # Unknown counts / percentages are rendered with the placeholder.
        return formatter(value) if value is not None else "—"

    columns = ["Columna", "Atípicos Tukey", "% Tukey", "Atípicos z", "% z",
               "Valla inf.", "Valla sup.", "Extremos atípicos"]
    body = [
        [
            row["name"],
            _cell(_fmt_int, row["n_tukey"]),
            _cell(_fmt_pct, row["pct_tukey"]),
            _cell(_fmt_int, row["n_z"]),
            _cell(_fmt_pct, row["pct_z"]),
            _fmt_num(row["lower_fence"]),
            _fmt_num(row["upper_fence"]),
            row["extremes"],
        ]
        for row in rows
    ]
    return model.DataTable(
        header=columns,
        rows=body,
        title="Valores atípicos por columna",
        note="Tukey = fuera de las vallas 1,5·IQR · z = |z-score| > 3 · "
             "ordenado de más a menos contaminada")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Multivariate (Isolation Forest) section.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _resolve_multivariate(profile: dict, ctx: dict, raw_numeric):
    """Pick the Isolation Forest result to report; return ``(result, source)``.

    A live run over ``raw_numeric`` is preferred (source ``"live"``): per the
    original note, that keeps the detector and ``summarize_outlier_dims`` on
    the same numeric columns and the same valid-row indexing, so the per-row
    dimension breakdown points at the right rows. When no raw sample is
    available or the live run is unusable, the precomputed
    ``profile['models']['outliers']`` block is used (source ``"precomputed"``).
    With neither, ``(None, "none")`` is returned.
    """
    def _usable(result):
        return (_is_dict(result)
                and result.get("n_outliers") is not None
                and result.get("n_rows_used"))

    if _is_dict(raw_numeric) and raw_numeric:
        detector = _load_isolation_forest()
        if detector is not None:
            try:
                live = detector(raw_numeric)
                if _usable(live):
                    return live, "live"
            except Exception:  # noqa: BLE001 — fall back to the precomputed block.
                pass

    # Fallback: the model block already stored in the profile (no raw sample to
    # recompute against, so no per-row dimension breakdown either).
    models = profile.get("models")
    if not _is_dict(models):
        models = {}
    precomputed = models.get("outliers")
    if _usable(precomputed):
        return precomputed, "precomputed"
    return None, "none"
|
||||
|
||||
|
||||
def _multivariate_blocks(outliers: dict, raw_numeric, mark: bool) -> list:
    """Build the blocks of the multivariate (Isolation Forest) section.

    Always emits the heading, the explanatory paragraph and the summary
    key/value table. When ``outliers`` lists ``outlier_rows``, a table with the
    first ``_TOP_ROWS`` of them is appended; when a raw sample is available
    and ``summarize_outlier_dims`` succeeds, that table gains a column with
    the dimensions returned for each row.
    """
    isof = _term(mark, "isolation_forest", "**Isolation Forest**")
    intro = (
        f"Hasta aquí cada columna se ha mirado por separado. {isof} busca "
        "filas raras considerando **todas las columnas a la vez**: una fila "
        "puede ser normal en cada variable y aun así ser atípica por la "
        "**combinación** de sus valores (p. ej. una edad baja con una tarifa "
        "muy alta). La tabla resume cuántas filas se marcaron y el umbral de "
        "decisión.")
    blocks = [
        model.Heading(text="Filas atípicas (multivariante)", level=2),
        model.Markdown(text=intro),
        model.KVTable(rows=[
            ("Filas analizadas", _fmt_int(outliers.get("n_rows_used"))),
            ("Columnas consideradas", _fmt_int(outliers.get("n_features"))),
            ("Filas atípicas", _fmt_int(outliers.get("n_outliers"))),
            ("% filas atípicas", _fmt_pct(outliers.get("outlier_pct"))),
            ("Umbral de decisión", _fmt_num(outliers.get("threshold"), 4)),
        ], title="Anomalías multivariantes"),
    ]

    rows_in = outliers.get("outlier_rows") or []
    if not rows_in:
        return blocks

    # Look up, per anomalous row, the dimensions reported for it. This is only
    # attempted with a raw sample; any failure discards the enrichment.
    dims_by_row = {}
    summarize = (_load_summarize_dims()
                 if (_is_dict(raw_numeric) and raw_numeric) else None)
    if summarize is not None:
        try:
            for item in summarize(raw_numeric, rows_in, top_k=3) or []:
                if _is_dict(item) and item.get("row_index") is not None:
                    dims_by_row[item.get("row_index")] = item.get("dims") or []
        except Exception:  # noqa: BLE001
            dims_by_row = {}
    has_dims = bool(dims_by_row)

    header = ["Fila (entre válidas)", "Score"]
    if has_dims:
        header.append("Dimensiones que la hacen rara (col = valor, z)")

    table_rows = []
    for entry in rows_in[:_TOP_ROWS]:
        if not _is_dict(entry):
            continue
        ridx = entry.get("row_index")
        cells = [_fmt_int(ridx), _fmt_num(entry.get("score"), 4)]
        if has_dims:
            parts = [
                f"{model._safe_str(d.get('col'))} = {_fmt_num(d.get('value'))} "
                f"(z {_fmt_num(d.get('z'), 2)})"
                for d in (dims_by_row.get(ridx) or [])
                if _is_dict(d)
            ]
            cells.append("; ".join(parts) if parts else "—")
        table_rows.append(cells)

    if not table_rows:
        return blocks

    shown = len(table_rows)
    total = outliers.get("n_outliers")
    note = "las filas más anómalas primero (score más bajo = más rara)"
    if isinstance(total, int) and total > shown:
        note += f" — top {shown} de {total}"
    if not has_dims:
        note += (" · no se pudo recuperar la muestra cruda para explicar las "
                 "dimensiones de cada fila")
    blocks.append(model.DataTable(
        header=header, rows=table_rows,
        title="Filas más atípicas", note=note))
    return blocks
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Interpretation section.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _interpretation_block(mark: bool) -> model.Markdown:
    """Return the Markdown block that explains how to read the outliers.

    The text is fixed Spanish prose: an outlier is not necessarily an error,
    followed by the two cases to tell apart (data error vs. real extreme
    value) and the suggested actions (review, winsorize, re-express).

    Args:
        mark: forwarded to ``_term`` when rendering the "outlier" term.

    Returns:
        A ``model.Markdown`` carrying the assembled text.
    """
    outlier = _term(mark, "outlier", "atípico")

    lead = (
        f"**Un {outlier} no es necesariamente un error.** Conviene distinguir "
        "dos casos antes de actuar:")
    data_error = (
        "- **Error de dato** (medida, registro o unidad equivocada): una edad de "
        "200 años, un importe negativo donde no puede haberlo, un decimal "
        "desplazado. Estos sí se corrigen o se eliminan, idealmente en el origen.")
    real_extreme = (
        "- **Dato real extremo**: una observación legítima de la cola de la "
        "distribución (un cliente que gasta mucho más, una tarifa de lujo, un día "
        "de ventas excepcional). Borrarla sesga el análisis y oculta información "
        "valiosa.")
    advice = (
        "**Qué hacer.** Primero, **revisar** los valores señalados arriba contra "
        "su origen para decidir cuál de los dos casos es. Si son errores, "
        "corregirlos. Si son datos reales que distorsionan medias y modelos, hay "
        "alternativas a borrarlos: **winsorizar** (recortar los extremos a un "
        "percentil), o **re-expresar** la variable (por ejemplo una "
        "transformación logarítmica o la escalera de re-expresión de Tukey que "
        "este mismo perfil ya calcula para las columnas asimétricas), que suele "
        "domar la cola sin perder ninguna fila. La elección depende del objetivo: "
        "esta lectura es **exploratoria** —orienta dónde mirar—, no una regla "
        "automática de limpieza.")

    # Paragraphs are separated by a blank line; the two bullets by one newline.
    bullets = "\n".join((data_error, real_extreme))
    return model.Markdown(text="\n\n".join((lead, bullets, advice)))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def build_outliers(profile: dict, ctx: dict):
    """Build the OUTLIERS Chapter, or None if the dataset has no numeric column.

    Steps, in order: register the glossary terms when ``ctx['glossary']`` is a
    ``model.GlossaryCollector``; compute one univariate row per numeric column
    and sort them by their ``contamination`` value (descending); add the
    summary table, an explicit list of the most contaminated columns, a
    boxplots figure, the multivariate section (or a note when it cannot be
    resolved) and the interpretation text.

    Args:
        profile: table profile; ``None``/empty is treated as ``{}`` and a
            non-dict value yields ``None``.
        ctx: optional presentation context; reads ``glossary`` and
            ``raw_numeric`` here and is forwarded to ``_resolve_multivariate``.

    Returns:
        A ``model.Chapter`` or ``None``.
    """
    profile = profile or {}
    ctx = ctx or {}
    if not isinstance(profile, dict):
        return None

    numerics = _numeric_columns(profile)
    if not numerics:
        # The chapter does not apply to a dataset with no numerics.
        return None

    # Register glossary terms (if a collector is present) and mark them clickable.
    collector = ctx.get("glossary")
    mark = isinstance(collector, model.GlossaryCollector)
    if mark:
        for term_key, (label, definition) in _TERM_DEFS.items():
            collector.add(term_key, label, definition)

    raw_numeric = ctx.get("raw_numeric")
    if not isinstance(raw_numeric, dict):
        raw_numeric = {}

    box_fn = _load_build_boxplot_stats()
    detect_fn = _load_detect_outliers()

    # --- Univariate summary ------------------------------------------------- #
    # Rows are ranked by their "contamination" value; rows without it sort last.
    uni_rows = sorted(
        (_univariate_row(col_name, numeric, raw_numeric.get(col_name),
                         box_fn, detect_fn)
         for col_name, numeric in numerics),
        key=lambda row: row.get("contamination", -1.0),
        reverse=True,
    )

    outlier_term = _term(mark, 'outlier', 'atípico')
    fence_term = _term(mark, 'tukey_fence', 'vallas de Tukey')
    zscore_term = _term(mark, 'zscore', 'z-score')
    intro = (
        "Este capítulo reúne en un solo sitio el análisis de los **valores "
        "atípicos** de la tabla, que en el resto del informe aparecen dispersos. "
        f"Un {outlier_term} es una observación que se aparta "
        "mucho del grueso de los datos. Cada columna numérica se evalúa con dos "
        f"criterios complementarios: las {fence_term} "
        "(fuera de P25−1,5·IQR o P75+1,5·IQR, robusto a la propia cola) y el "
        f"{zscore_term} (|z| > 3, sensible a la media). La "
        "tabla está ordenada de la columna más contaminada a la menos.")

    blocks = [
        model.Heading(text=CHAPTER_TITLE, level=1),
        model.Markdown(text=intro),
        _univariate_table(uni_rows),
    ]

    # Flag the most contaminated columns explicitly.
    contaminated = [row["name"] for row in uni_rows
                    if row.get("contamination", -1.0) > 0]
    flagged = contaminated[:_TOP_FLAGGED]
    if flagged:
        names = ", ".join(f"**{col}**" for col in flagged)
        flagged_text = (
            f"Las columnas con mayor proporción de atípicos son {names}: "
            "concentran el grueso de los valores fuera de las vallas y son las "
            "primeras a revisar.")
        blocks.append(model.Markdown(text=flagged_text))

    # --- Boxplots figure ---------------------------------------------------- #
    with_box = [
        {"name": row["name"], "box": row["box"], "fliers": row.get("fliers")}
        for row in uni_rows
        if row.get("box")
    ]
    box_entries = with_box[:_TOP_BOX]
    if box_entries:
        def _boxplots_make(entries=box_entries):
            # Entries are bound as a default so the maker can be called later
            # with no arguments.
            try:
                from datascience.build_boxplots_figure import build_boxplots_figure
                return build_boxplots_figure(
                    entries,
                    title="Boxplots de Tukey por columna",
                    max_boxes=_TOP_BOX,
                )
            except Exception:  # noqa: BLE001 — minimal fallback figure.
                import matplotlib
                matplotlib.use("Agg")
                from matplotlib.figure import Figure
                placeholder = Figure(figsize=(5.0, 2.2))
                axes = placeholder.add_subplot(111)
                axes.text(0.5, 0.5, "(boxplots no disponibles)",
                          ha="center", va="center")
                axes.axis("off")
                return placeholder

        box_heading = model.Heading(text="Boxplots", level=2)
        box_explainer = model.Markdown(text=(
            "Cada caja abarca del primer al tercer cuartil (P25–P75), la línea "
            "interior es la mediana y los bigotes llegan hasta 1,5·IQR; los "
            "puntos son los valores que caen fuera de las vallas (atípicos por "
            "Tukey)."))
        box_figure = model.Figure(
            make=_boxplots_make,
            caption="Boxplots de Tukey de las columnas más contaminadas.")
        blocks.append(model.Group(blocks=[box_heading, box_explainer, box_figure]))

    # --- Multivariate ------------------------------------------------------- #
    outliers, _src = _resolve_multivariate(profile, ctx, raw_numeric)
    if outliers is None:
        blocks.append(model.Heading(text="Filas atípicas (multivariante)", level=2))
        blocks.append(model.Note(
            "No se pudo analizar la anomalía multivariante: hacen falta al menos "
            "dos columnas numéricas y la muestra cruda (o los modelos del perfil) "
            "para correr Isolation Forest."))
    else:
        blocks.extend(_multivariate_blocks(outliers, raw_numeric, mark))

    # --- Interpretation ----------------------------------------------------- #
    blocks.append(model.Heading(text="Cómo interpretar los atípicos", level=2))
    blocks.append(_interpretation_block(mark))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -0,0 +1,304 @@
|
||||
"""Tests for the OUTLIERS chapter — DoD: golden + edges + error path.
|
||||
|
||||
Self-contained: builds synthetic ``numeric`` blocks + a raw_numeric sample (no
|
||||
DuckDB) so the suite is fast and deterministic. Verifies that the chapter emits
|
||||
the univariate per-column table, a boxplots figure, the multivariate Isolation
|
||||
Forest section and the outlier≠error interpretation; that the most contaminated
|
||||
column is ranked first; that a profile with no numeric column yields None; that
|
||||
None/empty never raises; that the glossary terms are registered; and that the
|
||||
chapter renders into both PDF and PPTX without cutting its title.
|
||||
"""
|
||||
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from datascience.automatic_eda.chapters.outliers import (
|
||||
build_outliers, CHAPTER_VERSION, CHAPTER_TITLE, _TERM_DEFS,
|
||||
)
|
||||
from datascience.automatic_eda import model
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _percentile(sorted_vals, q):
|
||||
"""Linear-interpolation percentile (q in 0..1) on an already-sorted list."""
|
||||
if not sorted_vals:
|
||||
return None
|
||||
if len(sorted_vals) == 1:
|
||||
return float(sorted_vals[0])
|
||||
pos = q * (len(sorted_vals) - 1)
|
||||
lo = int(math.floor(pos))
|
||||
hi = int(math.ceil(pos))
|
||||
if lo == hi:
|
||||
return float(sorted_vals[lo])
|
||||
frac = pos - lo
|
||||
return float(sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac)
|
||||
|
||||
|
||||
def _col_from_values(values, nbins=10):
    """Build a ``numeric`` sub-block from a concrete list of raw values.

    The statistics are computed from the same values that the tests pass as the
    raw sample, so the profile percentiles and the raw sample stay consistent.
    The histogram is synthetic: ``nbins`` equal-width bins, each with count 1.

    Args:
        values: non-empty iterable of numbers (each is converted with ``float``).
        nbins: number of histogram bins to emit.

    Returns:
        A dict with min/max/mean/median/std, p25/p50/p75/iqr, the |z| > 3
        outlier count and percentage, a fixed ``distribution_type`` and the
        histogram.
    """
    vals = [float(v) for v in values]
    ordered = sorted(vals)
    n = len(ordered)
    mean = sum(vals) / n
    # Population variance (divides by n).
    std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n)

    median = _percentile(ordered, 0.5)
    p25 = _percentile(ordered, 0.25)
    p75 = _percentile(ordered, 0.75)
    lowest, highest = ordered[0], ordered[-1]

    # z-score outlier count (population), what the profile's n_outliers carries.
    if std > 0:
        n_out = sum(abs((v - mean) / std) > 3.0 for v in vals)
    else:
        n_out = 0

    width = (highest - lowest) / nbins if highest > lowest else 1.0
    hist = []
    for i in range(nbins):
        hist.append({"lo": lowest + i * width,
                     "hi": lowest + (i + 1) * width,
                     "count": 1})

    return {
        "min": lowest, "max": highest, "mean": mean, "median": median, "std": std,
        "p25": p25, "p50": median, "p75": p75, "iqr": (p75 - p25),
        "n_outliers": n_out, "outlier_pct": 100.0 * n_out / n,
        "distribution_type": "right-skewed", "histogram": hist,
    }
|
||||
|
||||
|
||||
def _fare_values():
|
||||
"""A heavy-tailed column (most ~10-30, a few 200-512): clear Tukey/z outliers."""
|
||||
base = [7.0 + (i % 25) for i in range(120)] # bulk 7..31
|
||||
tail = [180.0, 210.0, 263.0, 512.0] # extreme upper tail
|
||||
return base + tail
|
||||
|
||||
|
||||
def _age_values():
|
||||
"""A roughly symmetric column with one extreme low value."""
|
||||
base = [22.0 + (i % 40) for i in range(120)] # 22..61
|
||||
return base + [80.0, 0.5, 74.0, 1.0]
|
||||
|
||||
|
||||
def _quiet_values():
|
||||
"""A clean column with no atypical values."""
|
||||
return [50.0 + (i % 5) for i in range(124)]
|
||||
|
||||
|
||||
def _profile_and_ctx(with_models=True, with_raw=True):
    """Build the synthetic ``(profile, ctx)`` pair shared by the tests.

    The profile has three numeric columns (Fare, Age, Quiet) and one
    categorical column (Sexo).

    Args:
        with_models: when true, add a precomputed ``profile['models']['outliers']``.
        with_raw: when true, put the raw samples under ``ctx['raw_numeric']``.
    """
    samples = {
        "Fare": _fare_values(),
        "Age": _age_values(),
        "Quiet": _quiet_values(),
    }
    cols = [
        {"name": col_name, "inferred_type": "numeric",
         "numeric": _col_from_values(col_values)}
        for col_name, col_values in samples.items()
    ]
    cols.append({
        "name": "Sexo", "inferred_type": "categorical",
        "categorical": {"top": [{"value": "male", "count": 80}]},
    })

    profile = {
        "table": "titanic",
        "n_rows": len(samples["Fare"]),
        "n_cols": len(cols),
        "columns": cols,
    }
    if with_models:
        precomputed = {
            "n_outliers": 4, "outlier_pct": 3.2,
            "outlier_rows": [
                {"row_index": 123, "score": -0.21},
                {"row_index": 121, "score": -0.15},
            ],
            "threshold": -0.02, "n_rows_used": 124, "n_features": 3,
        }
        profile["models"] = {"outliers": precomputed}

    # The raw sample reuses the very same lists the profile was computed from.
    ctx = {"raw_numeric": dict(samples)} if with_raw else {}
    return profile, ctx
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, whitespace-collapsed.

    Pages whose extraction yields nothing contribute an empty string; any run
    of whitespace in the concatenated text becomes a single space.
    """
    reader = PdfReader(path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return re.sub(r"\s+", " ", "".join(page_texts))
|
||||
|
||||
|
||||
def _flatten(blocks):
|
||||
out = []
|
||||
for b in blocks:
|
||||
if getattr(b, "kind", "") == "group":
|
||||
out.extend(_flatten(getattr(b, "blocks", []) or []))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_estructura_y_secciones():
    """Golden: the chapter carries its title, the table, a figure and a KV table."""
    profile, ctx = _profile_and_ctx()
    ctx["glossary"] = model.GlossaryCollector()
    chapter = build_outliers(profile, ctx)
    assert chapter is not None
    assert chapter.id == "outliers"
    assert chapter.version == CHAPTER_VERSION

    flat = _flatten(chapter.blocks)
    kinds = [blk.kind for blk in flat]
    # Title heading + univariate DataTable + boxplots Figure + multivariate
    # KVTable + interpretation Markdown.
    assert kinds[0] == "heading" and flat[0].text == CHAPTER_TITLE
    table_titles = [blk.title for blk in flat if blk.kind == "data_table"]
    assert any(title and "atípicos por columna" in title for title in table_titles)
    assert "figure" in kinds, "falta la figura de boxplots"
    assert "kv_table" in kinds, "falta el resumen multivariante"

    # The boxplots figure maker yields a real matplotlib figure (or its fallback).
    figure_block = next(blk for blk in flat if blk.kind == "figure")
    fig = figure_block.make()
    assert fig is not None
    import matplotlib.pyplot as plt
    plt.close(fig)
|
||||
|
||||
|
||||
def test_golden_fare_es_la_mas_contaminada():
    """Golden: Fare (heavy tail) ranks first and the z-score columns are filled."""
    profile, ctx = _profile_and_ctx()
    chapter = build_outliers(profile, ctx)
    table = next(
        blk for blk in _flatten(chapter.blocks)
        if blk.kind == "data_table" and blk.title
        and "atípicos por columna" in blk.title
    )

    top_row = table.rows[0]
    first_col = top_row[0]
    assert first_col == "Fare", f"esperaba Fare primera, fue {first_col}"

    # % Tukey column (index 2) of the first row must be > 0.
    pct_cell = top_row[2]
    assert pct_cell not in ("—", "0%", "0.00%"), f"% Tukey de Fare vacío: {pct_cell}"

    # The z-score rule (detect_outliers) must actually run with raw_numeric: at
    # least one column reports a non-empty z count/percentage (regression guard
    # for the detect_outliers import path).
    z_pcts = [row[4] for row in table.rows]
    assert any(cell != "—" for cell in z_pcts), f"columna z toda vacía: {z_pcts}"
    z_counts = [row[3] for row in table.rows]
    assert any(cell != "—" for cell in z_counts), f"conteo z vacío: {z_counts}"
|
||||
|
||||
|
||||
def test_golden_interpretacion_outlier_no_es_error():
    """Golden: the Markdown says an outlier is not an error and lists the options."""
    profile, ctx = _profile_and_ctx()
    chapter = build_outliers(profile, ctx)
    markdown_texts = [blk.text for blk in _flatten(chapter.blocks)
                      if blk.kind == "markdown"]
    lowered = " ".join(markdown_texts).lower()
    assert "no es necesariamente un error" in lowered
    # Mentions the actionable options (winsorize / re-express).
    assert "winsoriz" in lowered
    assert "re-expres" in lowered or "logarítmic" in lowered
|
||||
|
||||
|
||||
def test_golden_terminos_glosario_registrados():
    """Golden: every term in ``_TERM_DEFS`` is registered and marked in the text."""
    profile, ctx = _profile_and_ctx()
    gloss = model.GlossaryCollector()
    ctx["glossary"] = gloss
    build_outliers(profile, ctx)
    for key in _TERM_DEFS:
        assert gloss.has(key), f"término '{key}' no registrado en el glosario"

    # Terms are marked clickable in the body text (chapter is built a second time).
    rebuilt = build_outliers(profile, ctx)
    md = " ".join(blk.text for blk in _flatten(rebuilt.blocks)
                  if blk.kind == "markdown")
    assert "[[term:outlier]]" in md and "[[term:tukey_fence]]" in md
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Multivariate.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_multivariante_live_con_raw_y_dims():
    """With a raw sample and no precomputed models, rows and dimensions are listed."""
    # With a raw sample the chapter runs Isolation Forest live (over the same
    # columns summarize_outlier_dims uses) and lists the anomalous rows with the
    # dimensions that make each one rare.
    profile, ctx = _profile_and_ctx(with_models=False, with_raw=True)
    chapter = build_outliers(profile, ctx)
    flat = _flatten(chapter.blocks)

    kv = next(blk for blk in flat if blk.kind == "kv_table")
    kv_text = " ".join(f"{k} {v}" for (k, v) in kv.rows)
    assert "Filas atípicas" in kv_text

    # A non-zero number of anomalous rows is reported.
    n_cell = dict(kv.rows).get("Filas atípicas")
    assert n_cell not in (None, "—", "0"), f"sin filas atípicas: {n_cell}"

    # The anomalous-rows table carries the per-row dimension breakdown.
    tbls = [blk for blk in flat
            if blk.kind == "data_table" and blk.title
            and "más atípicas" in blk.title]
    assert tbls, "falta la tabla de filas más atípicas"
    assert any("hacen rara" in h for h in tbls[0].header), \
        f"falta la columna de dimensiones: {tbls[0].header}"
|
||||
|
||||
|
||||
def test_multivariante_precomputed_sin_raw():
    """Without a raw sample the precomputed ``n_outliers`` (4) reaches the KV table."""
    # Without a raw sample the chapter falls back to profile['models']['outliers']
    # (lite preset path); the precomputed n_outliers (4) surfaces in the KV table.
    profile, ctx = _profile_and_ctx(with_models=True, with_raw=False)
    chapter = build_outliers(profile, ctx)
    kv = next(blk for blk in _flatten(chapter.blocks) if blk.kind == "kv_table")
    values = [str(v) for (k, v) in kv.rows]
    assert any("4" in text for text in values)
|
||||
|
||||
|
||||
def test_multivariante_ausente_degrada_a_nota():
    """No models and no raw sample: the chapter still builds and emits a note."""
    # No models and no raw sample → an honest note, never a crash.
    profile, ctx = _profile_and_ctx(with_models=False, with_raw=False)
    chapter = build_outliers(profile, ctx)
    assert chapter is not None
    note_texts = [blk.text for blk in _flatten(chapter.blocks)
                  if blk.kind == "note"]
    assert any("Isolation Forest" in text for text in note_texts)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edges / error path.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_edge_sin_columnas_numericas_devuelve_none():
    """A profile whose only column is categorical yields ``None``."""
    only_categorical = {
        "name": "c", "inferred_type": "categorical",
        "categorical": {"top": [{"value": "x", "count": 3}]},
    }
    prof = {"columns": [only_categorical]}
    assert build_outliers(prof, {}) is None
|
||||
|
||||
|
||||
def test_edge_solo_texto_sintetico_devuelve_none():
    """A text-only synthetic table (no numeric column) yields ``None``."""
    text_column = {"name": "comentario", "inferred_type": "text",
                   "text": {"n_docs": 3}}
    prof = {"table": "notas", "n_rows": 3, "n_cols": 1,
            "columns": [text_column]}
    assert build_outliers(prof, {}) is None
|
||||
|
||||
|
||||
def test_edge_profile_none_y_vacio_no_revienta():
    """``None`` and empty inputs return ``None`` instead of raising."""
    cases = [(None, None), ({}, {}), ({"columns": []}, {})]
    for profile, ctx in cases:
        assert build_outliers(profile, ctx) is None
|
||||
|
||||
|
||||
def test_edge_sin_raw_numeric_degrada_a_perfil():
    """Without ``raw_numeric`` the univariate table still exists with 8-cell rows."""
    # Without raw_numeric the chapter still builds, using the profile z-score
    # counts; the univariate table exists and Tukey counts degrade to '—'.
    profile, ctx = _profile_and_ctx(with_models=True, with_raw=False)
    chapter = build_outliers(profile, ctx)
    assert chapter is not None
    table = next(
        blk for blk in _flatten(chapter.blocks)
        if blk.kind == "data_table" and blk.title
        and "atípicos por columna" in blk.title
    )
    # Only the row width is checked here: every row keeps its 8 cells.
    row_widths = [len(row) for row in table.rows]
    assert all(width == 8 for width in row_widths)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Anti-cut render.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_render_pdf_y_pptx_incluyen_el_capitulo():
    """The chapter title and a column name appear in the PDF; the PPTX has slides."""
    profile, ctx = _profile_and_ctx()
    # The profile is passed straight to the renderers, which build the whole
    # document; the chapter is expected to show up in that output.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "out.pdf")
        pdf_result = render_automatic_eda_pdf(
            profile, pdf_path, {"write_manifest": False, "ctx": ctx})
        assert pdf_result["path"] == pdf_path
        extracted = _pdf_text(pdf_path)
        assert CHAPTER_TITLE in extracted, "el capítulo OUTLIERS no aparece en el PDF"
        assert "Fare" in extracted

        pptx_path = os.path.join(workdir, "out.pptx")
        pptx_result = render_automatic_eda_pptx(
            profile, pptx_path, {"write_manifest": False, "ctx": ctx})
        assert pptx_result["path"] == pptx_path
        assert pptx_result["n_slides"] >= 1
|
||||
@@ -2,8 +2,17 @@
|
||||
|
||||
Builds the document cover from a TableProfile plus an optional ``ctx`` of
|
||||
presentation metadata. Reads everything defensively (``.get``) and degrades
|
||||
honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
|
||||
placeholder rather than invented, leaving a hook for the LLM layer to fill it.
|
||||
honestly.
|
||||
|
||||
The dataset size (N rows x M columns) is always shown big, as a heading right
|
||||
under the dataset name (kept together in a ``Group``), not buried in the
|
||||
metadata table. The Description and Granularity are resolved through a cascade
|
||||
so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
|
||||
(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
|
||||
``row_meaning``; otherwise a short summary is derived from the profile itself
|
||||
(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
|
||||
key-candidate columns or the table shape. Nothing is invented: the derived
|
||||
fallbacks state that they come from the profile.
|
||||
|
||||
Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
|
||||
build_<id>(profile: dict, ctx: dict) -> Chapter | None
|
||||
@@ -17,10 +26,15 @@ from datetime import datetime, timezone
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "portada"
|
||||
CHAPTER_TITLE = "Portada"
|
||||
|
||||
# Key under which eda_llm_insights stores its interpretive block in the profile.
|
||||
# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
|
||||
# row represents) from it when the LLM layer ran (``run_llm``).
|
||||
_LLM_KEY = "llm"
|
||||
|
||||
# Default human description of what the table quality score measures. Chapters
|
||||
# can override it via ctx["quality_criteria"].
|
||||
_DEFAULT_QUALITY_CRITERIA = (
|
||||
@@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str:
|
||||
return s
|
||||
|
||||
|
||||
def _llm_block(profile: dict, ctx: dict) -> dict:
    """Return the interpretive LLM block, or ``{}`` when none is available.

    The block is looked up under ``_LLM_KEY`` first in ``profile`` and, only if
    that value is not a dict, in ``ctx``. Anything that is not a dict degrades
    to an empty dict so callers can use ``.get`` unconditionally.
    """
    for source in (profile, ctx):
        candidate = source.get(_LLM_KEY)
        if isinstance(candidate, dict):
            return candidate
    return {}
|
||||
|
||||
|
||||
def _count_column_types(profile: dict, ctx: dict):
|
||||
"""Best-effort (n_numeric, n_categorical) for the dataset.
|
||||
|
||||
Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
|
||||
over the whole body); falls back to counting the profile columns directly so
|
||||
the cover still has the numbers when no summary was passed.
|
||||
"""
|
||||
summary = ctx.get("document_summary")
|
||||
if isinstance(summary, dict):
|
||||
n_num = summary.get("n_numeric")
|
||||
n_cat = summary.get("n_categorical")
|
||||
if n_num is not None or n_cat is not None:
|
||||
return n_num, n_cat
|
||||
cols = profile.get("columns") or []
|
||||
n_num = sum(1 for c in cols if isinstance(c, dict)
|
||||
and c.get("inferred_type") == "numeric")
|
||||
n_cat = sum(1 for c in cols if isinstance(c, dict)
|
||||
and isinstance(c.get("categorical"), dict)
|
||||
and c.get("categorical", {}).get("top")
|
||||
and c.get("inferred_type") != "numeric")
|
||||
return n_num, n_cat
|
||||
|
||||
|
||||
def _derive_description(profile: dict, ctx: dict) -> str:
    """Return a short description of the dataset derived from the profile.

    The text states the shape (rows and columns), adds the numeric/categorical
    counts when they are non-zero, adds the quality score when the profile has
    one, and always ends with a sentence saying the summary was derived from
    the profile.
    """
    n_rows = profile.get("n_rows")
    n_cols = profile.get("n_cols")
    n_num, n_cat = _count_column_types(profile, ctx)

    head = f"Conjunto de datos con {_fmt_int(n_rows)} filas y {_fmt_int(n_cols)} columnas"
    type_bits = [
        f"{_fmt_int(count)} {label}"
        for count, label in ((n_num, "numéricas"), (n_cat, "categóricas"))
        if count
    ]
    if type_bits:
        head += " (" + ", ".join(type_bits) + ")"

    sentences = [head + "."]
    score = profile.get("quality_score")
    if score is not None:
        sentences.append(f"Calidad media estimada: {score}/100.")
    sentences.append(
        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
        "para una descripción de negocio más rica.")
    return " ".join(sentences)
|
||||
|
||||
|
||||
def _derive_granularity(profile: dict, dataset_name: str) -> str:
|
||||
"""A ``Cada fila es…`` granularity sentence from the profile.
|
||||
|
||||
Prefers the key-candidate columns (a row is identified by them); when no key
|
||||
is detected, falls back to the table shape so the line is always meaningful
|
||||
and starts with ``Cada fila es`` as the user requested."""
|
||||
keys = profile.get("key_candidates") or []
|
||||
if keys:
|
||||
shown = ", ".join(str(k) for k in keys[:3])
|
||||
more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
|
||||
return (f"Cada fila es un registro identificado por {shown}{more}, "
|
||||
"candidata(s) a clave por ser únicas y sin nulos.")
|
||||
n_rows = profile.get("n_rows")
|
||||
tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
|
||||
return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
|
||||
"columna identificadora única, así que la granularidad se infiere "
|
||||
"de la forma de la tabla." + tail)
|
||||
|
||||
|
||||
def build_portada(profile: dict, ctx: dict):
|
||||
"""Build the cover Chapter, or None if there is truly nothing to show."""
|
||||
profile = profile or {}
|
||||
@@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict):
|
||||
quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
|
||||
quality_value = "—" if score is None else f"{score} / 100"
|
||||
|
||||
# Granularity: ctx wins; else derive from key candidates; else be honest.
|
||||
llm = _llm_block(profile, ctx)
|
||||
|
||||
# Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
|
||||
# candidates; finally a shape-based fallback. Always a real "Cada fila es…".
|
||||
granularity = ctx.get("granularity")
|
||||
if not granularity:
|
||||
keys = profile.get("key_candidates") or []
|
||||
if keys:
|
||||
granularity = ("Cada fila parece identificada por "
|
||||
+ ", ".join(str(k) for k in keys[:3]) + ".")
|
||||
else:
|
||||
granularity = ("Cada fila es… (granularidad no determinada — "
|
||||
"pendiente de la capa de cálculo/LLM).")
|
||||
granularity = (llm.get("row_meaning") or "").strip() or None
|
||||
if not granularity:
|
||||
granularity = _derive_granularity(profile, str(dataset_name))
|
||||
|
||||
# Description: explicit ctx wins; then the LLM "summary"; finally a short
|
||||
# profile-derived summary. Never the old empty placeholder.
|
||||
description = ctx.get("description")
|
||||
if not description:
|
||||
description = ("Descripción no provista — pendiente de la capa LLM "
|
||||
"(`run_llm`) o de `ctx['description']`.")
|
||||
description = (llm.get("summary") or "").strip() or None
|
||||
if not description:
|
||||
description = _derive_description(profile, ctx)
|
||||
|
||||
blocks = [
|
||||
# Title + dataset size shown together and BIG (Heading) at the top, kept on
|
||||
# the same page (Group). The size is no longer buried in the metadata table.
|
||||
cover = [
|
||||
model.Heading(text=str(dataset_name), level=1),
|
||||
model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
|
||||
model.Heading(text=shape, level=2),
|
||||
]
|
||||
|
||||
blocks = [
|
||||
model.Group(blocks=cover),
|
||||
model.KVTable(rows=[
|
||||
("Fuente", source_origin),
|
||||
("Almacenamiento", storage),
|
||||
("Generado", when),
|
||||
("Tamaño", shape),
|
||||
("Calidad", quality_value),
|
||||
("Criterios de calidad", quality_criteria),
|
||||
]),
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies the Fase 4b improvements:
|
||||
|
||||
1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
|
||||
heading kept together with the dataset name in a ``Group`` — and is no longer
|
||||
a row of the metadata table.
|
||||
2. Description and Granularity are resolved through a real cascade and are never
|
||||
the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
|
||||
block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
|
||||
short summary is derived from the profile and a "Cada fila es…" sentence from
|
||||
the key-candidate columns or the table shape.
|
||||
3. The chapter degrades without raising on empty/None input.
|
||||
4. It renders inside the full document to both PDF and PPTX showing that content.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
|
||||
from datascience.automatic_eda.chapters.portada import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_portada,
|
||||
)
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
|
||||
prof = {
|
||||
"table": "titanic",
|
||||
"source": "/data/titanic.csv",
|
||||
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||
"n_rows": 891,
|
||||
"n_cols": 12,
|
||||
"quality_score": 78.0,
|
||||
"columns": [
|
||||
{"name": "PassengerId", "inferred_type": "numeric",
|
||||
"null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
|
||||
"max": 891.0, "std": 257.0}},
|
||||
{"name": "Survived", "inferred_type": "numeric",
|
||||
"null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
|
||||
"max": 1.0, "std": 0.49}},
|
||||
{"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
|
||||
"categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
|
||||
{"value": "female", "count": 314,
|
||||
"pct": 0.35}],
|
||||
"mode": "male", "n_distinct": 2, "entropy": 0.93}},
|
||||
],
|
||||
}
|
||||
if with_keys:
|
||||
prof["key_candidates"] = ["PassengerId"]
|
||||
if with_llm:
|
||||
prof["llm"] = {
|
||||
"summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
|
||||
"row_meaning": "Cada fila es un pasajero del Titanic.",
|
||||
"dictionary": [], "pii": [], "cleaning": [], "analyses": [],
|
||||
}
|
||||
return prof
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, whitespace-collapsed.

    Pages whose extraction yields nothing contribute an empty string; any run
    of whitespace in the concatenated text becomes a single space.
    """
    reader = PdfReader(path)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return re.sub(r"\s+", " ", "".join(page_texts))
|
||||
|
||||
|
||||
def _pptx_text(path: str) -> str:
    """Return the visible text of the PPTX at *path* as one string.

    Collects the text frame of every shape and the text of every table cell
    (row-major), joins the pieces with spaces and collapses whitespace runs.
    """
    fragments = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                fragments.append(shape.text_frame.text)
            if not shape.has_table:
                continue
            table = shape.table
            fragments.extend(
                table.cell(r, c).text
                for r in range(len(table.rows))
                for c in range(len(table.columns))
            )
    return re.sub(r"\s+", " ", " ".join(fragments))
|
||||
|
||||
|
||||
def _markdown_after(blocks, heading_text):
|
||||
"""Return the Markdown block that follows a Heading whose text matches."""
|
||||
for i, b in enumerate(blocks):
|
||||
if isinstance(b, Heading) and heading_text.lower() in b.text.lower():
|
||||
for nb in blocks[i + 1:]:
|
||||
if isinstance(nb, Markdown):
|
||||
return nb
|
||||
return None
|
||||
|
||||
|
||||
def test_golden_tamano_grande_y_textos_llm():
    """Golden: size shown as a big heading next to the title; texts from the LLM."""
    chapter = build_portada(_profile(), {})
    assert chapter is not None
    assert (chapter.id, chapter.version) == (CHAPTER_ID, CHAPTER_VERSION)

    # 1) Title + size kept together in a Group; size is a BIG level-2 heading.
    cover_group = next(b for b in chapter.blocks if isinstance(b, Group))
    grouped = cover_group.blocks
    title = grouped[0]
    assert isinstance(title, Heading) and title.level == 1
    assert title.text == "titanic"
    size_heading = next(
        b for b in grouped if isinstance(b, Heading) and b.level == 2)
    for token in ("891", "12", "filas", "columnas"):
        assert token in size_heading.text

    # 2) Size is no longer a row of the metadata table.
    meta_table = next(b for b in chapter.blocks if isinstance(b, KVTable))
    labels = [row[0] for row in meta_table.rows]
    assert "Tamaño" not in labels
    assert "Fuente" in labels
    assert "Calidad" in labels

    # 3) Description and Granularity come from the LLM block.
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description is not None
    assert "Titanic" in description.text
    assert granularity is not None
    assert granularity.text.startswith("Cada fila es")
    assert "pasajero" in granularity.text.lower()
|
||||
|
||||
|
||||
def test_fallback_sin_llm_usa_keys_y_perfil():
    """No LLM block: description derives from the profile, granularity from keys."""
    chapter = build_portada(_profile(with_llm=False, with_keys=True), {})
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")

    # Description is the derived summary, never the old "pendiente" placeholder.
    desc_text = description.text
    assert "pendiente" not in desc_text.lower()
    assert "891" in desc_text
    assert "columnas" in desc_text
    assert any(word in desc_text for word in ("numéricas", "categóricas"))

    # Granularity mentions the key candidate and starts with "Cada fila es".
    gran_text = granularity.text
    assert gran_text.startswith("Cada fila es")
    assert "PassengerId" in gran_text
    assert "…" not in gran_text  # the old ellipsis placeholder is gone.
|
||||
|
||||
|
||||
def test_fallback_sin_llm_sin_keys_usa_forma():
    """No LLM block and no key candidates: granularity still names the table."""
    bare_profile = _profile(with_llm=False, with_keys=False)
    chapter = build_portada(bare_profile, {})
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert granularity.text.startswith("Cada fila es")
    lowered = granularity.text.lower()
    assert "titanic" in lowered
    assert "pendiente" not in lowered
|
||||
|
||||
|
||||
def test_ctx_explicito_gana_sobre_llm():
    """Texts given explicitly in ctx take precedence over the LLM block."""
    manual_description = "Descripción manual."
    manual_granularity = "Cada fila es una unidad manual."
    ctx = {"description": manual_description, "granularity": manual_granularity}
    chapter = build_portada(_profile(), ctx)
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description.text == manual_description
    assert granularity.text == manual_granularity
|
||||
|
||||
|
||||
def test_edge_perfil_vacio_no_lanza():
    """Empty / None inputs never raise; the cover still shows a size and texts."""
    edge_inputs = (({}, {}), (None, None))
    for profile, ctx in edge_inputs:
        chapter = build_portada(profile, ctx)
        assert chapter is not None

        cover_group = next(b for b in chapter.blocks if isinstance(b, Group))
        size_heading = next(
            b for b in cover_group.blocks
            if isinstance(b, Heading) and b.level == 2)
        assert "filas" in size_heading.text
        assert "columnas" in size_heading.text

        description = _markdown_after(chapter.blocks, "Descripción")
        granularity = _markdown_after(chapter.blocks, "Granularidad")
        assert description.text
        assert "pendiente" not in description.text.lower()
        assert granularity.text.startswith("Cada fila es")
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_portada():
    """Golden: the full PDF render contains the cover chapter and its content."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        result = render_automatic_eda_pdf(profile, pdf_path, {"title": "EDA"})
        assert result["path"] == pdf_path
        assert os.path.exists(pdf_path)
        rendered_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in rendered_ids

        text = _pdf_text(pdf_path)
        assert "titanic" in text.lower()
        for token in ("891", "filas", "columnas"):
            assert token in text
        assert "Titanic" in text  # LLM summary in the Description.
        assert "Cada fila es" in text  # granularity sentence.
|
||||
|
||||
|
||||
def test_golden_render_pptx_muestra_portada():
    """Golden: the full PPTX render contains the cover chapter and its content."""
    profile = _profile()
    with tempfile.TemporaryDirectory() as workdir:
        pptx_path = os.path.join(workdir, "eda.pptx")
        result = render_automatic_eda_pptx(profile, pptx_path, {"title": "EDA"})
        assert result["path"] == pptx_path
        assert os.path.exists(pptx_path)
        rendered_ids = [chapter["id"] for chapter in result["chapters"]]
        assert CHAPTER_ID in rendered_ids

        text = _pptx_text(pptx_path)
        assert "titanic" in text.lower()
        assert "891" in text
        assert "columnas" in text
        assert "Cada fila es" in text
|
||||
@@ -0,0 +1,499 @@
|
||||
"""Key-relations chapter (RELACIONES) — the keys / join structure of the data.
|
||||
|
||||
This chapter is the *relational* section of an AutomaticEDA report. It answers a
|
||||
single question for the table (or the whole DuckDB source it lives in): **how do
|
||||
the keys relate?** It composes, without reimplementing them, the registry's
|
||||
relation primitives and degrades honestly when a layer does not apply.
|
||||
|
||||
It renders, in order, only the layers that have something to say:
|
||||
|
||||
1. **Declared keys** (real schema constraints) — when the DuckDB source declares
|
||||
PRIMARY KEY / FOREIGN KEY / UNIQUE constraints, they are read verbatim via
|
||||
``detect_declared_keys_duckdb`` and shown as ground truth: which column is the
|
||||
PK, which columns are FKs and the table/column they point to.
|
||||
2. **Primary-key candidates** — the ``key_candidates`` the TableProfile already
|
||||
carries (columns whose cardinality equals the row count, with no nulls). These
|
||||
are *candidates*: a column that could serve as the row identifier.
|
||||
3. **Foreign-key candidates** when none are declared:
|
||||
- **Inter-table** (the DuckDB source has several tables): real FK candidates by
|
||||
name signal + value containment via ``infer_fk_containment_duckdb``, plus the
|
||||
join graph (roles + a pasteable Mermaid diagram) via ``build_join_graph``.
|
||||
- **Intra-table** (a single table): columns that *look* like a foreign key by a
|
||||
name+cardinality heuristic (``suggest_intratable_fk_candidates``). This is a
|
||||
**suggestion**, explicitly flagged as a heuristic, never an assertion.
|
||||
|
||||
``build_relaciones(profile, ctx) -> Chapter | None``: returns ``None`` when there
|
||||
is nothing to say (no declared key, no key candidates, and no FK candidate —
|
||||
inter- or intra-table). Reads everything defensively (``.get``) and never raises:
|
||||
anything missing degrades to a note or is omitted; a failing registry call drops
|
||||
its layer instead of aborting the chapter.
|
||||
|
||||
ctx keys this chapter consumes (all optional):
|
||||
db_path, table : str — the DuckDB file and table being profiled (set by
|
||||
``build_eda_render_ctx``). ``db_path`` is needed to read declared
|
||||
constraints, to list the sibling tables, and to run the containment-based
|
||||
FK inference. Without it, only the profile-derived layers (PK candidates,
|
||||
intra-table FK heuristic) are available.
|
||||
glossary : model.GlossaryCollector — shared glossary; the chapter registers
|
||||
the relational terms (PK, FK, containment, cardinality) and marks their
|
||||
first appearance clickable.
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
# Pure/impure registry functions (group ``eda``) this chapter composes. Imported
|
||||
# defensively (module-leaf imports, like the AGREGACION chapter) so the chapter
|
||||
# still builds — degrading the affected layer to nothing — if a function is
|
||||
# somehow unavailable / not indexed yet.
|
||||
try:
|
||||
from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb
|
||||
except Exception: # noqa: BLE001 — keep the chapter importable no matter what.
|
||||
detect_declared_keys_duckdb = None # type: ignore[assignment]
|
||||
try:
|
||||
from datascience.infer_fk_containment_duckdb import infer_fk_containment_duckdb
|
||||
except Exception: # noqa: BLE001
|
||||
infer_fk_containment_duckdb = None # type: ignore[assignment]
|
||||
try:
|
||||
from datascience.build_join_graph import build_join_graph
|
||||
except Exception: # noqa: BLE001
|
||||
build_join_graph = None # type: ignore[assignment]
|
||||
try:
|
||||
from datascience.suggest_intratable_fk_candidates import (
|
||||
suggest_intratable_fk_candidates,
|
||||
)
|
||||
except Exception: # noqa: BLE001
|
||||
suggest_intratable_fk_candidates = None # type: ignore[assignment]
|
||||
try:
|
||||
from infra import duckdb_list_tables
|
||||
except Exception: # noqa: BLE001
|
||||
duckdb_list_tables = None # type: ignore[assignment]
|
||||
|
||||
# Chapter identity: id, human-readable title and contract version ("x.y.z").
CHAPTER_ID = "relaciones"
CHAPTER_TITLE = "Relaciones de clave"
CHAPTER_VERSION = "1.0.0"

# Upper bound on the rows of the inter-table FK table, so a wide schema does not
# blow up the page. Candidates beyond the cap are counted in the table's closing
# note rather than dropped silently.
MAX_FK_ROWS = 40
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Glossary terms this chapter explains. Registered in the shared collector and
|
||||
# marked clickable on their first appearance (contract §11.1).
|
||||
# --------------------------------------------------------------------------- #
|
||||
_TERMS = {
|
||||
"pk": (
|
||||
"Clave primaria (PK)",
|
||||
"Columna (o conjunto de columnas) que identifica de forma única cada fila "
|
||||
"de una tabla: sus valores no se repiten y no son nulos. Una tabla tiene "
|
||||
"como mucho una clave primaria; es el ancla por la que otras tablas la "
|
||||
"referencian.",
|
||||
),
|
||||
"fk": (
|
||||
"Clave foránea (FK)",
|
||||
"Columna de una tabla cuyos valores apuntan a la clave primaria de otra "
|
||||
"tabla (o de la misma), creando una relación entre ambas. Una FK suele ser "
|
||||
"N:1: muchas filas de la tabla origen comparten el mismo valor de la tabla "
|
||||
"destino.",
|
||||
),
|
||||
"containment": (
|
||||
"Containment / inclusión",
|
||||
"Señal con la que se infiere una clave foránea sin que la base la declare: "
|
||||
"la fracción de valores distintos de una columna A que también aparecen "
|
||||
"como valores de otra columna B. Si casi todos los valores de A están "
|
||||
"contenidos en B (inclusión ≈ 1) y B parece una clave, A → B es una FK "
|
||||
"candidata.",
|
||||
),
|
||||
"cardinalidad": (
|
||||
"Cardinalidad",
|
||||
"Número de valores distintos de una columna. Cardinalidad igual al número "
|
||||
"de filas (y sin nulos) señala un identificador (candidato a clave "
|
||||
"primaria); cardinalidad alta pero menor que el número de filas, con "
|
||||
"valores repetidos, es típica de una clave foránea.",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _register_terms(ctx: dict) -> bool:
    """Add every entry of ``_TERMS`` to the shared glossary found in *ctx*.

    Returns:
        ``True`` when ``ctx["glossary"]`` is a ``model.GlossaryCollector`` (the
        terms were added and in-text appearances should be marked clickable);
        ``False`` otherwise, in which case nothing is registered.
    """
    collector = ctx.get("glossary")
    usable = isinstance(collector, model.GlossaryCollector)
    if usable:
        for term_key in _TERMS:
            label, definition = _TERMS[term_key]
            collector.add(term_key, label, definition)
    return usable
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the other chapters' defensive style).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _fmt_int(value) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{int(value):,}".replace(",", ".")
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _fmt_pct_fraction(value, decimals: int = 1) -> str:
|
||||
"""Format a 0–1 fraction as a percentage. None -> placeholder."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
v = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
if v <= 1.0:
|
||||
v *= 100.0
|
||||
return f"{v:.{decimals}f}%"
|
||||
|
||||
|
||||
def _fmt_ratio(value, decimals: int = 3) -> str:
|
||||
"""Format an already-0–1 ratio (inclusion) as a plain number."""
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value):.{decimals}f}".rstrip("0").rstrip(".")
|
||||
except (TypeError, ValueError):
|
||||
return model._safe_str(value)
|
||||
|
||||
|
||||
def _is_dict(v) -> bool:
|
||||
return isinstance(v, dict)
|
||||
|
||||
|
||||
def _columns_by_name(profile: dict) -> dict:
|
||||
"""Index the profile columns by name for quick metric lookup."""
|
||||
out = {}
|
||||
for col in (profile.get("columns") or []):
|
||||
if _is_dict(col) and col.get("name") is not None:
|
||||
out[col.get("name")] = col
|
||||
return out
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Layer 1 — declared keys (real schema constraints).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _declared_keys(db_path: str, table: str):
    """Fetch the declared-constraints result for the source, or ``None``.

    Returns the dict produced by ``detect_declared_keys_duckdb(db_path, table)``
    when its ``status`` is ``"ok"``. Returns ``None`` when there is no
    *db_path*, the function could not be imported, the call raises, or the
    result is not an ``"ok"`` dict.
    """
    if not db_path or detect_declared_keys_duckdb is None:
        return None
    try:
        result = detect_declared_keys_duckdb(db_path, table)
    except Exception:  # noqa: BLE001 — dict-no-throw: treat as unavailable.
        return None
    if _is_dict(result) and result.get("status") == "ok":
        return result
    return None
|
||||
|
||||
|
||||
def _declared_section(declared: dict) -> list:
    """Build the blocks of the declared-keys layer.

    Reads ``primary_keys``, ``foreign_keys`` and ``unique`` from *declared*
    (non-dict entries are ignored) and emits a heading, an intro paragraph and
    one table per non-empty kind. Returns ``[]`` when nothing is declared.
    """
    def entries(key):
        return [item for item in (declared.get(key) or []) if _is_dict(item)]

    def joined(names):
        return ", ".join(model._safe_str(name) for name in (names or []))

    primary = entries("primary_keys")
    foreign = entries("foreign_keys")
    unique = entries("unique")
    if not any((primary, foreign, unique)):
        return []

    intro = (
        "La base **declara** estas relaciones de clave como restricciones "
        "reales del esquema (constraints). Son la verdad de referencia: no se "
        "infieren, se leen tal cual de la definición de las tablas.")
    section = [
        model.Heading(text="Claves declaradas en el esquema", level=2),
        model.Markdown(text=intro),
    ]

    if primary:
        pk_rows = []
        for pk in primary:
            pk_rows.append(
                [model._safe_str(pk.get("table")), joined(pk.get("columns"))])
        section.append(model.DataTable(
            header=["Tabla", "Columna(s) PK"],
            rows=pk_rows,
            title="Claves primarias declaradas",
            note="Cada fila: la clave primaria declarada de una tabla."))

    if foreign:
        fk_rows = [
            [
                model._safe_str(fk.get("table")),
                joined(fk.get("columns")),
                model._safe_str(fk.get("referenced_table")),
                joined(fk.get("referenced_columns")),
            ]
            for fk in foreign
        ]
        section.append(model.DataTable(
            header=["Tabla origen", "Columna(s) FK", "→ Tabla destino",
                    "Columna(s) destino"],
            rows=fk_rows,
            title="Claves foráneas declaradas",
            note="Cada fila: una FK declarada — origen → destino."))

    if unique:
        uq_rows = []
        for uq in unique:
            uq_rows.append(
                [model._safe_str(uq.get("table")), joined(uq.get("columns"))])
        section.append(model.DataTable(
            header=["Tabla", "Columna(s) UNIQUE"],
            rows=uq_rows,
            title="Restricciones UNIQUE declaradas"))

    return section
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Layer 2 — primary-key candidates (from the profile).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _pk_candidates_section(profile: dict, mark: bool) -> list:
    """Build the blocks of the primary-key-candidates layer.

    Args:
        profile: TableProfile dict; reads ``key_candidates``, ``columns`` and
            ``n_rows``.
        mark: whether to wrap the glossary terms (pk, cardinalidad) in
            ``[[term:…]]`` markers.

    Returns:
        A heading, an intro paragraph and one table with a row per candidate
        (name, distinct values, % unique, type); ``[]`` when the profile has no
        key candidates. Metrics of a candidate missing from ``columns`` are
        shown as placeholders.
    """
    keys = [k for k in (profile.get("key_candidates") or []) if k is not None]
    if not keys:
        return []
    by_name = _columns_by_name(profile)

    if mark:
        # The bold span goes INSIDE the term marker, as in the FK sections.
        # Wrapping the marker in an outer bold
        # ("**candidatas a [[term:pk]]**clave primaria**[[/term]]**") nests
        # "**" pairs, so the bold closes in the middle of the term marker.
        intro = (
            "Columnas **candidatas a** [[term:pk]]**clave primaria**[[/term]]: su "
            "[[term:cardinalidad]]cardinalidad[[/term]] iguala al número de filas y "
            "no tienen nulos. Son candidatas, no una clave declarada: la base no "
            "las marca como tal.")
    else:
        intro = (
            "Columnas **candidatas a clave primaria**: su cardinalidad iguala al "
            "número de filas y no tienen nulos. Son candidatas, no una clave "
            "declarada.")

    rows = []
    for name in keys:
        col = by_name.get(name) or {}
        rows.append([
            model._safe_str(name),
            _fmt_int(col.get("distinct_count")),
            _fmt_pct_fraction(col.get("unique_pct")),
            model._safe_str(col.get("inferred_type") or col.get("physical_type") or "—"),
        ])
    return [
        model.Heading(text="Candidatos a clave primaria", level=2),
        model.Markdown(text=intro),
        model.DataTable(
            header=["Columna", "Valores distintos", "% único", "Tipo"],
            rows=rows, title="Candidatas a clave primaria",
            note=f"{_fmt_int(profile.get('n_rows'))} filas en total como referencia."),
    ]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Layer 3a — inter-table FK candidates (containment) + join graph.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _list_source_tables(db_path: str) -> list:
    """Return the table names reported by ``duckdb_list_tables(db_path)``.

    Only string entries of the result's ``tables`` are kept. Returns ``[]``
    when there is no *db_path*, the function could not be imported, the call
    raises, or the result is not a dict with ``status == "ok"``.
    """
    if not db_path or duckdb_list_tables is None:
        return []
    try:
        listing = duckdb_list_tables(db_path)
    except Exception:  # noqa: BLE001
        return []
    if _is_dict(listing) and listing.get("status") == "ok":
        return [name for name in (listing.get("tables") or [])
                if isinstance(name, str)]
    return []
|
||||
|
||||
|
||||
def _inter_table_section(db_path: str, tables: list, mark: bool) -> list:
    """Build the blocks of the inter-table FK layer (containment + join graph).

    Args:
        db_path: DuckDB file passed to ``infer_fk_containment_duckdb``.
        tables: table names of the source; fewer than two disables the layer.
        mark: whether to wrap the glossary terms (fk, containment) in
            ``[[term:…]]`` markers.

    Returns:
        A heading, an intro paragraph, the FK-candidates table (capped at
        ``MAX_FK_ROWS`` rows, the cap reported in its note), an optional
        "Grafo de relaciones" group and an optional note listing skipped pairs.
        ``[]`` when the inference is unavailable, fails, is not ``"ok"`` or
        yields no candidates.
    """
    if infer_fk_containment_duckdb is None:
        return []
    if len(tables) < 2:
        return []
    try:
        inferred = infer_fk_containment_duckdb(db_path, tables=tables)
    except Exception:  # noqa: BLE001
        return []
    if not (_is_dict(inferred) and inferred.get("status") == "ok"):
        return []
    candidates = [c for c in (inferred.get("fk_candidates") or []) if _is_dict(c)]
    if not candidates:
        return []

    if mark:
        containment = "[[term:containment]]containment (inclusión de valores)[[/term]]"
        fk_term = "[[term:fk]]**claves foráneas**[[/term]]"
    else:
        containment = "containment (inclusión de valores)"
        fk_term = "**claves foráneas**"
    intro = (
        f"La fuente tiene varias tablas. Estas {fk_term} candidatas se "
        f"infieren por señal de nombre y por {containment}. No están "
        "declaradas por la base; son la relación más probable según los "
        "datos.")
    section = [
        model.Heading(text="Claves foráneas candidatas (inter-tabla)", level=2),
        model.Markdown(text=intro),
    ]

    text = model._safe_str
    visible = candidates[:MAX_FK_ROWS]
    fk_rows = [
        [
            f"{text(c.get('from_table'))}.{text(c.get('from_col'))}",
            f"{text(c.get('to_table'))}.{text(c.get('to_col'))}",
            _fmt_ratio(c.get("inclusion")),
            text(c.get("cardinality") or "—"),
            "sí" if c.get("name_match") else "no",
        ]
        for c in visible
    ]
    table_note = "Ordenadas por señal de nombre e inclusión."
    if len(visible) < len(candidates):
        # Make the cap explicit instead of truncating silently.
        table_note = (
            f"{table_note} Se muestran {len(visible)} de {len(candidates)} candidatas.")
    section.append(model.DataTable(
        header=["Origen", "→ Destino", "Inclusión", "Cardinalidad", "Coincide nombre"],
        rows=fk_rows, title="FK candidatas por containment", note=table_note))

    # Join graph: node roles + a pasteable Mermaid diagram, kept together.
    # The graph is built from ALL candidates, not only the rows shown above.
    graph = None
    if build_join_graph is not None:
        try:
            graph = build_join_graph(candidates, tables=tables)
        except Exception:  # noqa: BLE001
            graph = None
    if _is_dict(graph):
        graph_section = [model.Heading(text="Grafo de relaciones", level=3)]

        nodes = [n for n in (graph.get("nodes") or []) if _is_dict(n)]
        if nodes:
            node_rows = []
            for node in nodes:
                node_rows.append([
                    text(node.get("table")),
                    text(node.get("role") or "—"),
                    _fmt_int(node.get("out_degree")),
                    _fmt_int(node.get("in_degree")),
                ])
            graph_section.append(model.DataTable(
                header=["Tabla", "Rol", "FK salientes", "FK entrantes"],
                rows=node_rows, title="Tablas y su rol en el grafo",
                note="Rol: fact (apunta a otras), dimension (referenciada), "
                     "bridge (ambas), standalone (aislada)."))

        hubs = [h for h in (graph.get("hubs") or []) if h]
        if hubs:
            hub_list = ", ".join(text(h) for h in hubs)
            graph_section.append(model.Markdown(text=(
                "Tablas con más relaciones salientes (candidatas a tabla de "
                "hechos): " + hub_list + ".")))

        mermaid = text(graph.get("mermaid")).strip()
        if mermaid:
            graph_section.append(model.Markdown(text=(
                "Diagrama de las relaciones (pegable en un bloque Mermaid):")))
            graph_section.append(model.Markdown(
                text="```mermaid\n" + mermaid + "\n```"))

        # Only emit the group when something besides its heading was added.
        if len(graph_section) > 1:
            section.append(model.Group(blocks=graph_section,
                                       title="Grafo de relaciones"))

    skipped = [s for s in (inferred.get("skipped") or []) if s]
    if skipped:
        section.append(model.Note(
            "Algunos pares se omitieron por tamaño: "
            + "; ".join(text(s) for s in skipped) + "."))
    return section
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Layer 3b — intra-table FK candidates (name+cardinality heuristic).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _intra_table_section(profile: dict, mark: bool) -> list:
    """Build the blocks of the intra-table FK heuristic layer.

    Args:
        profile: TableProfile dict handed to ``suggest_intratable_fk_candidates``.
        mark: whether to wrap the fk glossary term in ``[[term:…]]`` markers.

    Returns:
        A heading, an intro paragraph, the suggestions table and a closing note
        stating that the suggestions are heuristic. ``[]`` when the function is
        unavailable, raises, or returns no dict candidates.
    """
    if suggest_intratable_fk_candidates is None:
        return []
    try:
        suggestions = suggest_intratable_fk_candidates(profile)
    except Exception:  # noqa: BLE001
        return []
    suggestions = [s for s in (suggestions or []) if _is_dict(s)]
    if not suggestions:
        return []

    if mark:
        fk_term = "[[term:fk]]**claves foráneas**[[/term]]"
    else:
        fk_term = "**claves foráneas**"
    intro = (
        "No hay otras tablas que referenciar, pero algunas columnas **parecen** "
        f"{fk_term} por su nombre (terminan en «id») y su cardinalidad (muchos "
        "valores repetidos, N:1). Es una **sugerencia heurística**, no una "
        "afirmación: el nombre de la tabla destino es una conjetura y no se "
        "comprueba inclusión de valores contra ninguna tabla real.")

    text = model._safe_str
    suggestion_rows = [
        [
            text(s.get("column")),
            text(s.get("ref_table_guess") or "—"),
            _fmt_int(s.get("distinct_count")),
            _fmt_pct_fraction(s.get("unique_pct")),
            text(s.get("inferred_type") or s.get("physical_type") or "—"),
            text(s.get("reason") or ""),
        ]
        for s in suggestions
    ]
    return [
        model.Heading(text="Posibles claves foráneas (heurística de nombre)", level=2),
        model.Markdown(text=intro),
        model.DataTable(
            header=["Columna", "Posible tabla", "Valores distintos", "% único",
                    "Tipo", "Motivo"],
            rows=suggestion_rows, title="Posibles FK por nombre y cardinalidad",
            note="Heurística: posibles falsos positivos/negativos. No confirma containment."),
        model.Note(
            "Estas sugerencias se basan solo en el nombre y la cardinalidad. Para "
            "confirmarlas haría falta la tabla destino y comprobar la inclusión de "
            "valores (containment)."),
    ]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _intro_blocks(mark: bool) -> list:
    """Return the chapter's opening blocks: level-1 title + intro paragraph.

    When *mark* is true the pk / fk terms of the paragraph are wrapped in
    ``[[term:…]]`` glossary markers.
    """
    if mark:
        pk = "[[term:pk]]clave primaria[[/term]]"
        fk = "[[term:fk]]clave foránea[[/term]]"
    else:
        pk = "clave primaria"
        fk = "clave foránea"
    paragraph = "".join([
        "Este capítulo analiza las **relaciones de clave** de la tabla: cuál es ",
        f"la {pk} y cuáles son las {fk}. Cuando la base las **declara** como ",
        "restricciones del esquema, se muestran tal cual; cuando no, se proponen ",
        "las más probables a partir de los datos —por containment entre tablas o, ",
        "en una sola tabla, por una heurística de nombre y cardinalidad— siempre ",
        "marcadas como candidatas, nunca como hechos.",
    ])
    title = model.Heading(text=CHAPTER_TITLE, level=1)
    return [title, model.Markdown(text=paragraph)]
|
||||
|
||||
|
||||
def build_relaciones(profile: dict, ctx: dict):
    """Build the RELACIONES Chapter, or None if there is nothing to say.

    Args:
        profile: the ``eda`` group TableProfile dict (may be None/empty).
        ctx: presentation context. Consumes ``db_path`` + ``table`` (to read
            declared constraints, list sibling tables and run the containment FK
            inference) and ``glossary`` (to register the relational terms).

    Returns:
        A ``model.Chapter`` with the applicable relation layers; or ``None`` when
        the dataset has no declared key, no key candidates and no FK candidate
        (neither inter- nor intra-table).

    Side effects:
        Registers the chapter's terms in ``ctx["glossary"]``, but only when a
        chapter is actually returned.
    """
    if not isinstance(profile, dict):
        profile = {}
    ctx = ctx if isinstance(ctx, dict) else {}
    db_path = ctx.get("db_path")
    table = ctx.get("table")

    # Terms are marked clickable only when a usable glossary is present. The
    # registration itself is deferred until the chapter is known to apply, so a
    # chapter that returns None adds nothing to the shared glossary.
    mark = isinstance(ctx.get("glossary"), model.GlossaryCollector)

    # Build each layer; the chapter is the concatenation of the non-empty ones.
    declared = _declared_keys(db_path, table)
    declared_blocks = _declared_section(declared) if declared else []
    declared_has_fk = bool(declared and declared.get("foreign_keys"))

    pk_blocks = _pk_candidates_section(profile, mark)

    tables = _list_source_tables(db_path)
    inter_blocks = _inter_table_section(db_path, tables, mark)

    # The intra-table heuristic only makes sense when no real FK is available for
    # this table — neither declared nor inferred inter-table. Otherwise the real
    # relations already answer the question and the heuristic is just noise.
    if declared_has_fk or inter_blocks:
        intra_blocks = []
    else:
        intra_blocks = _intra_table_section(profile, mark)

    body = declared_blocks + pk_blocks + inter_blocks + intra_blocks
    if not body:
        return None  # chapter does not apply: nothing to say about relations.

    if mark:
        _register_terms(ctx)

    blocks = _intro_blocks(mark) + body
    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -0,0 +1,273 @@
|
||||
"""Tests for the RELACIONES chapter — DoD: golden(s) + edges + no-cut render.
|
||||
|
||||
Two goldens covering the two real paths of the chapter:
|
||||
|
||||
- **Intra-table** (a single table, no db source for relations): the chapter shows
|
||||
the primary-key candidates from the profile and the heuristic foreign-key
|
||||
suggestions (name + cardinality), explicitly flagged as a heuristic. Renders to
|
||||
PDF and PPTX with nothing cut.
|
||||
- **Inter-table** (a real DuckDB file with two related tables, customers/orders,
|
||||
with a declared FK): the chapter shows the declared keys, the containment-based
|
||||
FK candidates and the join graph (roles + a pasteable Mermaid diagram).
|
||||
|
||||
Edges: a profile with no key candidate and no FK-looking column returns None;
|
||||
``None`` / ``{}`` profiles do not raise. The chapter registers its glossary terms.
|
||||
|
||||
Layers that depend on the sibling registry functions delegated alongside this
|
||||
chapter (``detect_declared_keys_duckdb``, ``suggest_intratable_fk_candidates``)
|
||||
are asserted **conditionally on the function being importable**, so the chapter's
|
||||
honest-degradation contract is what is tested, never a hard dependency on import
|
||||
timing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import duckdb
|
||||
from pptx import Presentation
|
||||
from pypdf import PdfReader
|
||||
|
||||
from datascience.automatic_eda.chapters.relaciones import build_relaciones
|
||||
from datascience.automatic_eda.model import Chapter, Group, GlossaryCollector
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
# The optional sibling functions: their layers are asserted only when present.
|
||||
try:
|
||||
from datascience.detect_declared_keys_duckdb import detect_declared_keys_duckdb
|
||||
except Exception: # noqa: BLE001
|
||||
detect_declared_keys_duckdb = None
|
||||
try:
|
||||
from datascience.suggest_intratable_fk_candidates import (
|
||||
suggest_intratable_fk_candidates,
|
||||
)
|
||||
except Exception: # noqa: BLE001
|
||||
suggest_intratable_fk_candidates = None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Helpers.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _flatten(blocks) -> list:
|
||||
"""Flatten Group blocks so a test can inspect every leaf block."""
|
||||
out = []
|
||||
for b in blocks:
|
||||
if isinstance(b, Group):
|
||||
out.extend(_flatten(b.blocks))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
def _text_of(chapter: Chapter) -> str:
    """Join all visible text of *chapter* into one newline-separated string.

    For every leaf block (Groups flattened) it collects, in this order: the
    string-valued ``text`` / ``title`` / ``note`` attributes, the ``header``
    cells and the ``rows`` cells (a row that is not a list/tuple is stringified
    whole). Attributes a block lacks are ignored.
    """
    collected = []
    for block in _flatten(chapter.blocks):
        candidates = (getattr(block, name, None)
                      for name in ("text", "title", "note"))
        collected.extend(v for v in candidates if isinstance(v, str))

        header = getattr(block, "header", None)
        if isinstance(header, list):
            collected.extend(map(str, header))

        rows = getattr(block, "rows", None)
        if not isinstance(rows, list):
            continue
        for row in rows:
            if isinstance(row, (list, tuple)):
                collected.extend(map(str, row))
            else:
                collected.append(str(row))
    return "\n".join(collected)
|
||||
|
||||
|
||||
def _render_both(chapter: Chapter, tag: str):
    """Render the chapter to PDF and PPTX; return (pdf_text, n_slides).

    Both files are written into a temporary directory that is removed when the
    function returns (the previous ``mkdtemp`` version left one directory
    behind per call). Everything that reads the files happens inside the
    ``with`` block, so nothing refers to the deleted paths afterwards.

    Args:
        chapter: the chapter to render on its own.
        tag: short label used in the temp-dir prefix and in the document title.

    Returns:
        ``(text, n_slides)``: the concatenated text extracted from every PDF
        page, and the number of slides in the PPTX.

    Raises:
        AssertionError: if either output file is missing or empty.
    """
    meta = {"title": f"EDA — {tag}"}
    with tempfile.TemporaryDirectory(prefix=f"relaciones_{tag}_") as tmp:
        pdf_path = os.path.join(tmp, "out.pdf")
        pptx_path = os.path.join(tmp, "out.pptx")
        render_automatic_eda_pdf([chapter], pdf_path, meta)
        render_automatic_eda_pptx([chapter], pptx_path, meta)
        assert os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0
        assert os.path.exists(pptx_path) and os.path.getsize(pptx_path) > 0
        # Extract while the files still exist; only plain values leave the block.
        text = "".join(p.extract_text() or "" for p in PdfReader(pdf_path).pages)
        n_slides = len(Presentation(pptx_path).slides)
    return text, n_slides
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Fixtures.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _titanic_profile() -> dict:
|
||||
"""A single-table profile: a PK candidate + a column that looks like a FK."""
|
||||
return {
|
||||
"table": "titanic",
|
||||
"source": "/data/titanic.csv",
|
||||
"n_rows": 891,
|
||||
"n_cols": 4,
|
||||
"key_candidates": ["PassengerId"],
|
||||
"columns": [
|
||||
{"name": "PassengerId", "inferred_type": "numeric",
|
||||
"physical_type": "BIGINT", "distinct_count": 891,
|
||||
"unique_pct": 1.0, "flags": ["possible_id"]},
|
||||
{"name": "ticket_id", "inferred_type": "numeric",
|
||||
"physical_type": "BIGINT", "distinct_count": 681,
|
||||
"unique_pct": 0.76, "flags": []},
|
||||
{"name": "fare", "inferred_type": "numeric",
|
||||
"physical_type": "DOUBLE", "distinct_count": 248,
|
||||
"unique_pct": 0.28, "flags": []},
|
||||
{"name": "sex", "inferred_type": "categorical",
|
||||
"physical_type": "VARCHAR", "distinct_count": 2,
|
||||
"unique_pct": 0.002, "flags": []},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _make_relational_db(path: str) -> None:
    """Create a small DuckDB with customers(id) <- orders(customer_id), real FK.

    The schema declares ``orders.customer_id`` as ``REFERENCES customers(id)``
    and loads 5 customers and 8 orders, every order pointing at an existing
    customer.

    Args:
        path: filesystem path of the DuckDB database file to create.

    The connection is closed in a ``finally`` clause so a failing statement
    does not leave the database file open (the original only closed it on the
    success path).
    """
    con = duckdb.connect(path)
    try:
        con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)")
        con.execute(
            "CREATE TABLE orders(id INTEGER PRIMARY KEY, "
            "customer_id INTEGER REFERENCES customers(id), amount DOUBLE)")
        # Parents first: the FK on orders requires the customers to exist.
        con.execute("INSERT INTO customers VALUES "
                    "(1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e')")
        con.execute("INSERT INTO orders VALUES "
                    "(1,1,10.0),(2,1,20.0),(3,2,30.0),(4,3,40.0),"
                    "(5,3,50.0),(6,4,60.0),(7,5,70.0),(8,2,80.0)")
    finally:
        con.close()
|
||||
|
||||
|
||||
def _orders_profile() -> dict:
|
||||
"""A profile for the `orders` table of the relational DB."""
|
||||
return {
|
||||
"table": "orders",
|
||||
"source": "orders",
|
||||
"n_rows": 8,
|
||||
"n_cols": 3,
|
||||
"key_candidates": ["id"],
|
||||
"columns": [
|
||||
{"name": "id", "inferred_type": "numeric", "physical_type": "INTEGER",
|
||||
"distinct_count": 8, "unique_pct": 1.0, "flags": ["possible_id"]},
|
||||
{"name": "customer_id", "inferred_type": "numeric",
|
||||
"physical_type": "INTEGER", "distinct_count": 5, "unique_pct": 0.625,
|
||||
"flags": []},
|
||||
{"name": "amount", "inferred_type": "numeric", "physical_type": "DOUBLE",
|
||||
"distinct_count": 8, "unique_pct": 1.0, "flags": []},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden 1 — intra-table.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_intra_table_pk_and_fk_heuristic():
    """Single table: the PK candidate is reported, the FK heuristic is reported
    when its function is importable, and the chapter renders to PDF + PPTX."""
    glossary = GlossaryCollector()
    # No db_path in ctx: only the profile-derived layers can apply.
    chapter = build_relaciones(_titanic_profile(), {"glossary": glossary})

    assert isinstance(chapter, Chapter)
    assert chapter.id == "relaciones"
    text = _text_of(chapter)

    # The PK candidate comes straight from the profile.
    for needle in ("Candidatos a clave primaria", "PassengerId"):
        assert needle in text

    # The chapter must have registered its glossary terms.
    for key in ("pk", "fk", "cardinalidad"):
        assert glossary.has(key)

    # FK heuristic layer: only asserted when the delegated function imported.
    # These checks look for the layer's labels and the suggested column; they
    # do not verify which columns were excluded from the suggestions.
    if suggest_intratable_fk_candidates is not None:
        for needle in ("Posibles claves foráneas", "ticket_id",
                       "Posibles FK por nombre"):
            assert needle in text

    pdf_text, n_slides = _render_both(chapter, "intra")
    assert "PassengerId" in pdf_text
    assert n_slides >= 1
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden 2 — inter-table (real DuckDB).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_inter_table_containment_and_join_graph():
    """Two related tables in a real DuckDB file: containment FK candidate and
    Mermaid join graph are always expected; the declared-keys layer is expected
    only when its function is importable."""
    workdir = tempfile.mkdtemp(prefix="relaciones_db_")
    db_path = os.path.join(workdir, "shop.duckdb")
    _make_relational_db(db_path)

    glossary = GlossaryCollector()
    ctx = {"db_path": db_path, "table": "orders", "glossary": glossary}
    chapter = build_relaciones(_orders_profile(), ctx)

    assert isinstance(chapter, Chapter)
    text = _text_of(chapter)

    expected = (
        # Inter-table containment FK candidate: customer_id -> customers.id.
        "Claves foráneas candidatas (inter-tabla)",
        "orders.customer_id",
        "customers.id",
        # Join graph with a pasteable Mermaid diagram.
        "Grafo de relaciones",
        "mermaid",
        "graph LR",
    )
    for needle in expected:
        assert needle in text
    assert "containment" in text.lower()

    # Declared-keys layer: only asserted when the delegated function imported.
    if detect_declared_keys_duckdb is not None:
        for needle in ("Claves declaradas en el esquema",
                       "Claves foráneas declaradas"):
            assert needle in text

    pdf_text, n_slides = _render_both(chapter, "inter")
    assert "customer_id" in pdf_text
    assert n_slides >= 1
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edges.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_none_when_no_relations():
    """A profile with no key candidates, no FK-looking columns and no database
    source in ctx must yield None."""
    columns = [
        {"name": "value", "inferred_type": "numeric", "physical_type": "DOUBLE",
         "distinct_count": 50, "unique_pct": 0.5, "flags": []},
        {"name": "label", "inferred_type": "categorical",
         "physical_type": "VARCHAR", "distinct_count": 3, "unique_pct": 0.03,
         "flags": []},
    ]
    prof = {
        "table": "flat",
        "n_rows": 100,
        "n_cols": 2,
        "key_candidates": [],
        "columns": columns,
    }
    chapter = build_relaciones(prof, {})
    assert chapter is None
|
||||
|
||||
|
||||
def test_empty_and_none_profile_do_not_raise():
    """A None or empty profile (with None, empty or glossary-only ctx) must
    produce None rather than an exception."""
    cases = (
        (None, None),
        ({}, {}),
        ({}, {"glossary": GlossaryCollector()}),
    )
    for profile, ctx in cases:
        assert build_relaciones(profile, ctx) is None
|
||||
|
||||
|
||||
def test_pk_candidate_only_builds_chapter():
    """A lone key candidate (no FK-looking column, no db in ctx) is enough for
    the relations chapter to be built and to report the PK candidate."""
    row_id = {
        "name": "row_id", "inferred_type": "numeric", "physical_type": "BIGINT",
        "distinct_count": 10, "unique_pct": 1.0, "flags": ["possible_id"],
    }
    prof = {
        "table": "t",
        "n_rows": 10,
        "n_cols": 1,
        "key_candidates": ["row_id"],
        "columns": [row_id],
    }
    chapter = build_relaciones(prof, {})
    assert isinstance(chapter, Chapter)
    visible_text = _text_of(chapter)
    assert "Candidatos a clave primaria" in visible_text
|
||||
@@ -0,0 +1,559 @@
|
||||
"""Free-text / NLP distributions chapter (TEXT DISTR) for AutomaticEDA.
|
||||
|
||||
First chapter for **non-tabular** content: it profiles the linguistic content of
|
||||
any column holding long free text (reviews, descriptions, comments, tickets) that
|
||||
the categorical chapter cannot meaningfully summarize (high cardinality, many
|
||||
words per value). It is the cheap, model-free counterpart to ``cat_distr`` for
|
||||
columns that are prose rather than discrete labels.
|
||||
|
||||
Activation (returns ``None`` when it does not apply):
|
||||
|
||||
1. Cheap gate from the aggregated profile: at least one non-numeric column whose
|
||||
``categorical.len_mean`` (mean character length) is ``>= _MIN_LEN_CHARS``.
|
||||
A dataset whose only string columns are short labels (e.g. titanic's
|
||||
``Name``, ~27 chars) never passes this gate, so the chapter disappears with
|
||||
zero extra work and the existing report is untouched.
|
||||
2. Confirmation from a raw sample: each candidate column is sampled (push-down
|
||||
``extract_text_sample`` over ``ctx['db_path']``/``ctx['table']``, or an
|
||||
in-memory ``ctx['text_raw']`` for tests) and kept only if the **median word
|
||||
count is ``>= _MIN_WORDS``** — i.e. it is genuinely long text, not a long
|
||||
single token. If no column survives, the chapter returns ``None``.
|
||||
|
||||
Per surviving column the chapter emits, kept together on its own page/slide
|
||||
(``Group(page_break_before=...)``):
|
||||
|
||||
- a key/value summary (documents, length percentiles, vocabulary richness with
|
||||
**[[term:ttr]]TTR[[/term]]** and **[[term:hapax]]hapax legomena[[/term]]**,
|
||||
dominant language, exact-duplicate %, readability when available);
|
||||
- a word-count histogram figure;
|
||||
- a top-terms table + a horizontal bar figure;
|
||||
- bigram and trigram frequency tables;
|
||||
- a detected-language bar figure (when ``langdetect`` is available);
|
||||
- an optional word-cloud figure (only when ``wordcloud`` is installed);
|
||||
- a closing note on duplicates / readability degradation.
|
||||
|
||||
Every metric is delegated to pure ``eda`` registry functions
|
||||
(``compute_text_length_stats``, ``compute_vocabulary_stats``,
|
||||
``compute_top_ngrams``, ``detect_corpus_language``, ``compute_text_duplicates``,
|
||||
``compute_text_readability``) and the raw sample to ``extract_text_sample``; all
|
||||
are imported defensively so a missing function or optional library degrades that
|
||||
single piece to a note instead of aborting the chapter. Optional libraries
|
||||
(``langdetect``, ``textstat``, ``wordcloud``, ``datasketch``) are never required:
|
||||
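the piece is omitted when they are absent (a closing note reports a missing ``langdetect``, ``textstat`` or ``datasketch``; a missing ``wordcloud`` is dropped silently).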
|
||||
|
||||
Contract: build_<id>(profile, ctx) -> Chapter | None ; CHAPTER_VERSION = "x.y.z".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .. import model
|
||||
|
||||
# Chapter identity: these three values are passed to model.Chapter by
# build_text_distr (id, title, version); the title is also the level-1 heading.
CHAPTER_VERSION = "1.0.0"
CHAPTER_ID = "text_distr"
CHAPTER_TITLE = "Texto libre (NLP)"

# Cheap activation gate (characters): a non-numeric column whose mean string
# length reaches this is a candidate for "long text". Short labels (titanic's
# Name ≈ 27 chars) stay below it, so the chapter does not fire on them.
_MIN_LEN_CHARS = 50
# Confirmation gate (words): a candidate is kept only if its median document has
# at least this many words — genuine prose, not a long id/URL token.
_MIN_WORDS = 20
# Maximum number of text columns rendered; any further surviving columns are
# only counted in a closing note, so very wide datasets stay readable.
_MAX_TEXT_COLS = 5
# Raw text rows to sample per column when the chapter must extract them itself
# (used as the default when ctx does not provide a 'sample' value).
_SAMPLE_ROWS = 2000
# Rows shown in the frequency tables: _TOP_TERMS for the top-terms table (also
# the top_k requested from the vocabulary stats), _TOP_NGRAMS for the bigram /
# trigram tables (also the top_k requested from the n-gram function).
_TOP_TERMS = 15
_TOP_NGRAMS = 10

# Glossary terms this chapter explains (registered in the shared collector and
# marked clickable on first appearance — same mechanism as cat_distr's entropy
# term). Maps term key -> (label, definition); the definitions are user-facing
# Spanish text and are emitted verbatim.
_TERMS = {
    "ttr": (
        "TTR (type-token ratio)",
        "Riqueza léxica de un texto: número de palabras distintas (tipos) "
        "dividido por el número total de palabras (tokens). Vale 1 cuando no se "
        "repite ninguna palabra (máxima variedad) y baja hacia 0 cuando el "
        "vocabulario se repite mucho. Depende de la longitud del corpus, así que "
        "compara mejor textos de tamaño parecido."),
    "hapax": (
        "Hapax legomena",
        "Palabras que aparecen una sola vez en todo el corpus. Un porcentaje "
        "alto de hapax indica vocabulario muy variado o, a veces, ruido "
        "(erratas, identificadores, tokens raros). Se expresa como porcentaje "
        "sobre el número de palabras distintas."),
}
|
||||
|
||||
|
||||
def _fmt_int(value) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{int(value):,}".replace(",", ".")
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_num(value, decimals: int = 2) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
if isinstance(value, bool):
|
||||
return str(value)
|
||||
if isinstance(value, int):
|
||||
return f"{value:,}".replace(",", ".")
|
||||
if isinstance(value, float):
|
||||
if value != value: # NaN
|
||||
return "NaN"
|
||||
if value in (float("inf"), float("-inf")):
|
||||
return str(value)
|
||||
text = f"{value:.{decimals}f}".rstrip("0").rstrip(".")
|
||||
return text if text else "0"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fmt_pct(value, decimals: int = 1) -> str:
|
||||
if value is None:
|
||||
return "—"
|
||||
try:
|
||||
return f"{float(value):.{decimals}f}%"
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
|
||||
|
||||
def _truncate(text, limit: int = 40) -> str:
    """Shorten ``text`` to at most ``limit`` characters, ending in ``…`` if cut.

    The input is first converted with ``model._safe_str``. When it is longer
    than ``limit``, the first ``limit - 1`` characters (at least one) are kept,
    trailing whitespace is removed and an ellipsis is appended.
    """
    rendered = model._safe_str(text)
    if len(rendered) <= limit:
        return rendered
    keep = max(1, limit - 1)
    return rendered[:keep].rstrip() + "…"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Defensive wrappers around the registry functions: each returns the function's
|
||||
# output dict or a safe empty default, never raising and never importing at
|
||||
# module load (so the chapter stays importable even if a function is missing).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _length_stats(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
out = compute_text_length_stats(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _vocab_stats(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
out = compute_vocabulary_stats(texts, top_k=_TOP_TERMS)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _ngrams(texts, n) -> list:
|
||||
try:
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
out = compute_top_ngrams(texts, n=n, top_k=_TOP_NGRAMS)
|
||||
if isinstance(out, dict):
|
||||
return out.get("top") or []
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return []
|
||||
|
||||
|
||||
def _language(texts) -> dict:
|
||||
try:
|
||||
from datascience.detect_corpus_language import detect_corpus_language
|
||||
out = detect_corpus_language(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {"available": False, "distribution": [], "dominant": None}
|
||||
|
||||
|
||||
def _duplicates(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
out = compute_text_duplicates(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _readability(texts) -> dict:
|
||||
try:
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
out = compute_text_readability(texts)
|
||||
if isinstance(out, dict):
|
||||
return out
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
return {"available": False, "flesch": {}}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Candidate detection + raw sample acquisition.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _candidate_columns(profile: dict) -> list:
|
||||
"""Cheap gate: non-numeric columns whose mean char length reaches the
|
||||
threshold. Returns the list of column names (possibly empty)."""
|
||||
out = []
|
||||
for col in profile.get("columns") or []:
|
||||
if not isinstance(col, dict):
|
||||
continue
|
||||
if col.get("inferred_type") == "numeric":
|
||||
continue
|
||||
cat = col.get("categorical")
|
||||
if not isinstance(cat, dict):
|
||||
continue
|
||||
len_mean = cat.get("len_mean")
|
||||
if isinstance(len_mean, (int, float)) and not isinstance(len_mean, bool) \
|
||||
and len_mean >= _MIN_LEN_CHARS:
|
||||
name = col.get("name")
|
||||
if name:
|
||||
out.append(str(name))
|
||||
return out
|
||||
|
||||
|
||||
def _get_samples(profile: dict, ctx: dict, columns: list) -> dict:
|
||||
"""Return {col: [str, ...]} raw text samples for the candidate columns.
|
||||
|
||||
Prefers an in-memory ``ctx['text_raw']`` (used by tests); otherwise pushes a
|
||||
sample down to the database via ``extract_text_sample`` using ctx db_path /
|
||||
table. Never raises: returns {} when no sample can be obtained."""
|
||||
text_raw = ctx.get("text_raw")
|
||||
if isinstance(text_raw, dict) and text_raw:
|
||||
return {c: [str(v) for v in (text_raw.get(c) or []) if v is not None]
|
||||
for c in columns if text_raw.get(c)}
|
||||
|
||||
db_path = ctx.get("db_path")
|
||||
table = ctx.get("table")
|
||||
if not db_path or not table:
|
||||
return {}
|
||||
backend = ctx.get("backend") or "duckdb"
|
||||
sample = ctx.get("sample") or _SAMPLE_ROWS
|
||||
try:
|
||||
from datascience.extract_text_sample import extract_text_sample
|
||||
out = extract_text_sample(db_path, table, columns, backend=backend,
|
||||
sample=sample)
|
||||
if isinstance(out, dict) and out.get("status") == "ok":
|
||||
cols = out.get("columns")
|
||||
if isinstance(cols, dict):
|
||||
return {c: list(v) for c, v in cols.items() if v}
|
||||
except Exception: # noqa: BLE001 — dict-no-throw: no sample → chapter omits.
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _confirm_long_text(samples: dict) -> dict:
|
||||
"""Keep only columns whose median word count reaches _MIN_WORDS. Returns
|
||||
{col: length_stats_dict} for the survivors, in input order."""
|
||||
survivors = {}
|
||||
for col, texts in samples.items():
|
||||
stats = _length_stats(texts)
|
||||
words = stats.get("words") if isinstance(stats, dict) else None
|
||||
median = words.get("p50") if isinstance(words, dict) else None
|
||||
if isinstance(median, (int, float)) and not isinstance(median, bool) \
|
||||
and median >= _MIN_WORDS:
|
||||
survivors[col] = stats
|
||||
return survivors
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Figures (lazy matplotlib, scaled by the renderers — same style as num_distr).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _hist_figure(name: str, length_stats: dict):
|
||||
def make():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
fig = Figure(figsize=(6.2, 3.0))
|
||||
ax = fig.add_subplot(111)
|
||||
bins = (length_stats or {}).get("word_hist") or []
|
||||
drew = False
|
||||
for b in bins:
|
||||
if not isinstance(b, dict):
|
||||
continue
|
||||
lo, hi, count = b.get("lo"), b.get("hi"), b.get("count") or 0
|
||||
if lo is None or hi is None:
|
||||
continue
|
||||
width = (hi - lo) if hi > lo else max(abs(lo) * 1e-3, 1e-6)
|
||||
ax.bar(lo, count, width=width, align="edge", color="#9ec6df",
|
||||
edgecolor="#5b8aa6", linewidth=0.4)
|
||||
drew = True
|
||||
if not drew:
|
||||
ax.text(0.5, 0.5, "(sin datos de longitud)", ha="center",
|
||||
va="center", color="#8a8a8a", transform=ax.transAxes)
|
||||
ax.set_xlabel("palabras por documento", fontsize=8)
|
||||
ax.set_ylabel("nº de documentos", fontsize=8)
|
||||
ax.tick_params(labelsize=7)
|
||||
for spine in ("top", "right"):
|
||||
ax.spines[spine].set_visible(False)
|
||||
ax.set_title(f"Longitud de «{_truncate(name, 30)}»", fontsize=10,
|
||||
loc="left")
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
return make
|
||||
|
||||
|
||||
def _barh_figure(title: str, items: list, label_key: str, value_key: str,
|
||||
xlabel: str):
|
||||
"""Horizontal bar chart from [{label_key:..., value_key:...}, ...]."""
|
||||
def make():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
rows = [it for it in (items or []) if isinstance(it, dict)
|
||||
and isinstance(it.get(value_key), (int, float))]
|
||||
rows = rows[:12]
|
||||
fig = Figure(figsize=(6.2, max(2.2, 0.32 * len(rows) + 0.8)))
|
||||
ax = fig.add_subplot(111)
|
||||
if not rows:
|
||||
ax.text(0.5, 0.5, "(sin datos)", ha="center", va="center",
|
||||
color="#8a8a8a", transform=ax.transAxes)
|
||||
ax.axis("off")
|
||||
return fig
|
||||
labels = [_truncate(r.get(label_key), 28) for r in rows][::-1]
|
||||
values = [float(r.get(value_key) or 0) for r in rows][::-1]
|
||||
ypos = range(len(rows))
|
||||
ax.barh(list(ypos), values, color="#9ec6df", edgecolor="#5b8aa6",
|
||||
linewidth=0.4)
|
||||
ax.set_yticks(list(ypos))
|
||||
ax.set_yticklabels(labels, fontsize=7)
|
||||
ax.set_xlabel(xlabel, fontsize=8)
|
||||
ax.tick_params(labelsize=7)
|
||||
for spine in ("top", "right"):
|
||||
ax.spines[spine].set_visible(False)
|
||||
ax.set_title(_truncate(title, 44), fontsize=10, loc="left")
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
return make
|
||||
|
||||
|
||||
def _wordcloud_figure(texts):
|
||||
"""Word-cloud figure callable, or None if wordcloud is not installed."""
|
||||
try:
|
||||
import wordcloud # noqa: F401
|
||||
except Exception: # noqa: BLE001 — optional dependency: omit the figure.
|
||||
return None
|
||||
|
||||
def make():
|
||||
import matplotlib
|
||||
matplotlib.use("Agg")
|
||||
from matplotlib.figure import Figure
|
||||
from wordcloud import WordCloud
|
||||
fig = Figure(figsize=(6.2, 3.2))
|
||||
ax = fig.add_subplot(111)
|
||||
joined = " ".join(t for t in texts if isinstance(t, str))
|
||||
try:
|
||||
wc = WordCloud(width=800, height=400, background_color="white",
|
||||
colormap="viridis").generate(joined)
|
||||
ax.imshow(wc, interpolation="bilinear")
|
||||
except Exception: # noqa: BLE001
|
||||
ax.text(0.5, 0.5, "(nube de palabras no disponible)", ha="center",
|
||||
va="center", color="#8a8a8a", transform=ax.transAxes)
|
||||
ax.axis("off")
|
||||
fig.tight_layout()
|
||||
return fig
|
||||
return make
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Per-column block assembly.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _summary_kv(n_docs, length_stats, vocab, lang, dup, read):
    """Build the "Resumen del texto" key/value table for one text column.

    Always present: document count, character and word length figures
    (mean · p50 · p90 · p99), sentence figures (mean · max), vocabulary size
    with TTR, and hapax count with percentage. Added conditionally: dominant
    language (when ``lang["available"]``), exact duplicates (when
    ``dup["n_docs"]`` is truthy) and mean Flesch readability (when
    ``read["available"]``).
    """
    stats = length_stats or {}
    chars = stats.get("chars") or {}
    words = stats.get("words") or {}
    sents = stats.get("sentences") or {}

    def _mean_and_percentiles(section):
        # "mean · p50 · p90 · p99" for one section of the length stats.
        pieces = [_fmt_num(section.get("mean"))]
        pieces += [_fmt_int(section.get(key)) for key in ("p50", "p90", "p99")]
        return " · ".join(pieces)

    rows = []
    rows.append(("Documentos", _fmt_int(n_docs)))
    rows.append(("Caracteres (media · p50 · p90 · p99)",
                 _mean_and_percentiles(chars)))
    rows.append(("Palabras (media · p50 · p90 · p99)",
                 _mean_and_percentiles(words)))
    rows.append(("Frases (media · máx)",
                 f"{_fmt_num(sents.get('mean'))} · {_fmt_int(sents.get('max'))}"))
    rows.append(("Vocabulario (tokens · tipos · TTR)",
                 f"{_fmt_int(vocab.get('n_tokens'))} · "
                 f"{_fmt_int(vocab.get('n_types'))} · "
                 f"{_fmt_num(vocab.get('ttr'), 3)}"))
    rows.append(("Hapax legomena",
                 f"{_fmt_int(vocab.get('n_hapax'))} "
                 f"({_fmt_pct(vocab.get('hapax_pct'))})"))

    if isinstance(lang, dict) and lang.get("available"):
        dominant = lang.get("dominant")
        n_langs = len(lang.get("distribution") or [])
        rows.append(("Idioma dominante · nº idiomas",
                     f"{model._safe_str(dominant) or '—'} · {_fmt_int(n_langs)}"))

    if isinstance(dup, dict) and dup.get("n_docs"):
        rows.append(("Duplicados exactos",
                     f"{_fmt_int(dup.get('n_exact_dup'))} "
                     f"({_fmt_pct(dup.get('exact_dup_pct'))})"))

    if isinstance(read, dict) and read.get("available"):
        flesch = read.get("flesch") or {}
        rows.append(("Legibilidad Flesch (media)",
                     _fmt_num(flesch.get("mean"), 1)))

    return model.KVTable(rows=rows, title="Resumen del texto")
|
||||
|
||||
|
||||
def _terms_table(vocab) -> "model.DataTable | None":
    """Build the top-terms table from ``vocab["top_terms"]``, or None if empty.

    At most ``_TOP_TERMS`` entries are considered; non-dict entries are
    skipped. Each row is (truncated term, count, percentage).
    """
    entries = ((vocab or {}).get("top_terms") or [])[:_TOP_TERMS]
    rows = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        rows.append([
            _truncate(entry.get("term"), 32),
            _fmt_int(entry.get("count")),
            _fmt_pct(entry.get("pct")),
        ])
    if rows:
        return model.DataTable(header=["Término", "Conteo", "% tokens"],
                               rows=rows,
                               title="Términos más frecuentes",
                               note="stopwords ES+EN eliminadas")
    return None
|
||||
|
||||
|
||||
def _ngram_table(items, n_label) -> "model.DataTable | None":
    """Build an n-gram frequency table, or None when there is nothing to show.

    At most ``_TOP_NGRAMS`` entries of ``items`` are considered; non-dict
    entries are skipped. ``n_label`` is used as the first column header and in
    the table title.
    """
    rows = []
    for entry in (items or [])[:_TOP_NGRAMS]:
        if isinstance(entry, dict):
            rows.append([_truncate(entry.get("ngram"), 40),
                         _fmt_int(entry.get("count"))])
    if not rows:
        return None
    table_title = f"{n_label} más frecuentes"
    return model.DataTable(header=[n_label, "Conteo"], rows=rows,
                           title=table_title)
|
||||
|
||||
|
||||
def _dup_note(dup, lang, read) -> "model.Note | None":
|
||||
bits = []
|
||||
if isinstance(dup, dict):
|
||||
nd = dup.get("near_dup") or {}
|
||||
if nd.get("available"):
|
||||
bits.append(
|
||||
f"casi-duplicados detectados (MinHash, umbral "
|
||||
f"{_fmt_num(nd.get('threshold'))}): "
|
||||
f"{_fmt_int(nd.get('n_near_dup_docs'))} documentos")
|
||||
else:
|
||||
bits.append("near-duplicados no calculados (datasketch no instalado; "
|
||||
"se reportan solo los duplicados exactos por hash)")
|
||||
if isinstance(lang, dict) and not lang.get("available"):
|
||||
bits.append("detección de idioma omitida (langdetect no instalado)")
|
||||
if isinstance(read, dict) and not read.get("available"):
|
||||
bits.append("legibilidad omitida (textstat no instalado)")
|
||||
if not bits:
|
||||
return None
|
||||
return model.Note(" · ".join(bits))
|
||||
|
||||
|
||||
def _column_group(name, texts, length_stats, idx, mark_terms):
    """Assemble the Group of blocks describing one long-text column.

    Block order: level-2 heading, summary key/value table, word-count
    histogram, top-terms table with its bar figure (when there are terms),
    bigram and trigram tables (when non-empty), detected-language bar figure
    (when language data is available), word cloud (when available) and the
    closing note (when there is one). Every group except the first
    (``idx == 0``) asks for a page break before it. ``mark_terms`` is accepted
    but not used here.
    """
    vocab = _vocab_stats(texts)
    lang = _language(texts)
    dup = _duplicates(texts)
    read = _readability(texts)
    n_docs = (length_stats or {}).get("n_docs")

    heading = model.Heading(text=str(name), level=2)
    summary = _summary_kv(n_docs, length_stats, vocab, lang, dup, read)
    histogram = model.Figure(
        make=_hist_figure(name, length_stats),
        caption=f"Distribución de la longitud (palabras) de "
                f"«{_truncate(name, 30)}».")
    blocks = [heading, summary, histogram]

    terms_table = _terms_table(vocab)
    if terms_table is not None:
        terms_bars = model.Figure(
            make=_barh_figure(f"Top términos de «{_truncate(name, 24)}»",
                              vocab.get("top_terms"), "term", "count",
                              "conteo"),
            caption="Términos más frecuentes (barras).")
        blocks.extend([terms_table, terms_bars])

    for size, label in ((2, "Bigrama"), (3, "Trigrama")):
        ngram_table = _ngram_table(_ngrams(texts, size), label)
        if ngram_table is not None:
            blocks.append(ngram_table)

    has_languages = (isinstance(lang, dict) and lang.get("available")
                     and lang.get("distribution"))
    if has_languages:
        blocks.append(model.Figure(
            make=_barh_figure(f"Idiomas detectados en «{_truncate(name, 24)}»",
                              lang.get("distribution"), "lang", "count",
                              "documentos"),
            caption="Distribución de idiomas detectados (langdetect)."))

    cloud = _wordcloud_figure(texts)
    if cloud is not None:
        blocks.append(model.Figure(
            make=cloud, caption=f"Nube de palabras de «{_truncate(name, 30)}»."))

    closing = _dup_note(dup, lang, read)
    if closing is not None:
        blocks.append(closing)

    return model.Group(blocks=blocks, page_break_before=(idx > 0))
|
||||
|
||||
|
||||
def _intro_blocks(n_cols, mark_terms):
    """Return the chapter's opening blocks: level-1 heading + intro paragraph.

    When ``mark_terms`` is true, "TTR" and "hapax legomena" are wrapped in
    ``[[term:...]]`` markers inside the paragraph; otherwise they appear as
    plain text. ``n_cols`` is accepted but not used here.
    """
    if mark_terms:
        ttr = "[[term:ttr]]TTR[[/term]]"
        hapax = "[[term:hapax]]hapax legomena[[/term]]"
    else:
        ttr = "TTR"
        hapax = "hapax legomena"

    paragraph = (
        "Este capítulo perfila las columnas de **texto libre largo** del "
        "dataset (reseñas, descripciones, comentarios): contenido lingüístico "
        "que la distribución categórica no resume bien. Para cada columna se "
        "muestran la longitud de los documentos, la riqueza de vocabulario "
        f"(incluido el {ttr} y el porcentaje de {hapax}), los términos y "
        "n-gramas más frecuentes, los idiomas detectados y el nivel de "
        "duplicación. Las métricas son baratas y sin modelos pesados; las "
        "piezas que dependen de una librería opcional se omiten si no está "
        "instalada.")

    heading = model.Heading(text=CHAPTER_TITLE, level=1)
    body = model.Markdown(text=paragraph)
    return [heading, body]
|
||||
|
||||
|
||||
def build_text_distr(profile: dict, ctx: dict):
    """Build the free-text Chapter, or return None when it does not apply.

    Steps:
      1. Cheap gate on the profile (``_candidate_columns``).
      2. Raw samples for the candidates (``_get_samples``).
      3. Confirmation that the sampled text is long (``_confirm_long_text``).
    The chapter is skipped (None) as soon as any step yields nothing. If
    ``ctx['glossary']`` is a ``GlossaryCollector``, the chapter's terms are
    registered in it and marked in the intro text. At most ``_MAX_TEXT_COLS``
    surviving columns are rendered; a closing note reports how many were left
    out.

    Args:
        profile: table profile dict; None is treated as empty.
        ctx: build context dict; None is treated as empty.
    """
    profile = profile or {}
    ctx = ctx or {}

    # 1) Cheap gate from the profile (no DB access yet).
    candidates = _candidate_columns(profile)
    if not candidates:
        return None

    # 2) Raw sample, then 3) keep only genuinely long text.
    samples = _get_samples(profile, ctx, candidates)
    survivors = _confirm_long_text(samples) if samples else None
    if not survivors:
        return None

    # Glossary terms are registered only once the chapter is known to apply.
    glossary = ctx.get("glossary")
    mark_terms = isinstance(glossary, model.GlossaryCollector)
    if mark_terms:
        for key, (label, definition) in _TERMS.items():
            glossary.add(key, label, definition)

    blocks = list(_intro_blocks(len(survivors), mark_terms))

    rendered = list(survivors.items())[:_MAX_TEXT_COLS]
    for position, (column, length_stats) in enumerate(rendered):
        column_texts = samples.get(column) or []
        blocks.append(
            _column_group(column, column_texts, length_stats, position,
                          mark_terms))

    omitted = len(survivors) - len(rendered)
    if omitted > 0:
        blocks.append(model.Note(
            f"Se muestran las primeras {len(rendered)} columnas de texto; "
            f"quedan {omitted} sin mostrar para mantener acotado el informe."))

    return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
                         version=CHAPTER_VERSION, blocks=blocks)
|
||||
@@ -0,0 +1,256 @@
|
||||
"""Tests for the TEXT DISTR chapter — DoD: golden + edges + degradation.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles and feeds the raw text sample
|
||||
in-memory through ``ctx['text_raw']`` (no DuckDB needed), so the suite is fast
|
||||
and deterministic. Verifies that ``build_text_distr``:
|
||||
|
||||
- GOLDEN: with a long-text column, emits the chapter with its key blocks
|
||||
(length summary, word histogram, top-terms table, n-gram tables, language
|
||||
bars) and registers the clickable glossary terms; and that it renders inside
|
||||
the full document to both PDF and PPTX showing that content.
|
||||
- EDGE (None): a dataset whose only string column is short labels (titanic-like
|
||||
``Name``) yields ``None`` without raising — the existing report is untouched.
|
||||
- EDGE (None): a column that passes the cheap char gate but whose documents are
|
||||
short (median words below the threshold) is rejected at the confirmation step.
|
||||
- DEGRADATION: with ``langdetect`` / ``textstat`` / ``wordcloud`` unavailable,
|
||||
the chapter still builds (those pieces are omitted) and never raises.
|
||||
"""
|
||||
|
||||
import builtins
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import (
|
||||
DataTable, Figure, GlossaryCollector, Group, Heading, KVTable, Markdown,
|
||||
Note,
|
||||
)
|
||||
from datascience.automatic_eda.chapters.text_distr import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_text_distr,
|
||||
)
|
||||
from datascience.automatic_eda.chapters_registry import build_document
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Synthetic corpus + profiles.
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Spanish review sentences; _long_reviews() joins two of them per document.
_ES = [
    "El producto llegó en perfecto estado y mucho antes de lo previsto por la tienda",
    "La calidad de los materiales es realmente excelente y se nota la diferencia al usarlo",
    "No me convenció del todo porque esperaba bastante más por el precio que pagué finalmente",
    "El servicio de atención al cliente fue rápido amable y resolvió mi problema sin demora",
    "Lo recomiendo totalmente ya que ha superado con creces todas mis expectativas iniciales",
]
# English counterparts, used by _long_reviews() for every third document.
_EN = [
    "The product arrived in perfect condition and much earlier than the store had promised me",
    "The build quality is genuinely outstanding and you can really feel the difference using it",
    "I was not fully convinced because I expected quite a lot more for the price i finally paid",
    "Customer support was fast friendly and solved my whole problem without any delay at all",
    "I highly recommend it since it has exceeded by far every one of my initial expectations",
]
|
||||
|
||||
|
||||
def _long_reviews(n=40) -> list:
    """Return ``n`` two-sentence reviews plus two exact duplicates.

    Every third document (index divisible by 3) is drawn from the English
    corpus, the rest from the Spanish one. The first two documents are then
    appended again so the corpus contains exact duplicates.
    """
    reviews = []
    for position in range(n):
        corpus = _EN if position % 3 == 0 else _ES  # mostly ES, some EN
        first = corpus[position % len(corpus)]
        second = corpus[(position + 2) % len(corpus)]
        reviews.append(f"{first}. {second}.")
    # Exact duplicates of the first two documents.
    reviews.append(reviews[0])
    reviews.append(reviews[1])
    return reviews
|
||||
|
||||
|
||||
def _text_profile() -> dict:
|
||||
"""Profile with a long free-text column (review) + a numeric + a short cat."""
|
||||
return {
|
||||
"table": "reviews",
|
||||
"source": "/data/reviews.duckdb",
|
||||
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||
"n_rows": 42,
|
||||
"n_cols": 3,
|
||||
"quality_score": 88.0,
|
||||
"columns": [
|
||||
{
|
||||
"name": "review",
|
||||
"inferred_type": "categorical",
|
||||
"categorical": {
|
||||
"top": [{"value": "x", "count": 2, "pct": 0.05}],
|
||||
"n_distinct": 40,
|
||||
"len_mean": 180.0,
|
||||
"len_min": 80,
|
||||
"len_max": 220,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "rating",
|
||||
"inferred_type": "numeric",
|
||||
"numeric": {"mean": 3.1, "median": 3.0, "std": 1.2,
|
||||
"min": 1, "max": 5},
|
||||
},
|
||||
{
|
||||
"name": "product",
|
||||
"inferred_type": "categorical",
|
||||
"categorical": {
|
||||
"top": [{"value": "teclado", "count": 10, "pct": 0.25}],
|
||||
"n_distinct": 6,
|
||||
"len_mean": 7.0,
|
||||
"len_min": 5, "len_max": 11,
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _no_text_profile() -> dict:
|
||||
"""titanic-like: the only string column is short labels (Name ≈ 27 chars)."""
|
||||
return {
|
||||
"table": "titanic",
|
||||
"n_rows": 891,
|
||||
"n_cols": 3,
|
||||
"columns": [
|
||||
{"name": "Age", "inferred_type": "numeric",
|
||||
"numeric": {"mean": 29.7, "median": 28.0, "std": 14.5}},
|
||||
{"name": "Name", "inferred_type": "categorical",
|
||||
"categorical": {"top": [{"value": "Braund, Mr. Owen Harris",
|
||||
"count": 1, "pct": 0.001}],
|
||||
"n_distinct": 891, "len_mean": 27.0,
|
||||
"len_min": 12, "len_max": 82}},
|
||||
{"name": "Sex", "inferred_type": "categorical",
|
||||
"categorical": {"top": [{"value": "male", "count": 577,
|
||||
"pct": 0.65}],
|
||||
"n_distinct": 2, "len_mean": 4.6,
|
||||
"len_min": 4, "len_max": 6}},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _flatten(blocks) -> list:
|
||||
"""Recursively flatten Group blocks so tests can inspect leaf blocks."""
|
||||
out = []
|
||||
for b in blocks:
|
||||
if isinstance(b, Group):
|
||||
out.extend(_flatten(b.blocks))
|
||||
else:
|
||||
out.append(b)
|
||||
return out
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_activa_con_texto():
    """Golden path: a long-text column activates the chapter with its key blocks."""
    collector = GlossaryCollector()
    chapter = build_text_distr(
        _text_profile(),
        {"text_raw": {"review": _long_reviews()}, "glossary": collector},
    )

    assert chapter is not None, "el capítulo debe activarse con una columna de texto largo"
    assert chapter.id == CHAPTER_ID
    assert chapter.version == CHAPTER_VERSION

    leaves = _flatten(chapter.blocks)
    kinds = [leaf.kind for leaf in leaves]
    # heading, summary (kv_table), histogram/bars (figure), top terms + n-grams.
    for expected_kind in ("heading", "kv_table", "figure", "data_table"):
        assert expected_kind in kinds

    # The first KV summary mentions the vocabulary metrics.
    summary = next(leaf for leaf in leaves if isinstance(leaf, KVTable))
    labels = " ".join(str(row[0]) for row in summary.rows)
    assert "TTR" in labels
    assert "Hapax" in labels or "hapax" in labels

    # A terms table and at least one bigram table are present.
    table_titles = [
        getattr(leaf, "title", "") or ""
        for leaf in leaves
        if isinstance(leaf, DataTable)
    ]
    assert any("Términos" in title for title in table_titles)
    assert any("Bigrama" in title for title in table_titles)

    # Glossary terms were registered on the collector.
    assert collector.has("ttr")
    assert collector.has("hapax")
|
||||
|
||||
|
||||
def test_golden_render_pdf_pptx():
    """Golden path: the chapter is in the document and shows up in PDF and PPTX text."""
    profile = _text_profile()
    ctx = {"text_raw": {"review": _long_reviews()},
           "dataset_name": "reviews"}
    ids = [chapter.id for chapter in build_document(profile, ctx)]
    assert "text_distr" in ids, f"text_distr ausente en {ids}"

    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "t.pdf")
        pptx_path = os.path.join(workdir, "t.pptx")
        # Each renderer gets its own meta dict.
        pdf_result = render_automatic_eda_pdf(
            profile, pdf_path, {"title": "EDA", "ctx": ctx})
        pptx_result = render_automatic_eda_pptx(
            profile, pptx_path, {"title": "EDA", "ctx": ctx})
        assert pdf_result.get("path") and os.path.exists(pdf_path)
        assert pptx_result.get("path") and os.path.exists(pptx_path)

        pdf_text = "\n".join(
            page.extract_text() or "" for page in PdfReader(pdf_path).pages)
        assert "Texto libre" in pdf_text or "TTR" in pdf_text

        slide_text = "\n".join(
            shape.text_frame.text
            for slide in Presentation(pptx_path).slides
            for shape in slide.shapes
            if shape.has_text_frame
        )
        assert "Texto libre" in slide_text or "TTR" in slide_text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edges — None.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_edge_none_sin_texto_largo():
    """Edge: a profile with only short label columns must not produce the chapter."""
    chapter = build_text_distr(_no_text_profile(), {})
    assert chapter is None
|
||||
|
||||
|
||||
def test_edge_none_palabras_cortas():
    """Edge: the profile looks like long text but the sampled documents are short."""
    # 30 identical documents of three words each.
    short_docs = ["palabra " * 3] * 30
    chapter = build_text_distr(_text_profile(),
                               {"text_raw": {"review": short_docs}})
    assert chapter is None
|
||||
|
||||
|
||||
def test_edge_none_empty_profile():
    """Edge: empty or missing profile/ctx yields None."""
    for profile, ctx in (({}, {}), (None, None)):
        assert build_text_distr(profile, ctx) is None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Degradation — optional libs absent.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_degradacion_sin_libs(monkeypatch):
    """Degradation: with the optional libraries unimportable the chapter still builds."""
    original_import = builtins.__import__
    blocked = {"langdetect", "textstat", "wordcloud", "datasketch"}

    def guarded_import(name, *args, **kwargs):
        # Reject a blocked package and any of its submodules.
        if name.partition(".")[0] in blocked:
            raise ImportError(f"simulado: {name}")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", guarded_import)

    chapter = build_text_distr(_text_profile(),
                               {"text_raw": {"review": _long_reviews()}})
    # The chapter is still produced and nothing raised.
    assert chapter is not None
    leaves = _flatten(chapter.blocks)
    for required_type in (KVTable, DataTable):
        assert any(isinstance(leaf, required_type) for leaf in leaves)
    # At least one note names a missing optional library.
    notes = " ".join(leaf.text for leaf in leaves if isinstance(leaf, Note))
    assert any(lib in notes for lib in ("langdetect", "textstat", "datasketch"))
|
||||
@@ -31,8 +31,12 @@ CHAPTER_ORDER = [
|
||||
"analisis_llm", # LLM interpretation — sits next to overview (user request)
|
||||
"num_distr", # numeric distributions
|
||||
"cat_distr", # categorical distributions
|
||||
"text_distr", # free-text / NLP distributions (non-tabular content)
|
||||
"calidad", # data quality
|
||||
"missingness", # missing-data patterns (co-occurrence of absences; MCAR/MAR)
|
||||
"outliers", # atypical values: univariate (Tukey/z) + multivariate (IsolationForest)
|
||||
"correlacion", # correlations / associations
|
||||
"relaciones", # key relations: declared/candidate PK + FK (inter/intra-table)
|
||||
"modelos", # cheap models (PCA/KMeans/outliers)
|
||||
"timeseries", # time-series analysis
|
||||
"geospatial", # geospatial
|
||||
|
||||
@@ -0,0 +1,253 @@
|
||||
"""Tests for the Markdown completeness appendix (report 2053).
|
||||
|
||||
The AutomaticEDA Markdown is the output meant to be *pasted into an LLM*, so it
|
||||
must carry EVERYTHING the engine computed — even the numbers the human-facing
|
||||
chapters (shared with the PDF/PPTX) drop for readability. ``render_md`` appends a
|
||||
full-data appendix built from ``meta['profile']`` that closes the six losses the
|
||||
evaluation found:
|
||||
|
||||
1. the complete association matrix (every pair, incl. correlation_ratio /
|
||||
cramers_v) — not just the top extremes;
|
||||
2. every numeric statistic for every numeric column (skew/kurtosis/percentiles);
|
||||
3. the concrete recommended re-expression;
|
||||
4. KMeans ``scores_by_k``;
|
||||
5. the normality test statistics;
|
||||
6. correct headers for bar/scree figure tables (not ``Desde/Hasta/Frecuencia``).
|
||||
|
||||
Self-contained: a synthetic profile, no DuckDB, no heavy renderer.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest # noqa: F401
|
||||
|
||||
# Directory containing this test file.
_HERE = os.path.dirname(os.path.abspath(__file__))
# Three levels up from this file; presumably the directory that holds the
# ``datascience`` package imported below — verify against the repo layout.
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))  # python/functions
# Prepend it once so the ``datascience`` imports that follow can resolve.
if _FUNCTIONS not in sys.path:
    sys.path.insert(0, _FUNCTIONS)
|
||||
|
||||
from datascience.automatic_eda import model # noqa: E402
|
||||
from datascience.automatic_eda.render_md_impl import ( # noqa: E402
|
||||
_bars_table,
|
||||
_is_histogram_caption,
|
||||
_profile_appendix,
|
||||
render_md,
|
||||
)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Synthetic profile fixtures.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _numeric(skew, kurtosis):
|
||||
"""A numeric stat block with every key the appendix serializes."""
|
||||
return {
|
||||
"count": 100, "min": 0.0, "max": 10.0, "mean": 5.0, "median": 5.0,
|
||||
"mode": 4.0, "std": 2.0, "variance": 4.0, "cv": 0.4,
|
||||
"p1": 0.1, "p5": 0.5, "p25": 2.5, "p50": 5.0, "p75": 7.5,
|
||||
"p95": 9.5, "p99": 9.9, "iqr": 5.0, "skew": skew, "kurtosis": kurtosis,
|
||||
"n_outliers": 1, "distribution_type": "normal",
|
||||
}
|
||||
|
||||
|
||||
def _profile():
    """Return a synthetic profile with 3 numeric and 2 categorical columns.

    It carries four association pairs, per-column re-expression advice, a
    KMeans block with ``scores_by_k`` and normality tests for columns A and C.
    """
    def pair(a, b, a_type, b_type, method, value, p_value, p_adjusted,
             significant):
        return {"a": a, "b": b, "a_type": a_type, "b_type": b_type,
                "method": method, "value": value,
                "p_value": p_value, "p_value_adjusted": p_adjusted,
                "significant": significant}

    def norm_test(stat, p, normal):
        return {"stat": stat, "p": p, "normal": normal}

    pairs = [
        pair("A", "B", "numeric", "numeric", "pearson/spearman", 0.8,
             1e-9, 2e-9, True),
        pair("A", "C", "numeric", "numeric", "pearson/spearman", -0.3,
             0.01, 0.02, True),
        pair("A", "Cat1", "numeric", "categorical", "correlation_ratio", 0.45,
             0.001, 0.002, True),
        # The single cat-cat pair the human chapter never shows.
        pair("Cat1", "Cat2", "categorical", "categorical", "cramers_v", 0.11,
             0.04, 0.05, False),
    ]

    columns = [
        {"name": "A", "count": 100, "numeric": _numeric(0.0, -1.2),
         "reexpression": {"recommended": "none", "ladder_power": 1.0,
                          "reason": "symmetric", "alternatives": []}},
        {"name": "B", "count": 100, "numeric": _numeric(4.77, 33.1),
         "reexpression": {"recommended": "log1p", "ladder_power": 0.0,
                          "reason": "skew 4.77 with zeros",
                          "alternatives": [{"transform": "yeo-johnson"},
                                           {"transform": "sqrt"}]}},
        {"name": "C", "count": 100, "numeric": _numeric(-0.6, 0.2)},
        {"name": "Cat1", "categorical": {"top": [], "mode": "x"}},
        {"name": "Cat2", "categorical": {"top": [], "mode": "y"}},
    ]

    kmeans = {
        "best_k": 3,
        "scores_by_k": [
            {"k": 2, "silhouette": 0.46, "inertia": 900.0},
            {"k": 3, "silhouette": 0.50, "inertia": 550.0},
            {"k": 4, "silhouette": 0.38, "inertia": 430.0},
        ],
        "cluster_sizes": [40, 35, 25],
    }

    normality = {
        "A": {"n": 100,
              "jarque_bera": norm_test(18.7, 8e-5, False),
              "dagostino": norm_test(18.1, 1e-4, False),
              "shapiro": norm_test(0.98, 7e-8, False),
              "is_normal": False},
        "C": {"n": 100,
              "jarque_bera": norm_test(2.1, 0.35, True),
              "dagostino": norm_test(1.9, 0.38, True),
              "shapiro": norm_test(0.99, 0.12, True),
              "is_normal": True},
    }

    return {
        "correlations": {
            "pairs": pairs,
            "multiple_testing": {"method": "bh", "n_tests": 4, "n_rejected": 3},
        },
        "columns": columns,
        "models": {"kmeans": kmeans, "normality": normality},
    }
|
||||
|
||||
|
||||
def _dummy_chapters():
    """Return a one-chapter document holding a single markdown block."""
    body = {"kind": "markdown", "text": "cuerpo del informe"}
    chapter = {"id": "intro", "title": "Intro", "blocks": [body]}
    return model.as_chapters([chapter])
|
||||
|
||||
|
||||
def _render(tmp_path, profile):
    """Render the dummy document with ``profile`` to Markdown and return its text.

    Writes ``out.md`` under ``tmp_path``, asserts that ``render_md`` reports
    that same path, and returns the file content decoded as UTF-8.
    """
    out = os.path.join(str(tmp_path), "out.md")
    res = render_md(_dummy_chapters(), out, {"title": "EDA — t", "profile": profile})
    assert res["path"] == out
    # Read inside a context manager so the handle is closed deterministically.
    with open(out, encoding="utf-8") as fh:
        return fh.read()
|
||||
|
||||
|
||||
def _table_rows(md, section_title):
|
||||
"""Count data rows of the first Markdown table under ``section_title``."""
|
||||
seg = md.split(section_title, 1)[1]
|
||||
rows, in_t, seen_sep = 0, False, False
|
||||
for ln in seg.splitlines():
|
||||
if ln.startswith("|"):
|
||||
in_t = True
|
||||
stripped = ln.replace("|", "").replace(" ", "")
|
||||
if stripped and set(stripped) == {"-"}:
|
||||
seen_sep = True
|
||||
continue
|
||||
if seen_sep:
|
||||
rows += 1
|
||||
elif in_t and not ln.strip():
|
||||
break
|
||||
return rows
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden: every datum the profile holds reaches the .md.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_appendix_lists_all_correlation_pairs(tmp_path):
    """The appendix lists every association pair, including the cat-cat one."""
    rendered = _render(tmp_path, _profile())
    assert "## Apéndice — Datos completos del perfil" in rendered
    # One table row per synthetic pair.
    assert _table_rows(rendered, "### Matriz de asociación") == 4
    # The Cramér's V pair and both non-Pearson methods are present.
    for needle in ("Cat1 ↔ Cat2", "cramers_v", "correlation_ratio"):
        assert needle in rendered
|
||||
|
||||
|
||||
def test_appendix_has_skew_kurtosis_for_every_numeric(tmp_path):
    """Every numeric column has a non-empty skew and kurtosis cell."""
    rendered = _render(tmp_path, _profile())
    after_title = rendered.split("### Estadísticos numéricos completos", 1)[1]
    section = after_title.split("###", 1)[0]
    table_lines = [line for line in section.splitlines() if line.startswith("|")]

    header = [cell.strip() for cell in table_lines[0].strip("|").split("|")]
    assert "skew" in header and "kurtosis" in header
    skew_idx = header.index("skew")
    kurt_idx = header.index("kurtosis")

    data_rows = table_lines[2:]  # skip header + separator
    assert len(data_rows) == 3  # exactly the 3 numeric columns
    for line in data_rows:
        cells = [cell.strip() for cell in line.strip("|").split("|")]
        assert cells[skew_idx] != "", f"missing skew in {cells[0]}"
        assert cells[kurt_idx] != "", f"missing kurtosis in {cells[0]}"
|
||||
|
||||
|
||||
def test_appendix_has_extended_percentiles(tmp_path):
    """The full numeric table header carries the extended percentiles."""
    md = _render(tmp_path, _profile())
    seg = md.split("### Estadísticos numéricos completos", 1)[1]
    # Locate the header as the first table line instead of a fixed line index,
    # so blank or intro lines after the section title cannot shift it.
    header_line = next(
        (ln for ln in seg.splitlines() if ln.startswith("|")), "")
    header = [h.strip() for h in header_line.strip("|").split("|")]
    for p in ("p1", "p5", "p25", "p75", "p95", "p99"):
        assert p in header, f"percentile {p} missing from describe header"
|
||||
|
||||
|
||||
def test_appendix_names_concrete_reexpression(tmp_path):
    """The appendix names the recommended transform and one of its alternatives."""
    rendered = _render(tmp_path, _profile())
    expected = (
        "### Re-expresión recomendada",
        "log1p",        # the concrete recommended transform
        "yeo-johnson",  # an alternative
    )
    for needle in expected:
        assert needle in rendered
|
||||
|
||||
|
||||
def test_appendix_has_kmeans_scores_by_k(tmp_path):
    """The KMeans selection table has one row per evaluated k (2, 3, 4)."""
    rendered = _render(tmp_path, _profile())
    assert "scores_by_k" in rendered
    row_count = _table_rows(rendered, "#### KMeans — selección de k")
    assert row_count == 3
|
||||
|
||||
|
||||
def test_appendix_has_normality_statistics(tmp_path):
    """The normality table exposes the test statistics for columns A and C."""
    rendered = _render(tmp_path, _profile())
    for column_title in ("JB stat", "Shapiro stat"):
        assert column_title in rendered
    row_count = _table_rows(rendered, "#### Tests de normalidad")
    assert row_count == 2
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Edge: a profile missing models / correlations degrades, never raises.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_lite_profile_without_models(tmp_path):
    """Without ``models`` the KMeans section is skipped but the rest is kept."""
    lite = _profile()
    del lite["models"]  # lite profile: no KMeans / normality
    rendered = _render(tmp_path, lite)
    assert "scores_by_k" not in rendered
    assert "Matriz de asociación" in rendered
    assert "## Apéndice" in rendered
|
||||
|
||||
|
||||
def test_profile_without_correlations(tmp_path):
    """Without ``correlations`` rendering still works and keeps the numeric table."""
    trimmed = _profile()
    del trimmed["correlations"]
    rendered = _render(tmp_path, trimmed)  # must not raise
    assert "Matriz de asociación" not in rendered
    assert "Estadísticos numéricos completos" in rendered
|
||||
|
||||
|
||||
def test_no_profile_means_no_appendix(tmp_path):
    """Without a ``profile`` in meta the rendered file has no appendix."""
    out = os.path.join(str(tmp_path), "noprof.md")
    res = render_md(_dummy_chapters(), out, {"title": "x"})
    assert res["path"] == out
    # Read inside a context manager so the handle is closed deterministically.
    with open(out, encoding="utf-8") as fh:
        content = fh.read()
    assert "## Apéndice" not in content
|
||||
|
||||
|
||||
def test_appendix_helper_is_defensive():
    """``_profile_appendix`` returns an empty string for empty-ish profiles."""
    for empty_profile in (None, {}, {"columns": []}):
        assert _profile_appendix(empty_profile) == ""
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Loss #6: bar/scree figure tables get a non-misleading header.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_histogram_caption_detection():
    """Histogram/distribution captions are detected; bar and scree captions are not."""
    for caption in ("Histograma de Age", "Distribución de Fare"):
        assert _is_histogram_caption(caption)
    for caption in ("Media de Survived por Sex",
                    "Varianza explicada (scree PCA)"):
        assert not _is_histogram_caption(caption)
|
||||
|
||||
|
||||
def test_bars_table_custom_header():
    """``_bars_table`` uses the histogram header by default and a custom one on request."""
    sample_bars = [(0.0, 1.0, 5.0), (1.0, 2.0, 3.0)]

    default_table = _bars_table(sample_bars)
    assert "| Desde | Hasta | Frecuencia |" in default_table

    custom_table = _bars_table(sample_bars, ("Inicio", "Fin", "Valor"))
    assert "| Inicio | Fin | Valor |" in custom_table
    assert "Frecuencia" not in custom_table
|
||||
@@ -139,10 +139,17 @@ class Group:
|
||||
it starts on a fresh page and flows (honest degradation, never cut). Use it to
|
||||
bind ``Heading`` + ``Markdown`` + ``Figure`` of one idea together (see the
|
||||
DISTR NUM / AGREGACION chapters).
|
||||
|
||||
When ``page_break_before`` is True the renderer additionally forces the group
|
||||
to *start* on a fresh page/slide (unless the current one is already empty), so
|
||||
a chapter can give each unit its own page — e.g. one categorical column per
|
||||
page (see CAT DISTR). It is purely additive: the default False keeps the plain
|
||||
keep-together behaviour for every existing chapter.
|
||||
"""
|
||||
|
||||
blocks: list = field(default_factory=list)
|
||||
title: Optional[str] = None
|
||||
page_break_before: bool = False
|
||||
kind: str = field(default="group", init=False)
|
||||
|
||||
|
||||
@@ -228,7 +235,9 @@ def as_block(obj: Any):
|
||||
return Note(text=_safe_str(obj.get("text")))
|
||||
if cls is Group:
|
||||
return Group(blocks=as_blocks(obj.get("blocks")),
|
||||
title=obj.get("title"))
|
||||
title=obj.get("title"),
|
||||
page_break_before=bool(
|
||||
obj.get("page_break_before", False)))
|
||||
if cls is GlossaryEntry:
|
||||
return GlossaryEntry(key=_safe_str(obj.get("key")),
|
||||
label=_safe_str(obj.get("label")),
|
||||
|
||||
@@ -0,0 +1,748 @@
|
||||
"""AutomaticEDA Markdown serializer — one self-contained file to paste to an LLM.
|
||||
|
||||
Same document model as the PDF/PPTX renderers (an ordered list of
|
||||
:class:`Chapter`, each a list of format-independent blocks) but emitted as plain
|
||||
**Markdown** instead of a binary. The goal is different from the other two
|
||||
renderers: a Markdown EDA is meant to be *pasted into an LLM*, so it prioritises
|
||||
TEXT and DATA over visuals. Tables become Markdown tables (every row dumped, no
|
||||
pagination — nothing is cut because there are no pages); a ``Figure`` becomes its
|
||||
caption plus, when possible, the underlying bar/histogram data as a Markdown
|
||||
table (an LLM cannot see the image); glossary term markers are stripped while
|
||||
``**bold**`` is kept (it is valid Markdown).
|
||||
|
||||
dict-no-throw (the ``eda`` group style): :func:`render_md` never raises. On a
|
||||
fatal error it returns ``{path: None, ...}`` with a ``note`` explaining why; a
|
||||
malformed block degrades to a readable note rather than crashing the document.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from . import model
|
||||
|
||||
# Glossary span markers (kept text, dropped markers). We intentionally do NOT use
|
||||
# ``text_layout.strip_inline_md`` for Markdown blocks because that also removes
|
||||
# ``**bold**`` — valid Markdown we want to preserve when pasting to an LLM.
|
||||
# Opening glossary marker, e.g. ``[[term:ttr]]``; the closing ``[[/term]]`` is a
# fixed string and is removed separately in ``_clean_terms``.
_TERM_OPEN_RE = re.compile(r"\[\[term:[A-Za-z0-9_]+\]\]")
# Upper bound on the rows ``_bars_table`` prints for one figure; the remainder
# is summarised in a trailing "… (N filas más)" line.
_MAX_BAR_ROWS = 100
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Small helpers.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _clean_terms(s) -> str:
    """Remove glossary markers from ``s``, leaving the wrapped text (and **bold**) intact."""
    text = model._safe_str(s)
    without_openers = _TERM_OPEN_RE.sub("", text)
    return without_openers.replace("[[/term]]", "")
|
||||
|
||||
|
||||
def _cell(v) -> str:
    """Turn a value into text that is safe inside one Markdown table cell.

    Line breaks (CRLF, CR or LF) become ``<br>`` so the value stays on one
    table row, and every pipe is escaped with a backslash so it cannot open a
    new column.
    """
    text = model._safe_str(v)
    # Normalise all line-ending flavours to LF first, then fold them.
    single_line = (
        text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")
    )
    return single_line.replace("|", "\\|")
|
||||
|
||||
|
||||
def _slug(text: str) -> str:
    """Build a heading anchor from ``text``.

    The text is trimmed and lowercased; alphanumerics are kept, spaces and
    hyphens become ``-``, every other character is dropped. Runs of hyphens
    collapse to one and leading/trailing hyphens are removed.
    """
    lowered = model._safe_str(text).strip().lower()
    mapped = "".join(
        ch if ch.isalnum() else "-"
        for ch in lowered
        if ch.isalnum() or ch in " -"
    )
    collapsed = re.sub(r"-{2,}", "-", mapped)
    return collapsed.strip("-")
|
||||
|
||||
|
||||
def _fmt_num(v) -> str:
|
||||
"""Compact number for the figure data tables (ints as ints, else 4 sig figs)."""
|
||||
try:
|
||||
f = float(v)
|
||||
except Exception: # noqa: BLE001
|
||||
return model._safe_str(v)
|
||||
if f != f: # NaN
|
||||
return "NaN"
|
||||
if f == int(f) and abs(f) < 1e15:
|
||||
return str(int(f))
|
||||
return f"{f:.4g}"
|
||||
|
||||
|
||||
def _fmt_int(v) -> str:
|
||||
try:
|
||||
return str(int(v))
|
||||
except Exception: # noqa: BLE001
|
||||
return model._safe_str(v)
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
from datetime import datetime, timezone
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Document header (title + metadata blockquote + numbered index).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _meta_block(meta: dict) -> list:
    """Return the header metadata lines, leaving out fields that are absent.

    Values are looked up in ``meta['ctx']`` first (when it is a dict) and then
    in ``meta`` itself. The generation timestamp and the engine line are always
    present.
    """
    nested = meta.get("ctx")
    ctx = nested if isinstance(nested, dict) else {}
    lines: list = []

    def push(label, value) -> None:
        # Skip None, blank text and the literal string "none".
        if value is None:
            return
        text = model._safe_str(value).strip()
        if not text or text.lower() == "none":
            return
        lines.append(f"**{label}:** {text}")

    for label, key in (("Dataset", "dataset_name"),
                       ("Fuente", "source_origin"),
                       ("Almacenamiento", "storage")):
        push(label, ctx.get(key) or meta.get(key))

    row_count = ctx.get("n_rows", meta.get("n_rows"))
    col_count = ctx.get("n_cols", meta.get("n_cols"))
    # Dimensions are only meaningful when both numbers are known.
    if not (row_count is None or col_count is None):
        lines.append(
            f"**Dimensiones:** {_fmt_int(row_count)} filas × {_fmt_int(col_count)} columnas")

    push("Generado", meta.get("generated_at") or _now_iso())
    lines.append(f"**Motor:** {model.ENGINE_NAME} v{model.ENGINE_VERSION}")
    return lines
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Per-block serializers. Each returns a Markdown string (no surrounding blanks;
|
||||
# the caller separates blocks with a blank line).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _md_heading(block) -> str:
    """Serialize a heading block as a Markdown heading line.

    Block level 1 maps to ``###`` because ``#`` and ``##`` are reserved for the
    document and chapter titles; deeper levels add one hash each, capped at six.
    A missing or falsy level counts as 1, and a negative level is clamped to 1
    so it can never produce a reserved (or empty) heading prefix. Glossary
    markers are stripped from the text.
    """
    level = int(getattr(block, "level", 1) or 1)
    level = max(level, 1)  # never climb above '###' into the reserved levels
    hashes = "#" * min(level + 2, 6)
    text = _clean_terms(getattr(block, "text", "")).strip()
    return f"{hashes} {text}"
|
||||
|
||||
|
||||
def _md_markdown(block) -> str:
    """Return the block text with glossary markers removed and trailing newlines trimmed."""
    raw_text = getattr(block, "text", "")
    # Only the markers are dropped; **bold** and the rest stay verbatim.
    cleaned = _clean_terms(raw_text)
    return cleaned.rstrip("\n")
|
||||
|
||||
|
||||
def _md_kv_table(block) -> str:
    """Serialize a key/value block as a two-column ``Campo``/``Valor`` table.

    An optional title is emitted in bold above the table. A row that cannot be
    indexed as ``row[0], row[1]`` is printed whole in the first column with an
    empty value.
    """
    out: list = []
    heading = getattr(block, "title", None)
    if heading:
        out += [f"**{_clean_terms(heading).strip()}**", ""]
    out += ["| Campo | Valor |", "| --- | --- |"]
    for entry in (getattr(block, "rows", []) or []):
        try:
            key, val = entry[0], entry[1]
        except Exception:  # noqa: BLE001
            key, val = entry, ""
        out.append(f"| {_cell(key)} | {_cell(val)} |")
    return "\n".join(out)
|
||||
|
||||
|
||||
def _md_data_table(block) -> str:
    """Serialize a data-table block as a Markdown table with every row.

    An optional bold title precedes the table and an optional italic note
    follows it. When the block has no header, ``col1..colN`` is synthesised
    from the widest row. Rows shorter than the header are padded with empty
    cells; cells beyond the header width are not emitted.
    """
    out: list = []
    caption = getattr(block, "title", None)
    if caption:
        out.extend((f"**{_clean_terms(caption).strip()}**", ""))

    columns = list(getattr(block, "header", []) or [])
    records = list(getattr(block, "rows", []) or [])
    if not columns:
        widest = max((len(record) for record in records), default=1)
        columns = [f"col{number}" for number in range(1, widest + 1)]
    width = len(columns)

    out.append("| " + " | ".join(_cell(column) for column in columns) + " |")
    out.append("| " + " | ".join("---" for _ in range(width)) + " |")
    for record in records:  # no pagination: every row is written
        padded = [
            _cell(record[position]) if position < len(record) else ""
            for position in range(width)
        ]
        out.append("| " + " | ".join(padded) + " |")

    footnote = getattr(block, "note", None)
    if footnote:
        out.extend(("", f"*{_clean_terms(footnote).strip()}*"))
    return "\n".join(out)
|
||||
|
||||
|
||||
def _bars_table(bars: list, header: tuple = ("Desde", "Hasta", "Frecuencia")) -> str:
    """Turn extracted bar data into a three-column Markdown table.

    ``bars`` holds ``(from, to, height)`` triples and ``header`` supplies the
    three column titles. The default titles suit histogram bins; for bar or
    scree charts the caller passes a neutral header (e.g. ``Inicio/Fin/Valor``)
    so the table does not mislabel values as frequencies — see report 2053,
    loss #6.

    At most ``_MAX_BAR_ROWS`` rows are listed; when more exist, an italic
    footer states how many were left out.
    """
    first, second, third = header
    visible = bars[:_MAX_BAR_ROWS]
    table = [f"| {first} | {second} | {third} |", "| --- | --- | --- |"]
    table.extend(
        f"| {_fmt_num(lo)} | {_fmt_num(hi)} | {_fmt_num(height)} |"
        for lo, hi, height in visible
    )
    rendered = "\n".join(table)
    hidden = len(bars) - len(visible)
    if hidden <= 0:
        return rendered
    return rendered + f"\n\n*… ({hidden} filas más)*"
|
||||
|
||||
|
||||
def _is_histogram_caption(caption: str) -> bool:
|
||||
"""True when a figure caption describes a histogram (genuine numeric bins).
|
||||
|
||||
Histograms are the only figures whose bars are real ``[Desde, Hasta)`` bins
|
||||
with a frequency count. Bar charts (means by group) and the PCA scree plot
|
||||
carry per-category / per-component values, not bins — they must not inherit
|
||||
the ``Desde/Hasta/Frecuencia`` header.
|
||||
"""
|
||||
c = (caption or "").lower()
|
||||
return "histograma" in c or "distribución" in c or "distribucion" in c
|
||||
|
||||
|
||||
def _extract_bars(fig) -> list:
|
||||
"""Collect (x_from, x_to, height) of the rectangular bars of a matplotlib fig.
|
||||
|
||||
Histogram / bar-chart bars are ``matplotlib.patches.Rectangle`` with positive
|
||||
width and height; spines, legends and zero-area artists are skipped. Never
|
||||
raises — returns ``[]`` on any problem.
|
||||
"""
|
||||
bars: list = []
|
||||
try:
|
||||
for ax in fig.get_axes():
|
||||
# Collect this axes' positive-area rectangles, then keep only the ones
|
||||
# that look like actual histogram/bar bins. Reference shapes that
|
||||
# matplotlib also stores in ``ax.patches`` — most notably the ``±1σ``
|
||||
# band drawn by ``axvspan`` (a single rectangle far wider than a bin)
|
||||
# and a lone Tukey boxplot box — would otherwise show up as fake
|
||||
# "bins". A histogram axes has several near-equal-width bars, so we
|
||||
# drop any rectangle whose width is more than twice the median width
|
||||
# of that axes' rectangles (the σ-band spans many bins; uniform bins
|
||||
# all sit at the median width and stay).
|
||||
ax_bars: list = []
|
||||
for patch in list(getattr(ax, "patches", []) or []):
|
||||
try:
|
||||
w = patch.get_width()
|
||||
h = patch.get_height()
|
||||
x = patch.get_x()
|
||||
except Exception: # noqa: BLE001 — not a Rectangle-like patch.
|
||||
continue
|
||||
if w and w > 0 and h and h > 0:
|
||||
ax_bars.append((x, x + w, h))
|
||||
if len(ax_bars) >= 3:
|
||||
widths = sorted(b[1] - b[0] for b in ax_bars)
|
||||
median_w = widths[len(widths) // 2]
|
||||
if median_w > 0:
|
||||
ax_bars = [b for b in ax_bars
|
||||
if (b[1] - b[0]) <= 2.0 * median_w]
|
||||
bars.extend(ax_bars)
|
||||
except Exception: # noqa: BLE001
|
||||
return []
|
||||
return bars
|
||||
|
||||
|
||||
def _md_figure(block, meta: dict, out_path: str, counter: list) -> str:
    """Serialize a Figure prioritising TEXT + DATA (an LLM cannot see the image).

    Emits the italic caption line, then — if the matplotlib figure has bars —
    a Markdown table of the underlying values. When ``meta['embed_figures']``
    is truthy the figure is also exported as a PNG beside the ``.md`` and a
    Markdown image link ``![alt](file.png)`` pointing at it is appended; this
    is off by default so the Markdown stays self-contained.

    Args:
        block: figure block; ``caption``, ``fig`` and ``make`` are read via
            ``getattr``. ``make`` is called only when ``fig`` is None.
        meta: render options; only ``embed_figures`` is read here.
        out_path: path of the ``.md`` file, used to place the PNG.
        counter: one-element list holding the document-wide figure counter.

    Returns:
        The segments joined by blank lines. Any failure while handling the
        figure degrades the output to whatever was collected so far (at least
        the caption line); nothing is raised.
    """
    caption = model._safe_str(getattr(block, "caption", "")).strip()
    parts = [f"*Figura: {caption}*" if caption else "*Figura*"]
    fig = None
    try:
        import matplotlib
        matplotlib.use("Agg")  # defensive: headless rasterization backend.
        fig = getattr(block, "fig", None)
        make = getattr(block, "make", None)
        if fig is None and callable(make):
            fig = make()
        if fig is not None:
            bars = _extract_bars(fig)
            if bars:
                # A histogram's bars are genuine numeric bins (Desde/Hasta/
                # Frecuencia). Bar charts and the PCA scree plot are not bins —
                # give them a header that does not lie about "Frecuencia".
                header = (("Desde", "Hasta", "Frecuencia")
                          if _is_histogram_caption(caption)
                          else ("Inicio", "Fin", "Valor"))
                parts.append(_bars_table(bars, header))
            if meta.get("embed_figures"):
                png = _embed_png(fig, out_path, counter)
                if png:
                    # Link the exported PNG; an empty string here would leave
                    # the file on disk but unreferenced from the document.
                    parts.append(f"![{caption or 'Figura'}]({png})")
    except Exception:  # noqa: BLE001 — a bad figure degrades to just its caption.
        pass
    finally:
        # Release the figure from pyplot whether it was built here or supplied.
        if fig is not None:
            try:
                import matplotlib.pyplot as plt
                plt.close(fig)
            except Exception:  # noqa: BLE001
                pass
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _embed_png(fig, out_path: str, counter: list) -> str:
|
||||
"""Export the figure to ``<basename>_figN.png`` beside the .md; return its name."""
|
||||
try:
|
||||
counter[0] += 1
|
||||
base = os.path.splitext(os.path.basename(out_path))[0] or "figura"
|
||||
name = f"{base}_fig{counter[0]}.png"
|
||||
path = os.path.join(os.path.dirname(os.path.abspath(out_path)), name)
|
||||
fig.savefig(path, format="png", dpi=120, bbox_inches="tight")
|
||||
return name
|
||||
except Exception: # noqa: BLE001
|
||||
return ""
|
||||
|
||||
|
||||
def _md_image(block) -> str:
    """Serialize an image block as a Markdown image link plus italic caption.

    The link is ``![alt](path)``, where ``alt`` is the caption (``"Imagen"``
    when there is none). The caption, when present, follows as its own italic
    paragraph. A block with a blank path emits no link, so a block with neither
    path nor caption serializes to ``""``.
    """
    path = model._safe_str(getattr(block, "path", ""))
    caption = model._safe_str(getattr(block, "caption", "")).strip()
    parts: list = []
    if path.strip():
        # Reference the image file; an empty link would silently lose it.
        parts.append(f"![{caption or 'Imagen'}]({path})")
    if caption:
        parts.append(f"*{caption}*")
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _md_caption(block) -> str:
    """Render a caption block as a single italic Markdown span."""
    body = _clean_terms(getattr(block, "text", "")).strip()
    return f"*{body}*"
|
||||
|
||||
|
||||
def _md_note(block) -> str:
    """Render a note block as a Markdown blockquote.

    Every line of the cleaned, stripped text is prefixed with ``"> "``; blank
    lines become a bare ``">"`` so the quote is not broken in two.
    """
    cleaned = _clean_terms(getattr(block, "text", "")).strip()
    quoted: list = []
    for line in cleaned.split("\n"):
        if line.strip():
            quoted.append(f"> {line}")
        else:
            quoted.append(">")
    return "\n".join(quoted)
|
||||
|
||||
|
||||
def _md_group(block, meta: dict, out_path: str, counter: list) -> str:
    """Serialize a group: optional ``###`` title followed by its child blocks.

    Children go through ``_serialize_block``. A child that raises or produces
    an empty string is left out. Segments are separated by blank lines.
    """
    heading = getattr(block, "title", None)
    parts: list = [f"### {_clean_terms(heading).strip()}"] if heading else []
    for child in getattr(block, "blocks", []) or []:
        try:
            rendered = _serialize_block(child, meta, out_path, counter)
        except Exception:  # noqa: BLE001 — a broken child must not sink the group.
            continue
        if rendered:
            parts.append(rendered)
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _md_glossary_entry(block) -> str:
    """Render a glossary entry as a ``###`` heading plus its definition.

    The heading uses the entry's ``label``, falling back to its ``key`` when
    the label is blank. The definition paragraph is added only when non-empty.
    """
    heading = model._safe_str(getattr(block, "label", "")).strip()
    if not heading:
        heading = model._safe_str(getattr(block, "key", "")).strip()
    meaning = _clean_terms(getattr(block, "definition", "")).strip()
    pieces = [f"### {heading}"]
    if meaning:
        pieces.append(meaning)
    return "\n\n".join(pieces)
|
||||
|
||||
|
||||
def _serialize_block(block, meta: dict, out_path: str, counter: list) -> str:
    """Route one block to the Markdown serializer matching its ``kind``.

    ``figure`` and ``group`` serializers also receive the render context
    (``meta``, ``out_path``, ``counter``); the others take the block alone. A
    block whose kind matches nothing is stringified and emitted as a note.
    """
    kind = getattr(block, "kind", "")
    # (kind, serializer, needs render context) — tried in order, compared by ==.
    handlers = (
        ("heading", _md_heading, False),
        ("markdown", _md_markdown, False),
        ("kv_table", _md_kv_table, False),
        ("data_table", _md_data_table, False),
        ("figure", _md_figure, True),
        ("image", _md_image, False),
        ("caption", _md_caption, False),
        ("note", _md_note, False),
        ("group", _md_group, True),
        ("glossary_entry", _md_glossary_entry, False),
    )
    for name, serializer, needs_ctx in handlers:
        if kind == name:
            if needs_ctx:
                return serializer(block, meta, out_path, counter)
            return serializer(block)
    # Unknown content -> readable note (mirrors the model's defensive coercion).
    return _md_note(model.Note(text=model._safe_str(block)))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Profile appendix — the data the human-facing chapters drop.
|
||||
#
|
||||
# The chapter document (shared with the PDF/PPTX renderers) is designed for human
|
||||
# reading and intentionally omits raw numbers: the correlation matrix shows only
|
||||
# the top extremes, the numeric blocks skip skew/kurtosis/extended percentiles,
|
||||
# the model chapter does not list ``scores_by_k`` or the normality test
|
||||
# statistics. But the Markdown is meant to be *pasted into an LLM*, so it should
|
||||
# carry EVERYTHING the engine computed. This appendix serializes the full
|
||||
# ``profile`` (passed via ``meta['profile']``) as Markdown tables, additively:
|
||||
# the PDF/PPTX are untouched, the .md simply has more than they do. Each section
|
||||
# is emitted only when its source data is present, so a ``lite`` profile (no
|
||||
# models) or a profile without correlations degrades cleanly instead of raising.
|
||||
# See report 2053 for the six losses this closes.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _pair_types(a_type, b_type) -> str:
    """Build the short ``num↔cat`` style label for a pair of variable types.

    Each type is lower-cased; names starting with ``num`` or ``cat`` collapse
    to that prefix, other names are kept as-is and an empty one becomes ``?``.
    """

    def abbreviate(raw) -> str:
        text = model._safe_str(raw).lower()
        for prefix in ("num", "cat"):
            if text.startswith(prefix):
                return prefix
        return text if text else "?"

    left = abbreviate(a_type)
    right = abbreviate(b_type)
    return f"{left}↔{right}"
|
||||
|
||||
|
||||
def _app_correlations(corr: dict) -> str:
    """Loss #1 — every association pair (not just the top extremes).

    Dumps all of ``correlations['pairs']`` as a table (pair · types · method ·
    value · p · p-FDR · significant), ordered by |value| descending so the
    strongest associations lead while nothing is cut.

    Robustness notes:
      * Pairs whose value is missing, non-numeric or NaN get the sort key of a
        zero association and so sort last. A NaN key would otherwise make the
        ordering undefined, because every comparison against NaN is False.
      * Entries of ``pairs`` that are not dicts are skipped instead of
        aborting the whole section.
      * A ``multiple_testing`` value that is not a dict is ignored.

    Returns ``""`` when there is no usable pair. A closing italic note states
    the number of pairs and, when both counts are available, how many tests
    stayed significant after the multiple-testing correction.
    """
    import math

    pairs = [p for p in (corr.get("pairs", []) or []) if isinstance(p, dict)]
    if not pairs:
        return ""

    def sort_key(pair) -> float:
        try:
            magnitude = abs(float(pair.get("value")))
        except (TypeError, ValueError, OverflowError):
            return 0.0
        # NaN cannot be ordered; rank it with the weakest associations.
        return 0.0 if math.isnan(magnitude) else -magnitude

    lines = ["### Matriz de asociación — todos los pares",
             "",
             ("| Par | Tipos | Método | Valor | p-value | p-ajustado (FDR) "
              "| ¿Sig? |"),
             "| --- | --- | --- | --- | --- | --- | --- |"]
    for p in sorted(pairs, key=sort_key):
        par = f"{_cell(p.get('a'))} ↔ {_cell(p.get('b'))}"
        types = _pair_types(p.get("a_type"), p.get("b_type"))
        method = _cell(p.get("method"))
        val = _fmt_num(p.get("value"))
        pv = _fmt_num(p.get("p_value")) if p.get("p_value") is not None else ""
        padj = (_fmt_num(p.get("p_value_adjusted"))
                if p.get("p_value_adjusted") is not None else "")
        sig = "sí" if p.get("significant") else "no"
        lines.append(
            f"| {par} | {types} | {method} | {val} | {pv} | {padj} | {sig} |")

    mt = corr.get("multiple_testing") or {}
    if not isinstance(mt, dict):
        mt = {}
    n_tests = mt.get("n_tests", corr.get("n_tests"))
    n_rej = mt.get("n_rejected")
    note_bits = [f"{len(pairs)} pares en total"]
    if n_tests is not None and n_rej is not None:
        note_bits.append(
            f"{n_rej} de {n_tests} significativos tras corrección "
            f"{model._safe_str(mt.get('method', 'FDR')).upper()}")
    lines.append("")
    lines.append(f"*{'; '.join(note_bits)}.*")
    return "\n".join(lines)
|
||||
|
||||
|
||||
# Numeric statistics, in serialization order: (profile key, column header).
|
||||
_NUM_STATS = [
|
||||
("count", "n"), ("mean", "mean"), ("median", "median"), ("mode", "mode"),
|
||||
("std", "std"), ("variance", "variance"), ("cv", "cv"),
|
||||
("skew", "skew"), ("kurtosis", "kurtosis"),
|
||||
("min", "min"), ("p1", "p1"), ("p5", "p5"), ("p25", "p25"), ("p50", "p50"),
|
||||
("p75", "p75"), ("p95", "p95"), ("p99", "p99"), ("iqr", "iqr"),
|
||||
("max", "max"), ("n_outliers", "outliers"),
|
||||
("distribution_type", "distribución"),
|
||||
]
|
||||
|
||||
|
||||
def _app_numeric_describe(columns: list) -> str:
    """Loss #2 — the full numeric describe, one table row per numeric column.

    Only dict entries with a truthy ``numeric`` mapping contribute a row. The
    cells follow ``_NUM_STATS`` order. ``count`` falls back to the column-level
    ``count`` when the numeric mapping lacks it; ``distribution_type`` is text
    and goes through ``_cell``; every other statistic is formatted with
    ``_fmt_num``, or left blank when it is None. Returns ``""`` when no column
    qualifies.
    """
    table_rows: list = []
    for info in (columns or []):
        stats = info.get("numeric") if isinstance(info, dict) else None
        if not stats:
            continue
        row = [_cell(info.get("name"))]
        for key, _label in _NUM_STATS:
            if key == "count":
                value = stats.get("count", info.get("count"))
            else:
                value = stats.get(key)
            if key == "distribution_type":
                row.append(_cell(value))
            elif value is None:
                row.append("")
            else:
                row.append(_fmt_num(value))
        table_rows.append(row)
    if not table_rows:
        return ""

    titles = ["Columna"] + [label for _key, label in _NUM_STATS]
    out = [
        "### Estadísticos numéricos completos (describe)",
        "",
        "| " + " | ".join(titles) + " |",
        "| " + " | ".join(["---"] * len(titles)) + " |",
    ]
    out.extend("| " + " | ".join(row) + " |" for row in table_rows)
    return "\n".join(out)
|
||||
|
||||
|
||||
def _app_reexpression(columns: list) -> str:
    """Loss #3 — the concrete re-expression recommended for each column.

    One row per column whose ``reexpression`` dict names a non-blank
    ``recommended`` transform: column, transform, ladder power (blank when
    None), reason, and the ``transform`` names of the alternatives joined by
    commas (``—`` when there are none). Returns ``""`` when no column has one.
    """
    body: list = []
    for info in (columns or []):
        rx = info.get("reexpression") if isinstance(info, dict) else None
        if not (rx and isinstance(rx, dict)):
            continue
        recommended = model._safe_str(rx.get("recommended")).strip()
        if not recommended:
            continue
        alt_names = [
            model._safe_str(alt.get("transform"))
            for alt in (rx.get("alternatives") or [])
            if isinstance(alt, dict) and alt.get("transform")
        ]
        alternatives = ", ".join(alt_names) or "—"
        power = rx.get("ladder_power")
        body.append([
            _cell(info.get("name")),
            _cell(recommended),
            "" if power is None else _fmt_num(power),
            _cell(rx.get("reason")),
            _cell(alternatives),
        ])
    if not body:
        return ""

    out = [
        "### Re-expresión recomendada (escalera de Tukey)",
        "",
        "| Columna | Recomendada | Potencia | Razón | Alternativas |",
        "| --- | --- | --- | --- | --- |",
    ]
    out.extend("| " + " | ".join(cells) + " |" for cells in body)
    return "\n".join(out)
|
||||
|
||||
|
||||
def _app_kmeans_scores(kmeans: dict) -> str:
    """Loss #4 — silhouette and inertia for every candidate ``k``.

    Reads ``scores_by_k`` (a list of dicts with ``k``, ``silhouette`` and
    ``inertia``) and marks with ``✓`` the row whose ``k`` equals ``best_k``.
    Non-dict entries are skipped. Returns ``""`` when ``scores_by_k`` is empty
    or missing.
    """
    entries = list(kmeans.get("scores_by_k", []) or [])
    if not entries:
        return ""
    winner = kmeans.get("best_k")
    out = [
        "#### KMeans — selección de k (`scores_by_k`)",
        "",
        "| k | Silhouette | Inercia | Elegido |",
        "| --- | --- | --- | --- |",
    ]
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        k = entry.get("k")
        mark = "✓" if (winner is not None and k == winner) else ""
        silhouette = _fmt_num(entry.get("silhouette"))
        inertia = _fmt_num(entry.get("inertia"))
        out.append(f"| {_fmt_num(k)} | {silhouette} | {inertia} | {mark} |")
    return "\n".join(out)
|
||||
|
||||
|
||||
def _app_normality(normality: dict) -> str:
    """Loss #5 — statistic and p-value of each normality test, per column.

    ``normality`` maps a column name to a result dict with optional
    ``jarque_bera`` / ``dagostino`` / ``shapiro`` sub-dicts (``stat``, ``p``),
    plus ``n`` and ``is_normal``. Non-dict results are skipped. Returns ``""``
    when the input is empty, not a dict, or yields no row.
    """
    if not (isinstance(normality, dict) and normality):
        return ""
    head = [
        "#### Tests de normalidad (estadístico + p-value)",
        "",
        ("| Columna | n | JB stat | JB p | D'Agostino stat | D'Agostino p "
         "| Shapiro stat | Shapiro p | ¿Normal? |"),
        "| --- | --- | --- | --- | --- | --- | --- | --- | --- |",
    ]
    body: list = []
    for column, result in normality.items():
        if not isinstance(result, dict):
            continue
        jarque = result.get("jarque_bera") or {}
        dagostino = result.get("dagostino") or {}
        shapiro = result.get("shapiro") or {}
        verdict = "sí" if result.get("is_normal") else "no"
        size = result.get("n")
        size_txt = _fmt_num(size) if size is not None else ""
        body.append(
            f"| {_cell(column)} | {size_txt} "
            f"| {_fmt_num(jarque.get('stat'))} | {_fmt_num(jarque.get('p'))} "
            f"| {_fmt_num(dagostino.get('stat'))} | {_fmt_num(dagostino.get('p'))} "
            f"| {_fmt_num(shapiro.get('stat'))} | {_fmt_num(shapiro.get('p'))} "
            f"| {verdict} |")
    if not body:
        return ""
    return "\n".join(head + body)
|
||||
|
||||
|
||||
def _profile_appendix(profile: dict) -> str:
    """Build the full-data appendix from a TableProfile dict (additive).

    Returns a Markdown ``## Apéndice`` section with one sub-table per loss the
    human chapters drop, or ``""`` when the profile carries none of them.

    Every section is built in isolation: a failure in one builder (or while
    reading its source data) skips that section only. The numeric describe and
    the re-expression table no longer share a ``try`` block, and neither do
    the KMeans and normality tables, so one broken section cannot silently take
    a healthy sibling down with it. Never raises.
    """
    if not isinstance(profile, dict):
        return ""

    def lookup(source, key, fallback):
        """Read ``source[key]``; falsy or unreadable values yield ``fallback``."""
        try:
            return source.get(key) or fallback
        except Exception:  # noqa: BLE001 — e.g. a value with ambiguous truthiness.
            return fallback

    def attempt(builder, payload) -> str:
        """Run one section builder; any failure yields an empty section."""
        try:
            return builder(payload) or ""
        except Exception:  # noqa: BLE001 — appendix sections are best-effort.
            return ""

    sections: list = []

    corr = lookup(profile, "correlations", {})
    if isinstance(corr, dict):
        sections.append(attempt(_app_correlations, corr))

    columns = lookup(profile, "columns", [])
    sections.append(attempt(_app_numeric_describe, columns))
    sections.append(attempt(_app_reexpression, columns))

    models = lookup(profile, "models", {})
    if isinstance(models, dict):
        model_segs = [
            seg for seg in (
                attempt(_app_kmeans_scores, lookup(models, "kmeans", {})),
                attempt(_app_normality, lookup(models, "normality", {})),
            ) if seg
        ]
        if model_segs:
            sections.append(
                "### Modelos — detalle\n\n" + "\n\n".join(model_segs))

    sections = [seg for seg in sections if seg]
    if not sections:
        return ""
    intro = ("Volcado completo de los datos que el motor computó y que los "
             "capítulos (pensados para lectura humana / PDF) resumen. "
             "Pensado para que un LLM reconstruya el análisis entero.")
    return ("## Apéndice — Datos completos del perfil\n\n"
            f"*{intro}*\n\n" + "\n\n".join(sections))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def render_md(chapters: list, out_path: str, meta: dict = None) -> dict:
    """Serialize a list of Chapters into one self-contained Markdown file.

    Document layout: ``# <title>``, an optional metadata blockquote, a numbered
    ``## Índice`` linking each chapter, then a ``---`` rule and a
    ``## N. <title>`` section per chapter with its serialized blocks, and
    finally the profile appendix when ``meta['profile']`` yields one. Designed
    to be pasted into an LLM.

    Args:
        chapters: a list of ``Chapter`` (dataclasses or dicts); normalized with
            ``model.as_chapters``.
        out_path: filesystem path for the ``.md`` (parent dirs are created by
            ``_write``).
        meta: optional dict. Keys read here: ``title`` and ``profile``; the
            dict is also handed to ``_meta_block`` and to the block serializers
            (which read ``embed_figures``).

    Returns:
        The dict produced by ``_write``: ``{path, n_chars, chapters, note}``.
        ``note`` carries the character count plus one message per degraded
        block or skipped appendix. Block serialization, the appendix and the
        file write are guarded; the remaining steps are not wrapped in a
        ``try`` here.
    """
    meta = meta or {}
    chapters = model.as_chapters(chapters)
    title = model._safe_str(meta.get("title")) or model.ENGINE_NAME

    # Edge: nothing to render -> a minimal but valid Markdown document.
    if not chapters:
        empty_doc = (f"# {title}\n\n"
                     "*(documento vacío — sin capítulos aplicables)*\n")
        return _write(out_path, empty_doc, [], "documento vacío")

    counter = [0]  # document-wide figure counter for unique PNG names.
    notes: list = []
    segments: list = [f"# {title}"]

    meta_lines = _meta_block(meta)
    if meta_lines:
        segments.append("\n".join(f"> {ln}" for ln in meta_lines))

    # Numbered index; each anchor is the slug of the ``N. <title>`` heading
    # emitted for that chapter further down.
    headings: list = []
    toc = ["## Índice"]
    for number, ch in enumerate(chapters, 1):
        ch_title = model._safe_str(ch.title)
        heading = f"{number}. {ch_title}"
        headings.append(heading)
        toc.append(f"{number}. [{ch_title}](#{_slug(heading)})")
    segments.append("\n".join(toc))

    chapters_meta: list = []
    for heading, ch in zip(headings, chapters):
        segments.append("---")
        segments.append(f"## {heading}")

        blocks = list(ch.blocks or [])
        # Omit a leading level-1 Heading that just repeats the chapter title.
        if blocks:
            lead = blocks[0]
            repeats_title = (
                getattr(lead, "kind", "") == "heading"
                and int(getattr(lead, "level", 1) or 1) == 1
                and _clean_terms(getattr(lead, "text", "")).strip()
                == model._safe_str(ch.title).strip()
            )
            if repeats_title:
                del blocks[0]

        for block in blocks:
            try:
                seg = _serialize_block(block, meta, out_path, counter)
            except Exception as e:  # noqa: BLE001
                # Degrade to a plain note and record why in the result note.
                seg = _md_note(model.Note(text=model._safe_str(block)))
                notes.append(
                    f"bloque '{getattr(block, 'kind', '?')}' del capítulo "
                    f"'{ch.id}' degradado: {e}")
            if seg:
                segments.append(seg)
        chapters_meta.append({"id": ch.id, "version": ch.version})

    # Full-data appendix: everything the profile holds that the chapters drop.
    # Emitted only when meta['profile'] produces content; never fatal.
    try:
        appendix = _profile_appendix(meta.get("profile"))
    except Exception as e:  # noqa: BLE001
        appendix = ""
        notes.append(f"apéndice de perfil omitido: {e}")
    if appendix:
        segments.extend(["---", appendix])

    content = "\n\n".join(segments) + "\n"
    note = f"{len(content)} caracteres"
    if notes:
        note += " · " + "; ".join(notes)
    return _write(out_path, content, chapters_meta, note)
|
||||
|
||||
|
||||
def _write(out_path: str, content: str, chapters_meta: list, note: str) -> dict:
|
||||
"""Write the Markdown to disk (creating parents). dict-no-throw."""
|
||||
try:
|
||||
parent = os.path.dirname(os.path.abspath(out_path))
|
||||
os.makedirs(parent, exist_ok=True)
|
||||
with open(out_path, "w", encoding="utf-8") as fh:
|
||||
fh.write(content)
|
||||
except Exception as e: # noqa: BLE001 — never raise from the writer.
|
||||
return {"path": None, "n_chars": 0, "chapters": [],
|
||||
"note": f"no se pudo escribir el Markdown: {e}"}
|
||||
return {"path": out_path, "n_chars": len(content),
|
||||
"chapters": chapters_meta, "note": note}
|
||||
@@ -675,6 +675,61 @@ def _measure_figure_like(block) -> float:
|
||||
return target_h + 0.04 + cap_h + _GAP
|
||||
|
||||
|
||||
def _measure_kv_table(block) -> float:
    """Estimate the rendered height of a KVTable (PDF renderer).

    Adds the optional title heading, then for every row the wrapped height of
    its VALUE cell plus ``_ROW_VPAD``, and finally ``_GAP``. Only the value
    column is wrapped; the label column is taken to stay on one line. This is
    meant to mirror ``_place_kv_table`` — keep the two in sync.
    """
    total = 0.0
    heading = getattr(block, "title", None)
    if heading:
        total += _measure_heading_text(heading, 2)

    label_width = 1.9
    value_chars = tl.chars_per_line(_USABLE_W - label_width - 0.1, _FS_BODY)
    line_h = tl.line_height_in(_FS_BODY)
    for entry in getattr(block, "rows", []) or []:
        try:
            cell = entry[1]
        except Exception:  # noqa: BLE001 — row without a value cell.
            cell = ""
        wrapped = tl.wrap(model._safe_str(cell), value_chars)
        total += line_h * len(wrapped) + _ROW_VPAD
    return total + _GAP
|
||||
|
||||
|
||||
def _measure_data_table(block) -> float:
    """Estimate the rendered height of a DataTable (PDF renderer).

    Sums the optional title heading, the wrapped header row, every wrapped data
    row (tallest cell of the row, via ``_col_widths`` / ``_wrap_row``), the
    optional wrapped note, and ``_GAP``. Meant to mirror ``_place_data_table``
    — keep the two in sync.
    """
    font = _FS_CELL
    columns = list(getattr(block, "header", []) or [])
    records = list(getattr(block, "rows", []) or [])
    widths = _col_widths(columns, records, font)
    line_h = tl.line_height_in(font)

    def row_height(cells) -> float:
        # A row is as tall as its most-wrapped cell, plus padding on both sides.
        wrapped = _wrap_row(cells, widths, font)
        return line_h * max((len(c) for c in wrapped), default=1) + _ROW_VPAD * 2

    total = 0.0
    heading = getattr(block, "title", None)
    if heading:
        total += _measure_heading_text(heading, 2)
    if columns:
        total += row_height(columns)
    for record in records:
        total += row_height(record)

    footnote = getattr(block, "note", None)
    if footnote:
        note_lines = tl.wrap(model._safe_str(footnote),
                             tl.chars_per_line(_USABLE_W, _FS_NOTE))
        total += tl.line_height_in(_FS_NOTE) * len(note_lines)
    return total + _GAP
|
||||
|
||||
|
||||
def _measure_block(st: _PdfState, block) -> float:
|
||||
kind = getattr(block, "kind", "")
|
||||
try:
|
||||
@@ -690,13 +745,9 @@ def _measure_block(st: _PdfState, block) -> float:
|
||||
tl.chars_per_line(_USABLE_W, _FS_NOTE))
|
||||
return tl.line_height_in(_FS_NOTE) * len(lines) + _GAP
|
||||
if kind == "kv_table":
|
||||
rows = getattr(block, "rows", []) or []
|
||||
return (tl.line_height_in(_FS_BODY) + _ROW_VPAD) * (len(rows) + 1) \
|
||||
+ _GAP
|
||||
return _measure_kv_table(block)
|
||||
if kind == "data_table":
|
||||
rows = getattr(block, "rows", []) or []
|
||||
return (tl.line_height_in(_FS_CELL) + _ROW_VPAD * 2) \
|
||||
* (len(rows) + 1) + _GAP
|
||||
return _measure_data_table(block)
|
||||
if kind == "group":
|
||||
return sum(_measure_block(st, b)
|
||||
for b in (getattr(block, "blocks", []) or []))
|
||||
@@ -735,6 +786,10 @@ def _place_group(st: _PdfState, block) -> None:
|
||||
blocks = getattr(block, "blocks", []) or []
|
||||
if not blocks:
|
||||
return
|
||||
# Opt-in page break: start this group on a fresh page unless the current one
|
||||
# is still empty (so a chapter can give each unit its own page).
|
||||
if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
|
||||
_new_page(st)
|
||||
avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
|
||||
_shrink_group_figures(st, blocks, avail_full)
|
||||
total = sum(_measure_block(st, b) for b in blocks)
|
||||
|
||||
@@ -625,6 +625,55 @@ def _measure_figure_like(block) -> float:
|
||||
return target_h + 0.05 + cap_h + _GAP
|
||||
|
||||
|
||||
def _measure_kv_table(block) -> float:
    """Estimate the rendered height of a KVTable (PPTX renderer).

    The table is measured as a two-column ``Campo``/``Valor`` data table with
    wrapped cells: optional title heading, header row, one wrapped row per
    entry, plus ``_GAP``. A row that cannot be split into label and value is
    stringified into the label column. Meant to mirror ``_place_kv_table`` —
    keep the two in sync.
    """
    total = 0.0
    heading = getattr(block, "title", None)
    if heading:
        total += _measure_heading_text(heading, 2)

    pairs: list = []
    for entry in getattr(block, "rows", []) or []:
        try:
            key, val = entry[0], entry[1]
        except Exception:  # noqa: BLE001 — not a (label, value) pair.
            key, val = str(entry), ""
        pairs.append([model._safe_str(key), model._safe_str(val)])

    columns = ["Campo", "Valor"]
    widths = _col_widths(columns, pairs)
    font = _FS_CELL
    total += _row_height_in(columns, widths, font)
    for pair in pairs:
        total += _row_height_in(pair, widths, font)
    return total + _GAP
|
||||
|
||||
|
||||
def _measure_data_table(block) -> float:
    """Estimate the rendered height of a DataTable (PPTX renderer).

    Sums the optional title heading, the wrapped header row, every wrapped data
    row, the optional note (its wrapped lines plus 0.05 in) and ``_GAP``. Meant
    to mirror ``_place_data_table`` — keep the two in sync.
    """
    columns = list(getattr(block, "header", []) or [])
    records = list(getattr(block, "rows", []) or [])
    font = _FS_CELL
    widths = _col_widths(columns, records)

    total = 0.0
    heading = getattr(block, "title", None)
    if heading:
        total += _measure_heading_text(heading, 2)
    if columns:
        total += _row_height_in(columns, widths, font)
    for record in records:
        total += _row_height_in(record, widths, font)

    footnote = getattr(block, "note", None)
    if footnote:
        note_lines = tl.wrap(model._safe_str(footnote),
                             tl.chars_per_line(_USABLE_W, _FS_NOTE))
        total += tl.line_height_in(_FS_NOTE) * len(note_lines) + 0.05
    return total + _GAP
|
||||
|
||||
|
||||
def _measure_block(st: _PptxState, block) -> float:
|
||||
kind = getattr(block, "kind", "")
|
||||
try:
|
||||
@@ -639,9 +688,10 @@ def _measure_block(st: _PptxState, block) -> float:
|
||||
lines = tl.wrap(getattr(block, "text", ""),
|
||||
tl.chars_per_line(_USABLE_W, _FS_NOTE))
|
||||
return tl.line_height_in(_FS_NOTE) * len(lines) + 0.05 + _GAP
|
||||
if kind in ("kv_table", "data_table"):
|
||||
rows = getattr(block, "rows", []) or []
|
||||
return (tl.line_height_in(_FS_CELL) + 0.10) * (len(rows) + 1) + _GAP
|
||||
if kind == "kv_table":
|
||||
return _measure_kv_table(block)
|
||||
if kind == "data_table":
|
||||
return _measure_data_table(block)
|
||||
if kind == "group":
|
||||
return sum(_measure_block(st, b)
|
||||
for b in (getattr(block, "blocks", []) or []))
|
||||
@@ -664,10 +714,14 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
|
||||
if getattr(b, "kind", "") not in ("figure", "image"))
|
||||
fig_overhead = tl.line_height_in(_FS_NOTE) + 0.05 + 0.05 + _GAP
|
||||
budget = avail_full - nonfig_h - 0.10 * len(fig_blocks)
|
||||
if budget <= 1.0:
|
||||
# Low thresholds: a 16:9 slide is short, so a content-heavy column (cardinality
|
||||
# table + top-k + chart) only fits if the chart is allowed to shrink small.
|
||||
# Prefer a small-but-present chart on the SAME slide over splitting the column
|
||||
# across slides (matches the PDF renderer's keep-together philosophy).
|
||||
if budget <= 0.6:
|
||||
return # not enough room to keep together; let it flow (degrade).
|
||||
per = budget / len(fig_blocks) - fig_overhead
|
||||
if per <= 0.8:
|
||||
if per <= 0.35:
|
||||
return
|
||||
for fb in fig_blocks:
|
||||
cur = getattr(fb, "height_in", None)
|
||||
@@ -675,12 +729,90 @@ def _shrink_group_figures(st: _PptxState, blocks: list, avail_full: float) -> No
|
||||
if isinstance(cur, (int, float)) and cur > 0 else per)
|
||||
|
||||
|
||||
# Minimum height (inches) reserved for a figure inside a keep-together group on
|
||||
# the short 16:9 slide. When a high-cardinality column's table(s) would otherwise
|
||||
# leave no room, the data table is trimmed (with an honest note) so the chart
|
||||
# stays on the SAME slide next to its table instead of spilling to the next one.
|
||||
_GROUP_MIN_FIG_H = 1.3
|
||||
|
||||
|
||||
def _trim_data_table_to_budget(block, budget: float):
    """Return a DataTable limited to the leading rows that fit in ``budget`` inches.

    The title and header are preserved, at least one row is always kept, and a
    note states how many of the table's rows are shown. The input block is
    never mutated: when every row already fits, the very same object is
    returned; otherwise a new ``model.DataTable`` is built.

    Args:
        block: Table-like block; ``header``, ``rows`` and ``title`` are read
            with ``getattr`` and default to empty / None.
        budget: Vertical room, in inches, available for the whole table.

    Returns:
        ``block`` itself when nothing has to be dropped, otherwise a trimmed
        ``model.DataTable`` carrying the truncation note.
    """
    title = getattr(block, "title", None)
    header = list(getattr(block, "header", []) or [])
    all_rows = list(getattr(block, "rows", []) or [])
    col_w = _col_widths(header, all_rows)

    # Height that cannot be trimmed away: optional title plus optional header.
    overhead = 0.0
    if title:
        overhead += _measure_heading_text(title, 2)
    if header:
        overhead += _row_height_in(header, col_w, _FS_CELL)
    note_height = tl.line_height_in(_FS_NOTE) + 0.05
    room = budget - overhead - note_height - _GAP

    shown = []
    consumed = 0.0
    for row in all_rows:
        row_h = _row_height_in(row, col_w, _FS_CELL)
        # The first row is always accepted, even when it alone overflows.
        if shown and consumed + row_h > room:
            break
        shown.append(row)
        consumed += row_h

    if len(shown) >= len(all_rows):
        return block  # nothing was dropped: hand back the original block.

    caption = (f"top {len(shown)} de {len(all_rows)} categorías mostradas "
               "(recortado para caber en el slide; el PDF muestra más)")
    return model.DataTable(header=header, rows=shown, title=title, note=caption)
|
||||
|
||||
|
||||
def _fit_group_blocks(st: _PptxState, blocks: list, avail_full: float) -> list:
    """Return the blocks of a keep-together group, trimmed to share one slide.

    ``_GROUP_MIN_FIG_H`` inches are reserved for the figure and the remaining
    room (after every non-figure, non-table block) is handed to the data
    tables, which are trimmed in order with ``_trim_data_table_to_budget``.
    The input list is returned untouched when the group has no figure, has no
    data table, or its tables already fit in that remaining room.

    Args:
        st: Renderer state, forwarded to ``_measure_block``.
        blocks: Blocks of the group, in render order.
        avail_full: Full content height of a slide, in inches.

    Returns:
        Either ``blocks`` itself or a new list with trimmed data tables.
    """
    figure_kinds = ("figure", "image")

    def kind_of(b):
        return getattr(b, "kind", "")

    tables = [b for b in blocks if kind_of(b) == "data_table"]
    if not tables or not any(kind_of(b) in figure_kinds for b in blocks):
        return blocks

    other_h = sum(
        _measure_block(st, b)
        for b in blocks
        if kind_of(b) not in ("figure", "image", "data_table")
    )
    tables_h = sum(_measure_block(st, t) for t in tables)
    remaining = avail_full - other_h - _GROUP_MIN_FIG_H
    if tables_h <= remaining:
        return blocks  # tables and a minimum-height figure already fit.

    fitted = []
    for b in blocks:
        if kind_of(b) == "data_table":
            # Each table gets what is left, never less than 0.8 in; the room
            # it actually uses is then charged against the following tables.
            b = _trim_data_table_to_budget(b, max(remaining, 0.8))
            remaining -= _measure_data_table(b)
        fitted.append(b)
    return fitted
|
||||
|
||||
|
||||
def _place_group(st: _PptxState, block) -> None:
|
||||
"""Render a keep-together Group: move it whole to the next slide if needed."""
|
||||
blocks = getattr(block, "blocks", []) or []
|
||||
if not blocks:
|
||||
return
|
||||
# Opt-in slide break: start this group on a fresh slide unless the current one
|
||||
# is still empty (so a chapter can give each unit its own slide).
|
||||
if getattr(block, "page_break_before", False) and st.y > _CONTENT_TOP + 1e-6:
|
||||
_new_slide(st, cont=True)
|
||||
avail_full = _CONTENT_BOTTOM - _CONTENT_TOP
|
||||
# Trim oversized tables first (keeps the chart on the same slide), then shrink
|
||||
# the figure to share the remaining room.
|
||||
blocks = _fit_group_blocks(st, blocks, avail_full)
|
||||
_shrink_group_figures(st, blocks, avail_full)
|
||||
total = sum(_measure_block(st, b) for b in blocks)
|
||||
if total <= avail_full:
|
||||
|
||||
@@ -0,0 +1,125 @@
|
||||
---
|
||||
id: build_boxplots_figure_py_datascience
|
||||
name: build_boxplots_figure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def build_boxplots_figure(boxes: list, title: str = \"\", max_boxes: int = 12) -> \"matplotlib.figure.Figure\""
|
||||
description: "Construye una unica figura matplotlib con boxplots de Tukey HORIZONTALES (uno por columna) usando ax.bxp: caja Q1-Q3, bigotes hasta 1.5*IQR, linea de mediana y puntos atipicos. Consume la salida de build_boxplot_stats (un dict box por columna, leido con .get) mas una lista opcional de outliers crudos por columna; si vienen los dibuja como puntos (showfliers), si no marca solo box[min]/box[max] cuando hay outliers de cola (igual que num_distr). Dibuja como mucho max_boxes cajas (las primeras, ya ordenadas por contaminacion por el caller) y avisa de la truncacion con (mostrando N de M). Backend Agg sin pyplot global; alto adaptativo al nº de cajas. Defensiva: omite entradas invalidas y NUNCA lanza — sin cajas validas devuelve una figura placeholder (sin boxplots). Es la version small-multiples del capitulo num_distr para responder que columnas tienen mas outliers de un vistazo."
|
||||
tags: [eda, outliers, boxplot, tukey, iqr, bxp, matplotlib, figure, visualization, small-multiples, datascience, impure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [matplotlib]
|
||||
example: |
|
||||
from datascience.build_boxplot_stats import build_boxplot_stats
|
||||
from datascience.build_boxplots_figure import build_boxplots_figure
|
||||
boxes = [
|
||||
{"name": "ingresos", "box": build_boxplot_stats({"min": 1.0, "max": 9e3,
|
||||
"p25": 1e3, "median": 2e3, "p75": 3e3, "n_outliers": 7}), "fliers": None},
|
||||
{"name": "edad", "box": build_boxplot_stats({"min": 0.0, "max": 99.0,
|
||||
"p25": 25.0, "median": 38.0, "p75": 52.0}), "fliers": None},
|
||||
]
|
||||
fig = build_boxplots_figure(boxes, title="Outliers por columna", max_boxes=12)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_returns_figure_with_axes"
|
||||
- "test_empty_list_returns_placeholder_figure"
|
||||
- "test_invalid_box_is_skipped_not_raised"
|
||||
- "test_all_invalid_returns_placeholder"
|
||||
- "test_raw_fliers_are_drawn"
|
||||
- "test_max_boxes_truncates_and_does_not_raise"
|
||||
test_file_path: "python/functions/datascience/build_boxplots_figure_test.py"
|
||||
file_path: "python/functions/datascience/build_boxplots_figure.py"
|
||||
params:
|
||||
- name: boxes
|
||||
desc: "Lista de dicts, cada uno {\"name\": str, \"box\": dict, \"fliers\": list|None}. box es EXACTAMENTE la salida de build_boxplot_stats (claves leidas con .get: q1, median, q3, whisker_lo, whisker_hi, min, max, has_low_outliers, has_high_outliers, lower_fence, upper_fence, n_outliers). fliers es la lista opcional de outliers crudos: si viene se dibuja como puntos; si es None/ausente solo se marcan los extremos box[min]/box[max] cuando hay outliers de cola. Entradas que no son dict, sin box dict, o sin q1/median/q3 se omiten. El caller las pasa ya ordenadas por contaminacion (la mayor primera)."
|
||||
- name: title
|
||||
desc: "Titulo de la figura (fig.suptitle, alineado a la izquierda). Vacio => sin titulo. Si len(boxes) > max_boxes se le anade una nota \"(mostrando N de M)\" para que la truncacion no sea silenciosa. Default \"\"."
|
||||
- name: max_boxes
|
||||
desc: "Numero maximo de cajas a dibujar (las primeras de la lista). Default 12. Un valor no entero o <= 0 cae a 12. Si la lista trae mas entradas, las sobrantes se descartan pero se reporta en el titulo con (mostrando N de M)."
|
||||
output: "Un matplotlib.figure.Figure (figsize 7.0 x alto adaptativo = max(2.0, 0.5*n + 1.0), dpi 150) con un unico Axes que apila boxplots horizontales de Tukey (ax.bxp, orientation=horizontal con fallback vert=False), uno por columna valida, de arriba a abajo en el orden recibido. Cada caja: relleno #9ec6df, borde/bigotes/caps #5b8aa6, mediana #2e8b57, atipicos #c0392b. Etiquetas del eje Y = nombres de columna; eje X etiquetado \"valor\". Outliers dibujados desde fliers crudos (showfliers) o, si faltan, marcados en box[min]/box[max] segun has_low/high_outliers. Si no queda ninguna caja valida (lista vacia o todas invalidas) devuelve una Figure placeholder con texto centrado \"(sin boxplots)\"; cualquier error inesperado se captura y devuelve una Figure con el mensaje de error. NUNCA lanza. El caller rasteriza/cierra la figura; la funcion no la muestra ni la guarda."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.build_boxplot_stats import build_boxplot_stats
|
||||
from datascience.build_boxplots_figure import build_boxplots_figure
|
||||
|
||||
# Un `box` por columna numérica, derivado del sub-bloque `numeric` del profile
|
||||
# (salida de describe_numeric). El caller los pasa ya ordenados por outlier_pct.
|
||||
boxes = [
|
||||
{
|
||||
"name": "ingresos",
|
||||
"box": build_boxplot_stats({
|
||||
"min": 1.0, "max": 9000.0,
|
||||
"p25": 1000.0, "median": 2000.0, "p75": 3000.0,
|
||||
"n_outliers": 7,
|
||||
}),
|
||||
"fliers": None, # valores crudos desconocidos -> se marca solo el extremo.
|
||||
},
|
||||
{
|
||||
"name": "edad",
|
||||
"box": build_boxplot_stats({
|
||||
"min": 0.0, "max": 99.0,
|
||||
"p25": 25.0, "median": 38.0, "p75": 52.0,
|
||||
}),
|
||||
"fliers": [88.0, 95.0, 99.0], # outliers crudos -> se dibujan como puntos.
|
||||
},
|
||||
]
|
||||
|
||||
fig = build_boxplots_figure(boxes, title="Outliers por columna", max_boxes=12)
|
||||
|
||||
# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
|
||||
fig.savefig("/tmp/boxplots.png")
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en el capítulo de outliers de un informe EDA cuando quieras comparar de un
|
||||
vistazo *qué columnas están más contaminadas por valores atípicos*: a diferencia
|
||||
de `num_distr` (que dibuja un histograma+boxplot por columna en figuras
|
||||
separadas), aquí apilas todos los boxplots horizontales en **una sola figura**
|
||||
(small multiples). Primero deriva el `box` de cada columna con
|
||||
`build_boxplot_stats`, ordénalas por `outlier_pct` descendente, envuélvelas como
|
||||
`{"name", "box", "fliers"}` y pásaselas. Si tienes los valores crudos fuera de
|
||||
las vallas, métele la lista `fliers` y se dibujarán como puntos; si no, la
|
||||
función marca solo los extremos `min`/`max` cuando hay cola.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
|
||||
y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
|
||||
para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
|
||||
es thread-safe; esta función construye el `Figure` directamente, así que es
|
||||
segura de llamar en bucle desde el renderer.
|
||||
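- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo
|
||||
guarda. Quien la consume debe rasterizarla y luego liberarla soltando la última
|
||||
referencia (p. ej. `fig.clear()` seguido de `del fig`) para no acumular memoria en lotes grandes. Como la figura se crea con la API orientada a objetos y no con `pyplot`, no queda registrada en `pyplot`, así que `matplotlib.pyplot.close(fig)` es inocuo pero por sí solo no libera nada.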
|
||||
- **`fliers` opcional, semántica distinta.** Si pasas la lista de outliers
|
||||
crudos se dibujan todos como puntos (`showfliers=True`). Si es `None`/ausente
|
||||
los valores son desconocidos y solo se marca un punto en `box["min"]` /
|
||||
`box["max"]` cuando `has_low_outliers` / `has_high_outliers` — mismo criterio
|
||||
que `num_distr`. No inventes fliers a partir del profile: el `box` no trae los
|
||||
valores crudos, solo si los extremos superan las vallas.
|
||||
- **API de orientación de `ax.bxp`.** matplotlib reciente usa
|
||||
`orientation="horizontal"`; las versiones antiguas usan `vert=False`. La
|
||||
función prueba la primera y cae a la segunda en `except TypeError`, así que
|
||||
funciona en ambas. Si `bxp` falla del todo, el Axes degrada a un texto
|
||||
"(boxplot no disponible)" en vez de propagar.
|
||||
- **Truncación visible.** `max_boxes` (default 12) limita el nº de cajas para que
|
||||
ninguna se solape; si la lista trae más, las sobrantes se descartan pero se
|
||||
avisa en el título con "(mostrando N de M)". Pasa las columnas ya ordenadas por
|
||||
contaminación para que las descartadas sean las menos relevantes.
|
||||
- **Defensiva, nunca lanza.** Lista vacía, entradas no-dict, sin `box`, o sin
|
||||
`q1`/`median`/`q3` se omiten sin propagar; sin cajas válidas devuelve un
|
||||
placeholder "(sin boxplots)" y cualquier error inesperado se captura en una
|
||||
figura con el texto del error. No envuelvas la llamada en try/except por miedo
|
||||
a un raise — no lo hay.
|
||||
@@ -0,0 +1,250 @@
|
||||
"""Impure EDA helper: a single figure of horizontal Tukey boxplots (`eda` group).
|
||||
|
||||
Draws, in one ``matplotlib.figure.Figure``, a stack of horizontal Tukey boxplots
|
||||
(one per column) using ``ax.bxp``: each carries its box (Q1–Q3), whiskers (up to
|
||||
1.5·IQR), the median line and its outlier points. It consumes the output of the
|
||||
pure registry function ``build_boxplot_stats`` (one ``box`` dict per column) plus
|
||||
an optional list of raw outlier values per column; it never recomputes anything.
|
||||
|
||||
It is the "small-multiples" companion of ``num_distr`` (which draws one
|
||||
histogram+boxplot per column): here every column shares a single figure so the
|
||||
caller can show, at a glance, *which* columns are the most contaminated by
|
||||
outliers (the caller passes them already ordered by contamination).
|
||||
|
||||
Impure because it touches matplotlib's rendering machinery. It uses the headless
|
||||
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
|
||||
global state and is safe to call repeatedly from a report renderer. It is fully
|
||||
defensive and NEVER raises: invalid entries are skipped and, if nothing valid
|
||||
remains, it returns a placeholder figure carrying a centered "(sin boxplots)".
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
# Blue palette shared with the ``num_distr`` chapter so the report stays coherent.
|
||||
_BOX_FACE = "#9ec6df" # box fill.
|
||||
_BOX_EDGE = "#5b8aa6" # box / whisker / cap border.
|
||||
_MEDIAN = "#2e8b57" # median line (sea green).
|
||||
_OUTLIER = "#c0392b" # outlier points (soft red).
|
||||
# Muted gray for the placeholder / fallback message text.
|
||||
_MUTED_TEXT = "#5f6b7a"
|
||||
# Soft red for the error fallback message.
|
||||
_ERROR_TEXT = "#b00020"
|
||||
|
||||
|
||||
def _num(value):
|
||||
"""Coerce ``value`` to float defensively; None for None/bool/non-numeric/NaN."""
|
||||
# bool is a subclass of int; a stat value is never a real bool, so treat
|
||||
# True/False as missing instead of silently coercing to 1.0/0.0.
|
||||
if value is None or isinstance(value, bool):
|
||||
return None
|
||||
try:
|
||||
f = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
if f != f: # NaN guard.
|
||||
return None
|
||||
return f
|
||||
|
||||
|
||||
def _placeholder_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
    """Build a 7.0 x 2.4 in, axis-less ``Figure`` showing ``message`` centered.

    Args:
        message: Text placed in the middle of the single Axes.
        color: Text color; defaults to the muted gray used for placeholders.

    Returns:
        The new ``Figure`` after ``tight_layout()`` has been applied.
    """
    text_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 12,
        "color": color,
        "wrap": True,
    }
    figure = Figure(figsize=(7.0, 2.4), dpi=150)
    canvas_ax = figure.add_subplot(111)
    canvas_ax.axis("off")
    # Axes-fraction coordinates: (0.5, 0.5) is the center of the Axes.
    canvas_ax.text(0.5, 0.5, message, transform=canvas_ax.transAxes, **text_style)
    figure.tight_layout()
    return figure
|
||||
|
||||
|
||||
def build_boxplots_figure(
    boxes: list,
    title: str = "",
    max_boxes: int = 12,
) -> "matplotlib.figure.Figure":
    """Build one figure of stacked horizontal Tukey boxplots (one per column).

    For each entry the function builds a ``bxp`` stats record (``med, q1, q3,
    whislo, whishi, fliers, label``) from its ``box`` sub-dict and draws all of
    them as horizontal boxplots sharing the X axis. The Y axis is inverted so
    the first entry received is drawn at the TOP and the last at the bottom
    (the caller is expected to pass them already sorted by contamination).

    Outliers are shown two ways:

    - If an entry carries a ``fliers`` list/tuple, its numeric values are drawn
      as points by ``ax.bxp`` (``showfliers`` is enabled when any entry has at
      least one usable flier).
    - Otherwise the raw values are unknown, so only the extremes are marked: a
      point at ``box["min"]`` when ``box["has_low_outliers"]`` is truthy and at
      ``box["max"]`` when ``box["has_high_outliers"]`` is truthy.

    The function never raises. Entries that are not dicts, lack a ``box`` dict,
    or miss any of ``q1``/``median``/``q3`` are skipped. If no valid box
    remains it returns a placeholder figure with "(sin boxplots)"; any
    unexpected error is caught and turned into a fallback figure carrying the
    error text.

    Args:
        boxes: List/tuple of dicts ``{"name", "box", "fliers"}``. ``box`` is
            read with ``.get`` for ``q1, median, q3, whisker_lo, whisker_hi,
            min, max, has_low_outliers, has_high_outliers``. Missing whiskers
            fall back to the corresponding quartile.
        title: Figure title (``fig.suptitle``, left aligned). Falsy => no
            title. Non-string truthy values are converted with ``str``. When
            ``len(boxes) > max_boxes`` a "(mostrando N de M)" note is appended.
        max_boxes: Only the first ``max_boxes`` entries are considered
            (default 12). A value ``int()`` cannot convert, or a value <= 0,
            falls back to 12.

    Returns:
        A ``matplotlib.figure.Figure`` with a single Axes (height grows with
        the number of boxes). It is neither shown nor saved here.
    """
    try:
        if not isinstance(boxes, (list, tuple)) or not boxes:
            return _placeholder_figure("(sin boxplots)")

        total = len(boxes)

        # Cap the number of boxes; tolerate an unusable max_boxes. OverflowError
        # covers int(float("inf")), which would otherwise abort the figure.
        try:
            cap = int(max_boxes)
        except (TypeError, ValueError, OverflowError):
            cap = 12
        if cap <= 0:
            cap = 12

        stats_list = []      # bxp stats records, in draw order.
        labels = []          # Y tick labels (column names).
        manual_markers = []  # (position, box) for entries without raw fliers.
        any_fliers = False   # whether to enable showfliers in the bxp call.

        for entry in boxes[:cap]:
            if not isinstance(entry, dict):
                continue
            box = entry.get("box")
            if not isinstance(box, dict):
                continue

            q1 = _num(box.get("q1"))
            med = _num(box.get("median"))
            q3 = _num(box.get("q3"))
            # Without the three quartiles a boxplot cannot be drawn: skip it.
            if q1 is None or med is None or q3 is None:
                continue

            # Whisker extremes fall back to the quartiles when missing.
            whislo = _num(box.get("whisker_lo"))
            whishi = _num(box.get("whisker_hi"))
            if whislo is None:
                whislo = q1
            if whishi is None:
                whishi = q3

            name = entry.get("name")
            label = "" if name is None else str(name)

            position = len(stats_list) + 1  # bxp positions are 1-indexed.
            fliers_raw = entry.get("fliers")
            if isinstance(fliers_raw, (list, tuple)):
                fliers = [v for v in map(_num, fliers_raw) if v is not None]
                if fliers:
                    any_fliers = True
            else:
                # Raw values unknown: draw no bxp fliers, mark min/max by hand.
                fliers = []
                manual_markers.append((position, box))

            stats_list.append({
                "med": med,
                "q1": q1,
                "q3": q3,
                "whislo": whislo,
                "whishi": whishi,
                "fliers": fliers,
                "label": label,
            })
            labels.append(label)

        if not stats_list:
            return _placeholder_figure("(sin boxplots)")

        n = len(stats_list)
        positions = list(range(1, n + 1))

        # Height grows with the box count so none of them overlap.
        height = max(2.0, 0.5 * n + 1.0)
        fig = Figure(figsize=(7.0, height), dpi=150)
        ax = fig.add_subplot(111)

        bxp_kw = dict(
            showfliers=any_fliers, widths=0.5, patch_artist=True,
            boxprops={"facecolor": _BOX_FACE, "edgecolor": _BOX_EDGE},
            medianprops={"color": _MEDIAN, "linewidth": 1.6},
            whiskerprops={"color": _BOX_EDGE},
            capprops={"color": _BOX_EDGE},
            flierprops={"marker": "o", "markersize": 3.5,
                        "markerfacecolor": _OUTLIER, "markeredgecolor": _OUTLIER,
                        "linestyle": "none"})
        try:
            # ``orientation`` is the current API; older matplotlib uses ``vert``.
            try:
                ax.bxp(stats_list, positions=positions,
                       orientation="horizontal", **bxp_kw)
            except TypeError:
                ax.bxp(stats_list, positions=positions, vert=False, **bxp_kw)
        except Exception:  # noqa: BLE001 — never let bxp kill the whole figure.
            ax.text(0.5, 0.5, "(boxplot no disponible)", ha="center",
                    va="center", fontsize=10, color=_MUTED_TEXT,
                    transform=ax.transAxes)

        # For entries without raw fliers, mark only the out-of-fence extremes.
        for position, box in manual_markers:
            mn = _num(box.get("min"))
            mx = _num(box.get("max"))
            if box.get("has_low_outliers") and mn is not None:
                ax.plot([mn], [position], marker="o", markersize=3.5,
                        color=_OUTLIER, zorder=5)
            if box.get("has_high_outliers") and mx is not None:
                ax.plot([mx], [position], marker="o", markersize=3.5,
                        color=_OUTLIER, zorder=5)

        # Pin the Y tick labels explicitly so they work across matplotlib
        # versions regardless of whether ``bxp`` consumed the ``label`` key.
        ax.set_yticks(positions)
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlabel("valor", fontsize=9)
        ax.tick_params(labelsize=7)
        ax.margins(y=0.15)
        # Position 1 sits at the bottom of a default Y axis; invert it so the
        # first (most contaminated) entry reads at the top, as documented.
        ax.invert_yaxis()
        for spine in ("top", "right"):
            ax.spines[spine].set_visible(False)

        # Surface truncation in the title instead of silently dropping boxes.
        note = f"(mostrando {n} de {total})" if total > cap else ""
        # str() keeps a truthy non-string title from breaking the join.
        title_text = str(title) if title else ""
        heading = " ".join(p for p in (title_text, note) if p)
        if heading:
            fig.suptitle(heading, fontsize=12, x=0.02, ha="left")

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _placeholder_figure(
            f"error al dibujar boxplots: {exc}", color=_ERROR_TEXT)
|
||||
@@ -0,0 +1,109 @@
|
||||
"""Tests para build_boxplots_figure (boxplots horizontales de Tukey, grupo eda).
|
||||
|
||||
Usa el backend Agg sin display; no muestra ni guarda figuras. Cada test cierra
|
||||
explícitamente la Figure construida (matplotlib.pyplot.close) para no acumular
|
||||
estado entre tests.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
from build_boxplots_figure import build_boxplots_figure
|
||||
|
||||
|
||||
def _box(name, q1, median, q3, mn, mx, low=False, high=False, fliers=None):
|
||||
"""Construye una entrada {name, box, fliers} con un box estilo build_boxplot_stats."""
|
||||
iqr = q3 - q1
|
||||
return {
|
||||
"name": name,
|
||||
"box": {
|
||||
"q1": q1,
|
||||
"median": median,
|
||||
"q3": q3,
|
||||
"iqr": iqr,
|
||||
"lower_fence": q1 - 1.5 * iqr,
|
||||
"upper_fence": q3 + 1.5 * iqr,
|
||||
"whisker_lo": max(mn, q1 - 1.5 * iqr),
|
||||
"whisker_hi": min(mx, q3 + 1.5 * iqr),
|
||||
"min": mn,
|
||||
"max": mx,
|
||||
"has_low_outliers": low,
|
||||
"has_high_outliers": high,
|
||||
"n_outliers": 0,
|
||||
},
|
||||
"fliers": fliers,
|
||||
}
|
||||
|
||||
|
||||
def test_returns_figure_with_axes():
    """Three valid entries yield a Figure whose first Axes has three Y ticks."""
    entries = [
        _box("edad", 10.0, 25.0, 40.0, 1.0, 100.0, high=True),
        _box("ingresos", 100.0, 200.0, 300.0, 50.0, 400.0),
        _box("score", -1.0, 0.0, 1.0, -5.0, 5.0, low=True, high=True),
    ]
    figure = build_boxplots_figure(entries, title="Boxplots", max_boxes=12)
    assert isinstance(figure, Figure)
    assert len(figure.axes) >= 1
    # One Y tick per drawn box.
    first_axes = figure.axes[0]
    assert len(first_axes.get_yticks()) == 3
    plt.close(figure)
|
||||
|
||||
|
||||
def test_empty_list_returns_placeholder_figure():
    """An empty list must yield the "(sin boxplots)" placeholder.

    Checking the text (not just the type) keeps the error-fallback figure,
    which is also a Figure with one Axes, from satisfying this test.
    """
    fig = build_boxplots_figure([], title="vacío")
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    assert [t.get_text() for t in fig.axes[0].texts] == ["(sin boxplots)"]
    plt.close(fig)
|
||||
|
||||
|
||||
def test_invalid_box_is_skipped_not_raised():
    """Invalid entries are dropped silently; only the valid box is drawn."""
    broken_quartiles = {"name": "rota", "box": {"q1": None, "median": None, "q3": None}}
    missing_box = {"name": "sin_box"}  # no "box" key at all.
    not_a_dict = "no_es_dict"          # entry that is not a dict.
    valid = _box("buena", 1.0, 2.0, 3.0, 0.0, 10.0, high=True)

    figure = build_boxplots_figure(
        [broken_quartiles, missing_box, not_a_dict, valid])
    assert isinstance(figure, Figure)
    # Only the valid box survives the filtering.
    first_axes = figure.axes[0]
    assert len(first_axes.get_yticks()) == 1
    plt.close(figure)
|
||||
|
||||
|
||||
def test_all_invalid_returns_placeholder():
    """When every entry is invalid the "(sin boxplots)" placeholder is returned.

    The placeholder text is asserted so an error-fallback figure cannot pass.
    """
    boxes = [
        {"name": "a", "box": {"q1": None, "median": 1.0, "q3": 2.0}},
        {"name": "b"},
    ]
    fig = build_boxplots_figure(boxes)
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    assert [t.get_text() for t in fig.axes[0].texts] == ["(sin boxplots)"]
    plt.close(fig)
|
||||
|
||||
|
||||
def test_raw_fliers_are_drawn():
    """Raw ``fliers`` values must actually appear as plotted points."""
    raw = [150.0, 180.0, 200.0]
    boxes = [
        _box("con_fliers", 10.0, 20.0, 30.0, 5.0, 200.0,
             high=True, fliers=raw),
    ]
    fig = build_boxplots_figure(boxes)
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    # Boxplots are horizontal, so values live on the X axis. The upper whisker
    # is clipped at 60.0, hence these values can only come from the fliers.
    drawn = {float(v) for line in fig.axes[0].lines for v in line.get_xdata()}
    assert set(raw) <= drawn
    plt.close(fig)
|
||||
|
||||
|
||||
def test_max_boxes_truncates_and_does_not_raise():
    """Only the first ``max_boxes`` boxes are drawn and the title says so."""
    boxes = [_box(f"c{i}", float(i), float(i + 1), float(i + 2),
                  float(i - 5), float(i + 10)) for i in range(20)]
    fig = build_boxplots_figure(boxes, title="muchos", max_boxes=5)
    assert isinstance(fig, Figure)
    ax = fig.axes[0]
    # Only the first 5 boxes are drawn.
    assert len(ax.get_yticks()) == 5
    # The truncation must be surfaced in the figure title, never silent.
    assert any("(mostrando 5 de 20)" in t.get_text() for t in fig.texts)
    plt.close(fig)
|
||||
@@ -0,0 +1,68 @@
|
||||
---
|
||||
name: classify_relationship_type
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def classify_relationship_type(xs: list, ys: list) -> dict"
|
||||
description: "Clasifica el TIPO de relacion entre dos variables numericas pareadas por indice para el EDA automatico del grupo eda. Limpia los pares de forma defensiva (descarta None/bool/NaN/inf), reusa pearson y spearman_corr del registry y ajusta polinomios de grado 2 y 3 con numpy.polyfit (R^2 manual), y a partir de esas senales etiqueta la forma: 'lineal', 'polinomica (grado 2/3)', 'monotona no-lineal' o 'debil/sin forma'. Orden de decision: debil -> monotona -> polinomica -> lineal (la primera que matchea gana), con umbrales calibrados para datos reales discretos/ruidosos. Devuelve ademas los coeficientes del mejor modelo en orden de numpy.polyval para pintar la curva de ajuste sobre el scatter. Funcion pura no-throw: ante datos insuficientes (menos de 5 pares validos o varianza ~0) o cualquier fallo devuelve el dict canonico con tipo='debil/sin forma' y el resto a None."
|
||||
tags: [eda, correlation, relationship, classification, polyfit, datascience, pure]
|
||||
params:
|
||||
- name: xs
|
||||
desc: "Lista (o tupla) de valores numericos de la primera variable, pareada por indice con ys. Cada par xs[i],ys[i] se descarta si cualquiera de los dos es None, bool, NaN o inf. Lectura defensiva."
|
||||
- name: ys
|
||||
desc: "Lista (o tupla) de valores numericos de la segunda variable, pareada por indice con xs. Mismas reglas de limpieza que xs."
|
||||
output: "Dict con SIEMPRE las mismas 8 claves: tipo (str: 'lineal' | 'polinómica (grado 2)' | 'polinómica (grado 3)' | 'monótona no-lineal' | 'débil/sin forma'); pearson (float|None: coeficiente de Pearson r); r2_linear (float|None: r**2 del ajuste lineal); spearman (float|None: rho de Spearman); r2_poly2 (float|None: R^2 del ajuste polinomico de grado 2); r2_poly3 (float|None: R^2 del ajuste de grado 3); best_degree (int|None: grado del modelo elegido — 1 lineal, 2/3 polinomico, None si monotona/debil); coeffs (list|None: coeficientes del mejor modelo en orden de numpy.polyval para pintar la curva, o None). Ante datos insuficientes o error: tipo='débil/sin forma' y el resto de claves a None."
|
||||
uses_functions: [pearson_py_datascience, spearman_corr_py_datascience]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [numpy]
|
||||
tested: true
|
||||
tests: ["test_lineal", "test_polinomica_cuadratica", "test_monotona_no_lineal", "test_monotona_exponencial", "test_debil_sin_forma", "test_lista_vacia_no_lanza", "test_longitudes_distintas_no_lanza", "test_todos_none_no_lanza", "test_entradas_none_no_lanza", "test_constante_no_lanza", "test_filtra_nan_inf_bool"]
|
||||
test_file_path: "python/functions/datascience/classify_relationship_type_test.py"
|
||||
file_path: "python/functions/datascience/classify_relationship_type.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.classify_relationship_type import classify_relationship_type
|
||||
import numpy as np
|
||||
|
||||
# Relacion claramente cuadratica (forma de parabola) sobre dominio simetrico.
|
||||
x = list(np.linspace(-10, 10, 60))
|
||||
y = [v * v for v in x]
|
||||
|
||||
res = classify_relationship_type(x, y)
|
||||
print(res["tipo"]) # 'polinómica (grado 2)'
|
||||
print(res["best_degree"]) # 2
|
||||
print(res["r2_linear"]) # 0.0 -> el Pearson lineal no ve la parabola
|
||||
print(res["r2_poly2"]) # 1.0
|
||||
print(res["coeffs"]) # [1.0, -0.0, -0.0] -> numpy.polyval(coeffs, x) ~ x**2
|
||||
|
||||
# El capitulo pinta la curva de ajuste cuando coeffs no es None:
|
||||
# if res["coeffs"] is not None:
|
||||
# xs_fit = np.linspace(min(x), max(x), 200)
|
||||
# ys_fit = np.polyval(res["coeffs"], xs_fit)
|
||||
# ax.plot(xs_fit, ys_fit) # curva sobre el ax.scatter(x, y)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
- Usala en el capitulo de relaciones/correlaciones del EDA automatico, despues de detectar dos columnas numericas con alguna asociacion, para decidir QUE curva de ajuste pintar sobre el scatter (recta, parabola, cubica o ninguna) y poner una etiqueta legible al tipo de relacion.
|
||||
- Cuando un Pearson bajo no signifique "sin relacion": esta funcion cruza Pearson con Spearman y con ajustes polinomicos para distinguir una relacion lineal debil de una monotona no-lineal (que el rango si capta) o de una curva polinomica.
|
||||
- Cuando necesites un punto de entrada determinista y no-throw que, con los mismos datos, devuelva siempre el mismo `tipo` y los mismos `coeffs` listos para `numpy.polyval` sin tener que ajustar modelos a mano en el capitulo.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Funcion pura, deterministica y no-throw: ante menos de 5 pares validos, varianza ~0 (xs o ys constante) o cualquier excepcion interna devuelve el dict canonico `tipo="débil/sin forma"` con el resto de claves a `None`. El dict SIEMPRE trae las 8 claves: nunca compruebes existencia, comprueba `None`.
|
||||
- El orden de decision importa: `débil -> monótona -> polinómica -> lineal` (la primera que matchee gana). La monotonia se evalua ANTES que el ajuste polinomico, asi que una curva monotona suave (exp, log, potencias) sale `monótona no-lineal` aunque un cubico tambien la ajuste — la dominancia del rango (Spearman >> Pearson) es la senal mas interpretable. Solo cae en `polinómica` una forma curva NO monotona (p.ej. una parabola, Spearman ~0 pero R^2 polinomico alto).
|
||||
- Umbrales fijos (calibrados para EDA con datos discretos/ruidosos, no para inferencia formal): `débil/sin forma` si las tres senales son bajas a la vez (`abs(pearson) < 0.3` y `abs(spearman) < 0.3` y `mejor_poly < 0.3`); `monótona no-lineal` si `abs(spearman) - abs(pearson) >= 0.1` y `abs(spearman) >= 0.4`; `polinómica (grado N)` si el mejor polinomico mejora `>= 0.1` sobre el lineal y su R^2 `>= 0.3`; en cualquier otro caso con senal (no debil) `lineal`. El suelo de 0.3 evita llamar "debil" a relaciones reales pero discretas (conteos, escalas ordinales) con R^2 bajo pero direccion clara.
|
||||
- `coeffs` va en orden de `numpy.polyval` (grado descendente). Para `lineal` es `[pendiente, intercepto]` (grado 1); para `polinómica` los del grado elegido; para `monótona no-lineal` y `débil/sin forma` es `None` (el scatter pintara una curva suavizada o nada — lo decide el capitulo, no esta funcion).
|
||||
- `best_degree` prefiere el grado 2 sobre el 3 cuando empatan dentro de 0.02 de R^2 (parsimonia): no esperes grado 3 salvo que mejore claramente.
|
||||
- Los pares con `None`, `bool`, `NaN` o `inf` se descartan por indice en silencio; `bool` cuenta como no-numerico (un `True` no es `1`). El dominio de los datos afecta al resultado: una parabola sobre un dominio simetrico da Pearson ~0 (sale `polinómica`), pero sobre un dominio asimetrico el Pearson sube y puede salir `lineal`.
|
||||
@@ -0,0 +1,187 @@
|
||||
"""Clasifica el TIPO de relacion entre dos variables numericas pareadas.
|
||||
|
||||
Funcion pura del grupo eda. Dadas dos listas numericas pareadas por indice,
|
||||
limpia los pares de forma defensiva, calcula correlaciones lineal (Pearson) y de
|
||||
rangos (Spearman) y ajustes polinomicos de grado 2 y 3, y a partir de esas
|
||||
senales etiqueta la forma de la relacion para el EDA automatico:
|
||||
|
||||
"lineal" | "polinómica (grado 2)" | "polinómica (grado 3)" |
|
||||
"monótona no-lineal" | "débil/sin forma"
|
||||
|
||||
Ademas devuelve los coeficientes del mejor modelo (en orden de numpy.polyval)
|
||||
para que el capitulo pinte la curva de ajuste sobre el scatter. Reusa las
|
||||
funciones del registry `pearson` y `spearman_corr` en vez de reimplementarlas.
|
||||
|
||||
NUNCA lanza: ante cualquier fallo o dato insuficiente devuelve el dict canonico
|
||||
con tipo="débil/sin forma" y el resto de claves a None.
|
||||
"""
|
||||
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from datascience.datascience import pearson
|
||||
from datascience.spearman_corr import spearman_corr
|
||||
|
||||
# Forma canonica de la respuesta cuando no se puede clasificar (datos
|
||||
# insuficientes, varianza nula o error interno). Siempre las mismas claves.
|
||||
_WEAK = {
|
||||
"tipo": "débil/sin forma",
|
||||
"pearson": None,
|
||||
"r2_linear": None,
|
||||
"spearman": None,
|
||||
"r2_poly2": None,
|
||||
"r2_poly3": None,
|
||||
"best_degree": None,
|
||||
"coeffs": None,
|
||||
}
|
||||
|
||||
|
||||
def _is_num(v) -> bool:
|
||||
"""True si v es un numero real finito (int/float, no bool, no NaN, no inf)."""
|
||||
return (
|
||||
isinstance(v, (int, float))
|
||||
and not isinstance(v, bool)
|
||||
and not (isinstance(v, float) and (math.isnan(v) or math.isinf(v)))
|
||||
)
|
||||
|
||||
|
||||
def _poly_r2(coeffs, x_arr, y_arr, ss_tot: float) -> float:
|
||||
"""R^2 de un ajuste polinomico: 1 - SS_res/SS_tot. 0 si SS_tot==0."""
|
||||
if ss_tot == 0.0:
|
||||
return 0.0
|
||||
pred = np.polyval(coeffs, x_arr)
|
||||
ss_res = float(np.sum((y_arr - pred) ** 2))
|
||||
return 1.0 - ss_res / ss_tot
|
||||
|
||||
|
||||
def classify_relationship_type(xs: list, ys: list) -> dict:
    """Classify the shape of the relationship between two paired numeric series.

    ``xs[i]`` and ``ys[i]`` are paired by index (``zip`` truncates to the
    shorter input) and a pair is dropped when either member fails ``_is_num``.
    On the remaining pairs the function computes Pearson r
    (``r2_linear = r ** 2``), Spearman rho and the R^2 of degree-2 and
    degree-3 ``numpy.polyfit`` fits, then picks a label.

    Labels are evaluated in this order and the first match wins:

    1. ``"débil/sin forma"``: all three signals are low at once:
       ``abs(pearson) < 0.3`` and ``abs(spearman) < 0.3`` and the best
       polynomial R^2 ``< 0.3``. ``best_degree`` and ``coeffs`` are ``None``.
    2. ``"monótona no-lineal"``: ``abs(spearman) - abs(pearson) >= 0.1`` and
       ``abs(spearman) >= 0.4``. No polynomial is reported
       (``best_degree`` and ``coeffs`` are ``None``).
    3. ``"polinómica (grado N)"``: the best polynomial R^2 exceeds
       ``r2_linear`` by at least 0.1 and is itself ``>= 0.3``. N is 3 only
       when degree 3 beats degree 2 by more than 0.02 in R^2, otherwise 2.
    4. ``"lineal"``: everything else; ``best_degree`` is 1 and ``coeffs``
       holds the degree-1 fit.

    The canonical weak result (a copy of ``_WEAK``) is returned directly when
    either input is ``None``, when fewer than 5 valid pairs remain, or when
    either cleaned series is constant (fewer than 2 distinct values, or
    variance below 1e-15).

    Args:
        xs: values of the first variable, paired by index with ``ys``.
        ys: values of the second variable, paired by index with ``xs``.

    Returns:
        dict with the keys ``tipo``, ``pearson``, ``r2_linear``, ``spearman``,
        ``r2_poly2``, ``r2_poly3``, ``best_degree`` and ``coeffs``. Statistics
        are rounded to 6 decimals; ``coeffs`` is in ``numpy.polyval`` order
        (highest degree first), rounded to 8 decimals, or ``None``.

    Never raises: any exception raised inside the body is swallowed and the
    canonical weak result is returned instead.
    """
    try:
        if xs is None or ys is None:
            return dict(_WEAK)

        # Keep only the pairs in which both members are finite real numbers.
        clean_x, clean_y = [], []
        for x_val, y_val in zip(xs, ys):
            if _is_num(x_val) and _is_num(y_val):
                clean_x.append(float(x_val))
                clean_y.append(float(y_val))

        # Too few points to talk about the shape of a relationship.
        if len(clean_x) < 5:
            return dict(_WEAK)

        # A (near-)constant series leaves the relationship undefined.
        if len(set(clean_x)) < 2 or len(set(clean_y)) < 2:
            return dict(_WEAK)
        x_arr = np.asarray(clean_x, dtype=float)
        y_arr = np.asarray(clean_y, dtype=float)
        if float(np.var(x_arr)) < 1e-15 or float(np.var(y_arr)) < 1e-15:
            return dict(_WEAK)

        # Correlations come from the shared registry helpers.
        r = pearson(clean_x, clean_y)
        spearman = spearman_corr(clean_x, clean_y)
        r2_linear = r ** 2

        # Polynomial fits of degree 1-3; R^2 is computed by hand for 2 and 3.
        ss_tot = float(np.sum((y_arr - float(np.mean(y_arr))) ** 2))
        with warnings.catch_warnings():
            # polyfit may warn on poorly conditioned fits; keep that quiet.
            warnings.simplefilter("ignore")
            fits = {deg: np.polyfit(x_arr, y_arr, deg) for deg in (1, 2, 3)}
        r2_poly2 = _poly_r2(fits[2], x_arr, y_arr, ss_tot)
        r2_poly3 = _poly_r2(fits[3], x_arr, y_arr, ss_tot)

        mejor_poly = max(r2_poly2, r2_poly3)
        # Parsimony: degree 3 must beat degree 2 by more than 0.02 to win.
        poly_degree = 3 if (r2_poly3 - r2_poly2) > 0.02 else 2

        abs_s = abs(spearman)
        abs_p = abs(r)

        # Weak and monotonic labels report no fitted model, so start from
        # "nothing chosen" and only fill these in for the fitted labels.
        best_degree = None
        chosen = None
        if abs_p < 0.3 and abs_s < 0.3 and mejor_poly < 0.3:
            # No signal clears the floor: weak / shapeless relationship.
            tipo = "débil/sin forma"
        elif (abs_s - abs_p) >= 0.1 and abs_s >= 0.4:
            # Rank correlation sees a monotonic trend that Pearson misses.
            tipo = "monótona no-lineal"
        elif (mejor_poly - r2_linear) >= 0.1 and mejor_poly >= 0.3:
            tipo = f"polinómica (grado {poly_degree})"
            best_degree = poly_degree
            chosen = fits[poly_degree]
        else:
            # There is signal, and it is neither rank-dominated nor curved.
            tipo = "lineal"
            best_degree = 1
            chosen = fits[1]

        coeffs = None
        if chosen is not None:
            coeffs = [round(float(c), 8) for c in chosen]

        return {
            "tipo": tipo,
            "pearson": round(float(r), 6),
            "r2_linear": round(float(r2_linear), 6),
            "spearman": round(float(spearman), 6),
            "r2_poly2": round(float(r2_poly2), 6),
            "r2_poly3": round(float(r2_poly3), 6),
            "best_degree": best_degree,
            "coeffs": coeffs,
        }
    except Exception:
        # Documented no-throw contract: any failure yields the weak result.
        return dict(_WEAK)
|
||||
@@ -0,0 +1,174 @@
|
||||
"""Tests para classify_relationship_type."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from classify_relationship_type import classify_relationship_type
|
||||
|
||||
# Claves que el dict de salida debe contener SIEMPRE.
|
||||
_EXPECTED_KEYS = {
|
||||
"tipo", "pearson", "r2_linear", "spearman",
|
||||
"r2_poly2", "r2_poly3", "best_degree", "coeffs",
|
||||
}
|
||||
|
||||
|
||||
def _assert_shape(r):
    """Check that *r* is a dict holding exactly the 8 canonical keys."""
    assert isinstance(r, dict)
    found = set(r.keys())
    assert found == _EXPECTED_KEYS
|
||||
|
||||
|
||||
def test_lineal():
    """Golden case: y = 2x + 1 with small noise is 'lineal', best_degree 1."""
    noise = np.random.default_rng(42)
    grid = np.linspace(0.0, 10.0, 50)
    observed = 2.0 * grid + 1.0 + noise.normal(0.0, 0.3, 50)

    result = classify_relationship_type(list(grid), list(observed))
    _assert_shape(result)

    assert result["tipo"] == "lineal"
    assert result["best_degree"] == 1
    assert result["r2_linear"] >= 0.5
    # coeffs is [slope, intercept] of the degree-1 fit.
    fitted = result["coeffs"]
    assert fitted is not None and len(fitted) == 2
    assert abs(fitted[0] - 2.0) < 0.1  # slope close to 2
    assert abs(fitted[1] - 1.0) < 0.3  # intercept close to 1
|
||||
|
||||
|
||||
def test_polinomica_cuadratica():
    """Golden case: y = x**2 on [-10, 10] is 'polinómica' with degree 2."""
    grid = np.linspace(-10.0, 10.0, 60)
    parabola = grid ** 2

    result = classify_relationship_type(list(grid), list(parabola))
    _assert_shape(result)

    assert result["tipo"].startswith("polinómica")
    degree = result["best_degree"]
    assert degree in (2, 3)
    # A perfect parabola is already captured by degree 2 (parsimony rule).
    assert degree == 2
    assert result["r2_poly2"] > 0.99
    fitted = result["coeffs"]
    assert fitted is not None and len(fitted) == result["best_degree"] + 1
|
||||
|
||||
|
||||
def test_monotona_no_lineal():
    """Golden case: heavy-tailed convex monotonic curve is 'monótona no-lineal'.

    y = 1 / (N + 1 - i)**2 is strictly increasing, so the rank correlation is
    strong, while its explosive tail is intended to be poorly captured by a
    straight line. The construction is deterministic (no randomness).
    """
    n = 200
    idx = np.arange(n, dtype=float)
    curve = 1.0 / (n + 1 - idx) ** 2

    result = classify_relationship_type(list(idx), list(curve))
    _assert_shape(result)

    assert result["tipo"] == "monótona no-lineal"
    assert result["best_degree"] is None
    assert result["coeffs"] is None
    # Spearman is strong and clearly above Pearson.
    rho = abs(result["spearman"])
    assert rho >= 0.5
    assert rho - abs(result["pearson"]) >= 0.15
|
||||
|
||||
|
||||
def test_monotona_exponencial():
    """Acceptance case: y = exp(x) is labelled 'monótona no-lineal'.

    exp is strictly increasing, so the rank correlation is maximal, and the
    test requires Pearson to trail Spearman by at least 0.1, which is what
    makes the rank-dominance rule fire before the polynomial or linear ones.
    """
    grid = np.linspace(0.0, 5.0, 80)
    curve = np.exp(grid)

    result = classify_relationship_type(list(grid), list(curve))
    _assert_shape(result)

    assert result["tipo"] == "monótona no-lineal"
    assert result["best_degree"] is None and result["coeffs"] is None
    rho = abs(result["spearman"])
    assert rho >= 0.9
    assert rho - abs(result["pearson"]) >= 0.1
|
||||
|
||||
|
||||
def test_debil_sin_forma():
    """Golden case: independent x and y (fixed seed) are 'débil/sin forma'."""
    gen = np.random.default_rng(0)
    first = gen.normal(0.0, 1.0, 200)
    second = gen.normal(0.0, 1.0, 200)

    result = classify_relationship_type(list(first), list(second))
    _assert_shape(result)

    assert result["tipo"] == "débil/sin forma"
    assert result["best_degree"] is None
    assert result["coeffs"] is None
    # Every signal is low.
    assert abs(result["pearson"]) < 0.3
    assert result["r2_linear"] < 0.1
|
||||
|
||||
|
||||
def test_lista_vacia_no_lanza():
    """Edge case: empty lists give the canonical weak dict without raising."""
    result = classify_relationship_type([], [])
    _assert_shape(result)
    assert result["tipo"] == "débil/sin forma"
    # Every remaining key of the canonical weak result is None.
    for key in (
        "pearson",
        "r2_linear",
        "spearman",
        "r2_poly2",
        "r2_poly3",
        "best_degree",
        "coeffs",
    ):
        assert result[key] is None
|
||||
|
||||
|
||||
def test_longitudes_distintas_no_lanza():
    """Edge case: inputs of different length are paired by index, no raise."""
    # zip truncates to the shorter input: only 3 pairs (< 5), hence weak.
    longer = [1, 2, 3, 4, 5, 6, 7, 8]
    shorter = [1.0, 2.0, 3.0]
    result = classify_relationship_type(longer, shorter)
    _assert_shape(result)
    assert result["tipo"] == "débil/sin forma"
    assert result["best_degree"] is None
|
||||
|
||||
|
||||
def test_todos_none_no_lanza():
    """Edge case: all-None inputs leave no valid pair, so weak and no raise."""
    blanks_x = [None] * 6
    blanks_y = [None] * 6
    result = classify_relationship_type(blanks_x, blanks_y)
    _assert_shape(result)
    assert result["tipo"] == "débil/sin forma"
    assert result["coeffs"] is None
|
||||
|
||||
|
||||
def test_entradas_none_no_lanza():
    """Edge case: xs and/or ys passed as None give weak without raising."""
    both_missing = classify_relationship_type(None, None)
    assert both_missing["tipo"] == "débil/sin forma"
    ys_missing = classify_relationship_type([1.0, 2.0], None)
    assert ys_missing["tipo"] == "débil/sin forma"
|
||||
|
||||
|
||||
def test_constante_no_lanza():
    """Edge case: a constant ys (zero variance) gives weak without raising."""
    rising = [1, 2, 3, 4, 5, 6, 7]
    flat = [5] * 7
    result = classify_relationship_type(rising, flat)
    _assert_shape(result)
    assert result["tipo"] == "débil/sin forma"
|
||||
|
||||
|
||||
def test_filtra_nan_inf_bool():
    """Edge case: pairs holding NaN/inf/bool/None are dropped by index."""
    not_a_number = float("nan")
    infinite = float("inf")
    # Only indices 0-4 survive (5 pairs) and they lie on a perfect line.
    xs = [0.0, 1.0, 2.0, 3.0, 4.0, not_a_number, infinite, True, None]
    ys = [1.0, 3.0, 5.0, 7.0, 9.0, 1.0, 2.0, 3.0, 4.0]
    result = classify_relationship_type(xs, ys)
    _assert_shape(result)
    # The 5 valid pairs are exactly y = 2x + 1, hence linear.
    assert result["tipo"] == "lineal"
    assert result["best_degree"] == 1
|
||||
@@ -4,10 +4,10 @@ name: column_quality_score
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
version: "2.0.0"
|
||||
purity: pure
|
||||
signature: "def column_quality_score(col: dict) -> dict"
|
||||
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda, con desglose completeness/validity/consistency y lista de issues legibles. Funcion pura, no muta el input."
|
||||
description: "Calcula un score de calidad de datos 0-100 para un ColumnProfile del grupo eda. Combina completeness (0.6) y validity (0.4) con renormalizacion por aplicabilidad; los outliers, columnas constantes e ids NO bajan el score (van a observations). Devuelve desglose por dimension, issues (defectos) y observations (señales analiticas). Funcion pura, no muta el input."
|
||||
tags: [eda, data-quality, profiling, scoring, datascience]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
@@ -17,20 +17,26 @@ error_type: ""
|
||||
imports: []
|
||||
example: |
|
||||
from datascience import column_quality_score
|
||||
col = {"name": "precio", "inferred_type": "float", "null_pct": 0.2,
|
||||
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 0.08}}
|
||||
col = {"name": "precio", "inferred_type": "numeric", "null_pct": 0.2,
|
||||
"unique_pct": 0.4, "flags": [], "numeric": {"outlier_pct": 8.0}}
|
||||
column_quality_score(col)
|
||||
# {"score": 86.8, "completeness": 0.8, "validity": 0.92,
|
||||
# "consistency": 1.0, "issues": ["20% nulos", "8% outliers"]}
|
||||
# {"score": 88.0, "completeness": 0.8, "validity": 1.0,
|
||||
# "applicable": ["completeness", "validity"], "issues": ["20% nulos"],
|
||||
# "observations": ["8% de valores atípicos (z-score>3): ..."]}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_clean_column_high_score"
|
||||
- "test_half_null_lowers_completeness_and_score"
|
||||
- "test_constant_column_flags_issue"
|
||||
- "test_weights_60_40_native_type"
|
||||
- "test_outliers_do_not_penalize_score"
|
||||
- "test_nulls_lower_score_more_than_outliers"
|
||||
- "test_validity_from_parse_rate_lowers_score"
|
||||
- "test_validity_from_match_rate"
|
||||
- "test_free_text_renormalizes_to_completeness_only"
|
||||
- "test_all_null_column_scores_zero"
|
||||
- "test_constant_column_scores_full_and_is_observation"
|
||||
- "test_high_cardinality_id_scores_full_and_is_observation"
|
||||
- "test_mostly_null_no_double_counts_validity"
|
||||
- "test_empty_dict_does_not_crash"
|
||||
- "test_outliers_penalize_validity"
|
||||
- "test_mostly_null_flag_halves_validity"
|
||||
- "test_high_cardinality_text_flagged_as_id"
|
||||
- "test_none_values_treated_defensively"
|
||||
- "test_does_not_mutate_input"
|
||||
test_file_path: "python/functions/datascience/column_quality_score_test.py"
|
||||
@@ -38,16 +44,22 @@ file_path: "python/functions/datascience/column_quality_score.py"
|
||||
params:
|
||||
- name: col
|
||||
desc: >
|
||||
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb).
|
||||
Se leen sus claves de forma defensiva con .get(...) y se toleran valores
|
||||
None. Claves usadas: null_pct (0-1), inferred_type, semantic_type,
|
||||
unique_pct (0-1), flags (list[str], reconoce "constant"/"mostly_null"),
|
||||
numeric ({outlier_pct: 0-1, ...}|None) y match_rate (opcional, 0-1).
|
||||
ColumnProfile dict del grupo eda (p.ej. salida de summarize_table_duckdb /
|
||||
profile_table). Se leen sus claves de forma defensiva con .get(...) y se
|
||||
toleran valores None. Claves usadas: null_pct (0-1), n_rows, empty_count
|
||||
(texto), inferred_type, semantic_type, validity_rate (0-1, lo expone
|
||||
profile_table al promocionar texto a numero/fecha), match_rate (0-1),
|
||||
unique_pct (0-1), flags (list[str], reconoce
|
||||
"constant"/"possible_id"/"high_cardinality") y numeric ({outlier_pct: 0-100,
|
||||
skew, ...}|None).
|
||||
output: >
|
||||
dict con score (float 0-100, redondeado a 1 decimal), completeness (0-1),
|
||||
validity (0-1), consistency (0-1) e issues (list[str] de descripciones
|
||||
legibles de los problemas detectados). score = round(100 * (0.5*completeness
|
||||
+ 0.3*validity + 0.2*consistency), 1).
|
||||
dict con score (float 0-100, 1 decimal), completeness (0-1), validity (0-1 o
|
||||
None si no aplicable), dimensions ({completeness, validity}), applicable
|
||||
(list[str] de dimensiones que entraron en el score), issues (list[str] SOLO de
|
||||
defectos de calidad: nulos, vacios, valores no conformes) y observations
|
||||
(list[str] de señales analiticas que NO bajan el score: outliers, columna
|
||||
constante, posible id, asimetria). score = round(100 * (0.6*completeness +
|
||||
0.4*validity) / pesos_aplicables, 1), renormalizado cuando validity no aplica.
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
@@ -59,51 +71,71 @@ from datascience import column_quality_score
|
||||
col = {
|
||||
"name": "precio",
|
||||
"physical_type": "DOUBLE",
|
||||
"inferred_type": "float",
|
||||
"inferred_type": "numeric",
|
||||
"semantic_type": "",
|
||||
"count": 800,
|
||||
"n_rows": 1000,
|
||||
"null_count": 200,
|
||||
"null_pct": 0.20,
|
||||
"distinct_count": 400,
|
||||
"unique_pct": 0.40,
|
||||
"flags": [],
|
||||
"numeric": {"outlier_pct": 0.08},
|
||||
"numeric": {"outlier_pct": 8.0, "skew": 0.3},
|
||||
"categorical": None,
|
||||
"datetime": None,
|
||||
}
|
||||
|
||||
column_quality_score(col)
|
||||
# {
|
||||
# "score": 86.8,
|
||||
# "completeness": 0.8, # 1 - 0.20
|
||||
# "validity": 0.92, # 1 - min(0.08, 0.3)
|
||||
# "consistency": 1.0,
|
||||
# "issues": ["20% nulos", "8% outliers"],
|
||||
# "score": 88.0, # 100 * (0.6*0.8 + 0.4*1.0)
|
||||
# "completeness": 0.8, # 1 - 0.20
|
||||
# "validity": 1.0, # numerica nativa: el tipo es conforme
|
||||
# "dimensions": {"completeness": 0.8, "validity": 1.0},
|
||||
# "applicable": ["completeness", "validity"],
|
||||
# "issues": ["20% nulos"], # SOLO defectos de calidad
|
||||
# "observations": ["8% de valores atípicos (z-score>3): ..."], # NO bajan score
|
||||
# }
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando hayas perfilado una tabla con el grupo `eda` (p.ej.
|
||||
`summarize_table_duckdb`) y necesites un numero 0-100 por columna para
|
||||
ordenar/priorizar limpieza de datos, pintar semaforos de calidad en un
|
||||
dashboard, o decidir que columnas descartar antes de modelar. Es la capa de
|
||||
scoring sobre el ColumnProfile crudo: lee el perfil, no toca los datos.
|
||||
`summarize_table_duckdb` / `profile_table`) y necesites un numero 0-100 por
|
||||
columna para ordenar/priorizar limpieza de datos, pintar semaforos de calidad,
|
||||
o decidir que columnas descartar antes de modelar. Separa los **defectos de
|
||||
calidad reales** (`issues`: nulos, vacios, valores que no parsean a su tipo) de
|
||||
las **observaciones analiticas** (`observations`: outliers, columnas constantes,
|
||||
ids), que se reportan pero no penalizan. Es la capa de scoring sobre el
|
||||
ColumnProfile crudo: lee el perfil, no toca los datos.
|
||||
|
||||
## Notas
|
||||
## Gotchas
|
||||
|
||||
Funcion pura, sin I/O ni dependencias externas, no muta `col`. Lee todas las
|
||||
claves con `.get(...)` y tolera que vengan en `None` (un ColumnProfile recien
|
||||
salido de `summarize_table_duckdb` trae muchas claves a `None`), por lo que
|
||||
nunca falla por claves ausentes — un `{}` produce un resultado bien definido.
|
||||
Funcion pura, sin I/O, no muta `col`. Aun asi conviene saber:
|
||||
|
||||
Pesos del score: completeness 0.5, validity 0.3, consistency 0.2.
|
||||
- **Los outliers NO bajan el score.** Un valor extremo puede ser real y correcto
|
||||
(un cliente que compra mucho); detectar atipicos es analisis de la
|
||||
distribucion, no un juicio de correccion. Salen en `observations`, no en
|
||||
`issues`. Mismo trato para columnas constantes e identificadores de alta
|
||||
cardinalidad: son observaciones, no defectos.
|
||||
- **`validity` puede ser `None`** (no aplicable): texto libre sin `semantic_type`
|
||||
ni `validity_rate`, o columna 100% nula. En ese caso el score se renormaliza a
|
||||
solo `completeness` (la columna no se premia ni castiga por algo no medible).
|
||||
- **`outlier_pct` se interpreta en escala 0-100** (la que emite
|
||||
`describe_numeric`, z-score>3). Pasar una fraccion 0-1 produce un texto de
|
||||
observacion con el % equivocado, pero NUNCA afecta al score.
|
||||
- **`validity_rate` lo puebla `profile_table`** al promocionar una columna de
|
||||
texto a numero/fecha (fraccion que parsea). Si no esta presente y el tipo es
|
||||
nativo numerico/fecha/bool, `validity = 1.0`.
|
||||
- Sin doble conteo: la falta de datos cuenta solo en `completeness` (el antiguo
|
||||
castigo de `mostly_null` sobre `validity` se elimino).
|
||||
|
||||
- **completeness** = `1 - null_pct` (None -> 0 nulls -> 1.0).
|
||||
- **validity**: parte de 1.0 y penaliza `min(outlier_pct, 0.3)` en columnas
|
||||
numericas, `0.5 * (1 - match_rate)` si hay `semantic_type` declarado con
|
||||
`match_rate` bajo disponible, y multiplica por 0.5 si el flag `mostly_null`
|
||||
esta presente.
|
||||
- **consistency**: 1.0 salvo flag `constant` (-> 0.3, columna poco informativa)
|
||||
o texto con `unique_pct > 0.9` (-> 0.6, posible id de alta cardinalidad).
|
||||
## Capability growth log
|
||||
|
||||
- v2.0.0 (2026-06-30) — nueva formula de calidad (report 2046): pesos 60/40
|
||||
(completeness/validity) con renormalizacion por aplicabilidad; se elimina la
|
||||
dimension `consistency`-como-informatividad y el doble castigo de
|
||||
`mostly_null`; los outliers/constantes/ids salen del score a `observations`;
|
||||
validity mide conformidad real (parse rate / match rate / tipo nativo). Salida
|
||||
ampliada con `dimensions`, `applicable` y `observations`.
|
||||
- v1.0.0 — version inicial: pesos 50/30/20 (completeness/validity/consistency),
|
||||
los outliers penalizaban validity (con bug de escala) y consistency penalizaba
|
||||
informatividad.
|
||||
|
||||
@@ -1,34 +1,78 @@
|
||||
"""Score de calidad de datos (0-100) para un ColumnProfile del grupo eda.
|
||||
|
||||
Funcion pura: dado el perfil de una columna producido por el grupo de
|
||||
capacidad `eda` (p.ej. summarize_table_duckdb), calcula un score agregado
|
||||
de calidad junto a su desglose en completeness / validity / consistency y
|
||||
una lista de issues legibles. No realiza I/O ni muta el input.
|
||||
capacidad `eda` (p.ej. summarize_table_duckdb / profile_table), calcula un
|
||||
score agregado de calidad junto a su desglose por dimension y dos listas
|
||||
legibles separadas: `issues` (defectos de calidad reales que SI bajan el
|
||||
score) y `observations` (señales analiticas que NO bajan el score). No
|
||||
realiza I/O ni muta el input.
|
||||
|
||||
Modelo (DAMA-DMBOK / ISO 8000), ver report 2046:
|
||||
|
||||
- Solo entran en el score las dimensiones medibles automaticamente desde el
|
||||
perfil, sin fuente externa de verdad: completeness y validity por columna.
|
||||
- Renormalizacion por aplicabilidad: si una dimension no es medible en la
|
||||
columna (texto libre sin semantica -> validity no aplica; columna 100% nula
|
||||
-> validity no medible), se excluye y los pesos se renormalizan sobre las
|
||||
aplicables. Una columna ni se premia ni se castiga por algo no medible.
|
||||
- Sin doble conteo: la falta de datos cuenta solo en completeness (se elimino
|
||||
el antiguo castigo extra de `mostly_null` sobre validity).
|
||||
- Los OUTLIERS NO bajan la calidad. Un valor extremo puede ser real y
|
||||
correcto; detectar atipicos es analisis de la distribucion, no un juicio de
|
||||
correccion. Outliers, columnas constantes e identificadores de alta
|
||||
cardinalidad pasan a `observations`, nunca a `issues`.
|
||||
"""
|
||||
|
||||
|
||||
# Pesos base de las dimensiones de columna (se renormalizan por aplicabilidad).
|
||||
_W_COMPLETENESS = 0.6
|
||||
_W_VALIDITY = 0.4
|
||||
|
||||
# Tipos inferidos cuyo almacen garantiza la conformidad de tipo (validity=1.0)
|
||||
# cuando NO vienen de una promocion de texto (en cuyo caso manda validity_rate).
|
||||
_NATIVE_TYPED = ("numeric", "integer", "float", "datetime", "date", "boolean", "bool")
|
||||
|
||||
|
||||
def column_quality_score(col: dict) -> dict:
|
||||
"""Calcula un score de calidad de datos 0-100 para un ColumnProfile.
|
||||
|
||||
El score pondera tres dimensiones:
|
||||
- completeness (0.5): proporcion de valores no nulos.
|
||||
- validity (0.3): ausencia de outliers / heuristicas de validez.
|
||||
- consistency (0.2): la columna aporta informacion (no constante, no ruido).
|
||||
El score combina solo dimensiones de calidad medibles desde el perfil, con
|
||||
renormalizacion por aplicabilidad:
|
||||
|
||||
- completeness (peso base 0.6, siempre aplica): proporcion de valores
|
||||
presentes = 1 - null_pct. En texto, las celdas vacias (`empty_count`)
|
||||
tambien cuentan como faltantes.
|
||||
- validity (peso base 0.4, cuando hay un criterio de validacion real):
|
||||
fraccion de valores no nulos conformes a su tipo/semantica. Tipo nativo
|
||||
numerico/fecha/bool = 1.0; texto promovido a numero/fecha = parse rate
|
||||
(`validity_rate`); texto con `semantic_type` regexable = `match_rate`;
|
||||
texto libre o columna 100% nula = NO aplicable (renormaliza a solo
|
||||
completeness).
|
||||
|
||||
Los outliers, columnas constantes, identificadores y asimetria fuerte NO
|
||||
bajan el score: se devuelven en `observations`.
|
||||
|
||||
Args:
|
||||
col: ColumnProfile dict del grupo eda. Se leen las claves de forma
|
||||
defensiva con .get(...) y se tolera que muchas vengan en None.
|
||||
Claves relevantes: null_pct, inferred_type, semantic_type,
|
||||
unique_pct, flags (list[str]), numeric ({outlier_pct, ...}|None),
|
||||
match_rate (opcional).
|
||||
Claves relevantes: null_pct (0-1), n_rows, empty_count,
|
||||
inferred_type, semantic_type, validity_rate (0-1, lo expone
|
||||
profile_table al promocionar texto a numero/fecha), match_rate
|
||||
(0-1), unique_pct (0-1), flags (list[str], reconoce
|
||||
"constant"/"possible_id"/"high_cardinality"), numeric
|
||||
({outlier_pct: 0-100, skew, ...}|None).
|
||||
|
||||
Returns:
|
||||
dict con:
|
||||
score (float, 0-100, redondeado a 1 decimal),
|
||||
completeness (float, 0-1),
|
||||
validity (float, 0-1),
|
||||
consistency (float, 0-1),
|
||||
issues (list[str]) descripciones legibles de los problemas.
|
||||
score (float 0-100, redondeado a 1 decimal),
|
||||
completeness (float 0-1),
|
||||
validity (float 0-1 | None si no aplicable),
|
||||
dimensions ({completeness, validity}),
|
||||
applicable (list[str] de dimensiones que entraron en el score),
|
||||
issues (list[str]) SOLO defectos de calidad (nulos, vacios,
|
||||
valores no conformes a su tipo/semantica),
|
||||
observations (list[str]) señales analiticas que NO bajan el score
|
||||
(outliers, columna constante, posible id, asimetria).
|
||||
"""
|
||||
if not isinstance(col, dict):
|
||||
col = {}
|
||||
@@ -39,103 +83,153 @@ def column_quality_score(col: dict) -> dict:
|
||||
flags = set(flags)
|
||||
|
||||
issues: list[str] = []
|
||||
observations: list[str] = []
|
||||
|
||||
inferred_type = col.get("inferred_type") or ""
|
||||
semantic_type = col.get("semantic_type") or ""
|
||||
|
||||
# --- completeness -------------------------------------------------
|
||||
null_pct = col.get("null_pct")
|
||||
if null_pct is None:
|
||||
null_pct = 0.0
|
||||
try:
|
||||
null_pct = float(null_pct)
|
||||
except (TypeError, ValueError):
|
||||
null_pct = 0.0
|
||||
null_pct = _clamp(null_pct, 0.0, 1.0)
|
||||
# Falta de datos = nulos + (en texto) celdas vacias. Es el unico sitio
|
||||
# donde la falta de datos cuenta: nunca se duplica en validity.
|
||||
null_pct = _clamp(_num(col.get("null_pct"), 0.0), 0.0, 1.0)
|
||||
completeness = 1.0 - null_pct
|
||||
if null_pct > 0:
|
||||
issues.append(f"{round(null_pct * 100)}% nulos")
|
||||
issues.append(f"{_pct(null_pct)} nulos")
|
||||
|
||||
# --- validity -----------------------------------------------------
|
||||
validity = 1.0
|
||||
inferred_type = col.get("inferred_type") or ""
|
||||
empty_frac = 0.0
|
||||
n_rows = col.get("n_rows")
|
||||
empty_count = col.get("empty_count")
|
||||
if (
|
||||
isinstance(n_rows, (int, float)) and not isinstance(n_rows, bool) and n_rows > 0
|
||||
and isinstance(empty_count, (int, float)) and not isinstance(empty_count, bool)
|
||||
and empty_count > 0
|
||||
):
|
||||
empty_frac = _clamp(float(empty_count) / float(n_rows), 0.0, 1.0)
|
||||
completeness = _clamp(completeness - empty_frac, 0.0, 1.0)
|
||||
issues.append(f"{_pct(empty_frac)} vacíos")
|
||||
|
||||
numeric = col.get("numeric")
|
||||
is_numeric = inferred_type in ("integer", "float", "numeric") or isinstance(numeric, dict)
|
||||
if isinstance(numeric, dict):
|
||||
outlier_pct = numeric.get("outlier_pct")
|
||||
if outlier_pct is not None:
|
||||
try:
|
||||
outlier_pct = float(outlier_pct)
|
||||
except (TypeError, ValueError):
|
||||
outlier_pct = 0.0
|
||||
outlier_pct = _clamp(outlier_pct, 0.0, 1.0)
|
||||
if outlier_pct > 0:
|
||||
penalty = min(outlier_pct, 0.3)
|
||||
validity -= penalty
|
||||
issues.append(f"{round(outlier_pct * 100)}% outliers")
|
||||
|
||||
# semantic_type declarado pero con baja tasa de match (si la conocemos).
|
||||
semantic_type = col.get("semantic_type") or ""
|
||||
match_rate = col.get("match_rate")
|
||||
if semantic_type and match_rate is not None:
|
||||
try:
|
||||
match_rate = float(match_rate)
|
||||
except (TypeError, ValueError):
|
||||
match_rate = None
|
||||
if match_rate is not None:
|
||||
match_rate = _clamp(match_rate, 0.0, 1.0)
|
||||
if match_rate < 1.0:
|
||||
shortfall = 1.0 - match_rate
|
||||
validity -= 0.5 * shortfall
|
||||
issues.append(
|
||||
f"semantic_type '{semantic_type}' con baja coincidencia "
|
||||
f"({round(match_rate * 100)}%)"
|
||||
)
|
||||
|
||||
if "mostly_null" in flags:
|
||||
validity *= 0.5
|
||||
issues.append("mayoritariamente nula")
|
||||
|
||||
validity = _clamp(validity, 0.0, 1.0)
|
||||
|
||||
# --- consistency --------------------------------------------------
|
||||
consistency = 1.0
|
||||
if "constant" in flags:
|
||||
consistency = 0.3
|
||||
issues.append("columna constante")
|
||||
# --- validity (con renormalizacion por aplicabilidad) -------------
|
||||
# None = no medible -> se excluye del score (no penaliza ni premia).
|
||||
validity = None
|
||||
if completeness <= 0.0:
|
||||
# Columna 100% faltante: no hay valores no nulos sobre los que medir
|
||||
# conformidad. validity no aplica -> el score sale solo de completeness
|
||||
# (= 0). Es el peor defecto de calidad posible.
|
||||
validity = None
|
||||
else:
|
||||
unique_pct = col.get("unique_pct")
|
||||
if unique_pct is not None:
|
||||
try:
|
||||
unique_pct = float(unique_pct)
|
||||
except (TypeError, ValueError):
|
||||
unique_pct = None
|
||||
if (
|
||||
inferred_type == "text"
|
||||
validity_rate = col.get("validity_rate")
|
||||
match_rate = col.get("match_rate")
|
||||
if validity_rate is not None:
|
||||
# Texto promovido a numero/fecha: parse rate real de la muestra.
|
||||
v = _num(validity_rate, None)
|
||||
if v is not None:
|
||||
validity = _clamp(v, 0.0, 1.0)
|
||||
if validity < 1.0:
|
||||
kind = (
|
||||
"número" if inferred_type == "numeric"
|
||||
else "fecha" if inferred_type == "datetime"
|
||||
else inferred_type or "su tipo"
|
||||
)
|
||||
issues.append(
|
||||
f"{_pct(1.0 - validity)} no parsea al tipo {kind}"
|
||||
)
|
||||
elif inferred_type in _NATIVE_TYPED:
|
||||
# Tipo nativo garantizado por el almacen: no hay valores que no
|
||||
# parseen. validity = 1.0 (no se confunde con tener outliers).
|
||||
validity = 1.0
|
||||
elif semantic_type and match_rate is not None:
|
||||
v = _num(match_rate, None)
|
||||
if v is not None:
|
||||
validity = _clamp(v, 0.0, 1.0)
|
||||
if validity < 1.0:
|
||||
issues.append(
|
||||
f"{_pct(1.0 - validity)} no casa con el "
|
||||
f"formato «{semantic_type}»"
|
||||
)
|
||||
else:
|
||||
# Texto libre / categorica sin semantica: no hay criterio honesto
|
||||
# de validez. No aplica.
|
||||
validity = None
|
||||
|
||||
# --- observations (NO bajan el score) -----------------------------
|
||||
numeric = col.get("numeric")
|
||||
if isinstance(numeric, dict):
|
||||
# outlier_pct viene en escala 0-100 desde describe_numeric (z-score>3).
|
||||
outlier_pct = _num(numeric.get("outlier_pct"), None)
|
||||
if outlier_pct is not None and outlier_pct >= 0.05:
|
||||
observations.append(
|
||||
f"{_pct(outlier_pct / 100.0)} de valores atípicos (z-score>3): "
|
||||
"revisar si son errores u observaciones legítimas"
|
||||
)
|
||||
skew = _num(numeric.get("skew"), None)
|
||||
if skew is not None and abs(skew) >= 1.0:
|
||||
observations.append(
|
||||
f"asimetría fuerte (skew={round(skew, 2)}): considerar "
|
||||
"re-expresión antes de modelar"
|
||||
)
|
||||
|
||||
if "constant" in flags:
|
||||
observations.append(
|
||||
"columna constante: aporta poca información para el análisis"
|
||||
)
|
||||
|
||||
unique_pct = _num(col.get("unique_pct"), None)
|
||||
is_id = (
|
||||
"possible_id" in flags
|
||||
or "high_cardinality" in flags
|
||||
or (
|
||||
inferred_type in ("text", "categorical")
|
||||
and unique_pct is not None
|
||||
and _clamp(unique_pct, 0.0, 1.0) > 0.9
|
||||
):
|
||||
consistency = 0.6
|
||||
issues.append("posible id de alta cardinalidad")
|
||||
|
||||
consistency = _clamp(consistency, 0.0, 1.0)
|
||||
|
||||
# --- score agregado ----------------------------------------------
|
||||
score = round(
|
||||
100.0 * (0.5 * completeness + 0.3 * validity + 0.2 * consistency),
|
||||
1,
|
||||
)
|
||||
)
|
||||
if is_id:
|
||||
observations.append(
|
||||
"valores casi únicos: posible identificador (no es un defecto de calidad)"
|
||||
)
|
||||
|
||||
# Silencia warnings sobre la variable de tipo no usada.
|
||||
_ = is_numeric
|
||||
# --- score agregado con renormalizacion ---------------------------
|
||||
applicable = ["completeness"]
|
||||
num = _W_COMPLETENESS * completeness
|
||||
den = _W_COMPLETENESS
|
||||
if validity is not None:
|
||||
applicable.append("validity")
|
||||
num += _W_VALIDITY * validity
|
||||
den += _W_VALIDITY
|
||||
score = round(100.0 * num / den, 1) if den > 0 else 0.0
|
||||
|
||||
return {
|
||||
"score": score,
|
||||
"completeness": completeness,
|
||||
"validity": validity,
|
||||
"consistency": consistency,
|
||||
"dimensions": {"completeness": completeness, "validity": validity},
|
||||
"applicable": applicable,
|
||||
"issues": issues,
|
||||
"observations": observations,
|
||||
}
|
||||
|
||||
|
||||
def _pct(frac: float) -> str:
|
||||
"""Formatea una fraccion 0-1 como porcentaje honesto: «N%» si >=1%, «0.N%»
|
||||
por debajo (para no mostrar «0%» cuando hay un defecto real pequeño)."""
|
||||
p = frac * 100.0
|
||||
if p >= 1.0:
|
||||
return f"{round(p)}%"
|
||||
return f"{p:.1f}%"
|
||||
|
||||
|
||||
def _num(x, default):
|
||||
"""Convierte x a float; devuelve `default` si es None o no parseable."""
|
||||
if x is None:
|
||||
return default
|
||||
if isinstance(x, bool):
|
||||
return default
|
||||
try:
|
||||
return float(x)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _clamp(x: float, lo: float, hi: float) -> float:
|
||||
"""Recorta x al rango [lo, hi]."""
|
||||
if x < lo:
|
||||
|
||||
@@ -1,4 +1,12 @@
|
||||
"""Tests para column_quality_score."""
|
||||
"""Tests para column_quality_score (nueva fórmula, report 2046).
|
||||
|
||||
Verifica las invariantes de la fórmula de calidad:
|
||||
- completeness (0.6) + validity (0.4) con renormalización por aplicabilidad.
|
||||
- Los OUTLIERS no bajan el score (van a observations, no a issues).
|
||||
- Columnas constantes e ids no bajan el score (observations).
|
||||
- Sin doble conteo de la falta de datos.
|
||||
- all-null -> score 0; función pura (no muta el input).
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -9,11 +17,11 @@ from column_quality_score import column_quality_score
|
||||
|
||||
|
||||
def _clean_numeric_col() -> dict:
|
||||
"""ColumnProfile de una columna numerica sana, sin problemas."""
|
||||
"""ColumnProfile de una columna numérica nativa sana, sin problemas."""
|
||||
return {
|
||||
"name": "edad",
|
||||
"physical_type": "INTEGER",
|
||||
"inferred_type": "integer",
|
||||
"inferred_type": "numeric",
|
||||
"semantic_type": "",
|
||||
"count": 1000,
|
||||
"n_rows": 1000,
|
||||
@@ -28,85 +36,163 @@ def _clean_numeric_col() -> dict:
|
||||
}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_clean_column_high_score():
|
||||
out = column_quality_score(_clean_numeric_col())
|
||||
assert out["score"] > 90
|
||||
assert out["score"] == 100.0
|
||||
assert out["completeness"] == 1.0
|
||||
assert out["validity"] == 1.0
|
||||
assert out["consistency"] == 1.0
|
||||
assert out["applicable"] == ["completeness", "validity"]
|
||||
assert out["issues"] == []
|
||||
assert out["observations"] == []
|
||||
|
||||
|
||||
def test_half_null_lowers_completeness_and_score():
|
||||
def test_weights_60_40_native_type():
|
||||
"""30% nulos en numérica nativa: score = 100*(0.6*0.7 + 0.4*1.0) = 82."""
|
||||
col = _clean_numeric_col()
|
||||
col["null_count"] = 500
|
||||
col["null_pct"] = 0.5
|
||||
clean_score = column_quality_score(_clean_numeric_col())["score"]
|
||||
col["null_pct"] = 0.30
|
||||
col["null_count"] = 300
|
||||
out = column_quality_score(col)
|
||||
assert out["completeness"] == 0.5
|
||||
assert out["score"] < clean_score
|
||||
assert any("nulos" in issue for issue in out["issues"])
|
||||
assert out["completeness"] == 0.7
|
||||
assert out["validity"] == 1.0
|
||||
assert out["score"] == 82.0
|
||||
assert any("nulos" in i for i in out["issues"])
|
||||
|
||||
|
||||
def test_constant_column_flags_issue():
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Outliers FUERA del score
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_outliers_do_not_penalize_score():
|
||||
"""Columna con outliers pero sin nulos -> score máximo; outliers en observations."""
|
||||
col = _clean_numeric_col()
|
||||
col["numeric"] = {"outlier_pct": 18.0, "skew": 0.2} # 18% atípicos (escala 0-100)
|
||||
out = column_quality_score(col)
|
||||
assert out["score"] == 100.0 # los outliers NO bajan la calidad
|
||||
assert out["validity"] == 1.0
|
||||
# No aparecen como problema de calidad...
|
||||
assert not any("atípic" in i or "outlier" in i for i in out["issues"])
|
||||
# ...sino como observación analítica.
|
||||
assert any("atípic" in o for o in out["observations"])
|
||||
|
||||
|
||||
def test_nulls_lower_score_more_than_outliers():
|
||||
"""Vacíos sí penalizan; outliers no: comparar las dos columnas."""
|
||||
con_nulos = _clean_numeric_col()
|
||||
con_nulos["null_pct"] = 0.30
|
||||
con_outliers = _clean_numeric_col()
|
||||
con_outliers["numeric"] = {"outlier_pct": 30.0}
|
||||
assert column_quality_score(con_nulos)["score"] < \
|
||||
column_quality_score(con_outliers)["score"]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Validity: aplicabilidad y renormalización
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_validity_from_parse_rate_lowers_score():
|
||||
"""Numérica como texto con 20% basura: validity=0.8 -> score=92."""
|
||||
col = {
|
||||
"name": "precio_txt", "inferred_type": "numeric", "semantic_type": "decimal",
|
||||
"null_pct": 0.0, "validity_rate": 0.80, "flags": [], "numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 0.8
|
||||
assert out["score"] == 92.0 # 100*(0.6 + 0.4*0.8)
|
||||
assert any("no parsea" in i for i in out["issues"])
|
||||
|
||||
|
||||
def test_validity_from_match_rate():
|
||||
"""Texto con semantic_type y 5% no conforme: validity=0.95."""
|
||||
col = {
|
||||
"name": "email", "inferred_type": "text", "semantic_type": "email",
|
||||
"null_pct": 0.0, "match_rate": 0.95, "unique_pct": 0.5, "flags": [],
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 0.95
|
||||
assert out["score"] == 98.0 # 100*(0.6 + 0.4*0.95)
|
||||
assert any("no casa" in i for i in out["issues"])
|
||||
|
||||
|
||||
def test_free_text_renormalizes_to_completeness_only():
|
||||
"""Texto libre sin semántica: validity no aplica -> score = 100*completeness."""
|
||||
col = {
|
||||
"name": "comentario", "inferred_type": "text", "semantic_type": "",
|
||||
"null_pct": 0.30, "unique_pct": 0.5, "flags": [], "numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] is None
|
||||
assert out["applicable"] == ["completeness"]
|
||||
assert out["completeness"] == 0.7
|
||||
assert out["score"] == 70.0 # renormalizado a solo completeness
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Casos límite (report §4.6)
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_all_null_column_scores_zero():
|
||||
col = _clean_numeric_col()
|
||||
col["null_pct"] = 1.0
|
||||
col["null_count"] = 1000
|
||||
out = column_quality_score(col)
|
||||
assert out["completeness"] == 0.0
|
||||
assert out["validity"] is None # no medible sin valores no nulos
|
||||
assert out["score"] == 0.0
|
||||
|
||||
|
||||
def test_constant_column_scores_full_and_is_observation():
|
||||
"""Columna constante: dato válido y completo -> score 100; baja info = observación."""
|
||||
col = _clean_numeric_col()
|
||||
col["flags"] = ["constant"]
|
||||
col["distinct_count"] = 1
|
||||
col["unique_pct"] = 0.001
|
||||
out = column_quality_score(col)
|
||||
assert out["consistency"] == 0.3
|
||||
assert any("constante" in issue for issue in out["issues"])
|
||||
assert out["score"] == 100.0 # NO se castiga la baja informatividad
|
||||
assert not any("constante" in i for i in out["issues"])
|
||||
assert any("constante" in o for o in out["observations"])
|
||||
|
||||
|
||||
def test_high_cardinality_id_scores_full_and_is_observation():
|
||||
"""Id de alta cardinalidad: unicidad perfecta -> score 100; posible id = observación."""
|
||||
col = {
|
||||
"name": "uuid", "inferred_type": "text", "semantic_type": "",
|
||||
"null_pct": 0.0, "unique_pct": 0.99, "flags": ["possible_id"],
|
||||
"numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["score"] == 100.0
|
||||
assert not any("identificador" in i for i in out["issues"])
|
||||
assert any("identificador" in o for o in out["observations"])
|
||||
|
||||
|
||||
def test_mostly_null_no_double_counts_validity():
|
||||
"""85% nulos: solo completeness penaliza; validity nativa sigue 1.0 (sin doble castigo)."""
|
||||
col = _clean_numeric_col()
|
||||
col["null_pct"] = 0.85
|
||||
col["flags"] = ["mostly_null"]
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 1.0 # ya no se multiplica por 0.5
|
||||
# score = 100*(0.6*0.15 + 0.4*1.0) = 49
|
||||
assert out["score"] == 49.0
|
||||
assert not any("mayoritariamente" in o for o in out["observations"])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Robustez
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_empty_dict_does_not_crash():
|
||||
out = column_quality_score({})
|
||||
assert isinstance(out["score"], float)
|
||||
assert out["completeness"] == 1.0
|
||||
assert 0.0 <= out["score"] <= 100.0
|
||||
assert isinstance(out["issues"], list)
|
||||
|
||||
|
||||
def test_outliers_penalize_validity():
|
||||
col = _clean_numeric_col()
|
||||
col["numeric"] = {"outlier_pct": 0.2}
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] < 1.0
|
||||
assert any("outliers" in issue for issue in out["issues"])
|
||||
|
||||
|
||||
def test_mostly_null_flag_halves_validity():
|
||||
col = _clean_numeric_col()
|
||||
col["null_pct"] = 0.85
|
||||
col["flags"] = ["mostly_null"]
|
||||
out = column_quality_score(col)
|
||||
assert out["validity"] == 0.5
|
||||
assert any("mayoritariamente nula" in issue for issue in out["issues"])
|
||||
|
||||
|
||||
def test_high_cardinality_text_flagged_as_id():
|
||||
col = {
|
||||
"name": "uuid",
|
||||
"inferred_type": "text",
|
||||
"semantic_type": "",
|
||||
"null_pct": 0.0,
|
||||
"unique_pct": 0.99,
|
||||
"flags": [],
|
||||
"numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["consistency"] < 1.0
|
||||
assert any("alta cardinalidad" in issue for issue in out["issues"])
|
||||
assert isinstance(out["observations"], list)
|
||||
|
||||
|
||||
def test_none_values_treated_defensively():
|
||||
col = {
|
||||
"name": "x",
|
||||
"inferred_type": None,
|
||||
"semantic_type": None,
|
||||
"null_pct": None,
|
||||
"unique_pct": None,
|
||||
"flags": None,
|
||||
"numeric": None,
|
||||
"name": "x", "inferred_type": None, "semantic_type": None,
|
||||
"null_pct": None, "unique_pct": None, "flags": None, "numeric": None,
|
||||
}
|
||||
out = column_quality_score(col)
|
||||
assert out["completeness"] == 1.0
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
---
|
||||
id: compute_text_duplicates_py_datascience
|
||||
name: compute_text_duplicates
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_text_duplicates(texts, near_threshold=0.85, sample_max=2000) -> dict"
|
||||
description: "Detecta documentos duplicados en un corpus de texto. Los duplicados EXACTOS se calculan siempre con la stdlib: cada documento se normaliza (colapsa espacios, strip, lower) y se hashea con SHA-1; n_exact_dup es cuántos docs repiten uno ya visto y exact_dup_pct su porcentaje. Los CASI-duplicados (near-dup) usan la dependencia OPCIONAL datasketch (MinHash + LSH sobre 3-shingles de palabras); si no está instalada, esa parte degrada a available:False sin afectar al resto. Estilo dict-no-throw del grupo eda — nunca lanza."
|
||||
tags: [eda, datascience, text, nlp, duplicates, minhash, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [hashlib, re]
|
||||
example: |
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
texts = ["El gato come pescado", "El gato come pescado", "Un perro ladra"]
|
||||
result = compute_text_duplicates(texts)
|
||||
# {"n_docs": 3, "n_exact_dup": 1, "exact_dup_pct": 33.33, "n_unique": 2,
|
||||
# "near_dup": {"available": False, "n_near_dup_docs": 0}}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_duplicados_exactos"
|
||||
- "test_sin_duplicados"
|
||||
- "test_vacio"
|
||||
- "test_near_dup_degrada"
|
||||
test_file_path: "python/functions/datascience/compute_text_duplicates_test.py"
|
||||
file_path: "python/functions/datascience/compute_text_duplicates.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista de documentos de texto. Los elementos None o que no sean str se descartan silenciosamente; n_docs cuenta solo los documentos válidos. None como argumento se trata como lista vacía."
|
||||
- name: near_threshold
|
||||
desc: "Umbral de similitud Jaccard (0–1) para considerar dos documentos casi-duplicados en el cálculo near-dup vía MinHashLSH. Solo aplica si datasketch está instalada. Default 0.85."
|
||||
- name: sample_max
|
||||
desc: "Número máximo de documentos muestreados (los primeros) para el cálculo near-dup, que es O(n) en memoria de MinHashes. No afecta al conteo de duplicados exactos, que siempre recorre todo el corpus. Default 2000."
|
||||
output: "Dict con exactamente 5 claves, siempre presentes: n_docs (int, docs válidos), n_exact_dup (int, docs que repiten un texto normalizado ya visto = n_docs - n_unique), exact_dup_pct (float a 2 decimales = n_exact_dup/n_docs*100, o None si el corpus está vacío), n_unique (int, nº de textos normalizados distintos), y near_dup (sub-dict con available:bool y n_near_dup_docs:int; cuando available es True incluye además threshold con el near_threshold usado). La función nunca lanza: captura toda excepción y degrada."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
|
||||
# Tres copias del mismo texto (con espacios/casing distintos) + dos únicos.
|
||||
texts = [
|
||||
"El gato come pescado",
|
||||
"El gato come pescado",
|
||||
"el GATO come pescado", # mismo tras normalizar
|
||||
"Un perro ladra",
|
||||
"La luna brilla",
|
||||
]
|
||||
|
||||
compute_text_duplicates(texts)
|
||||
# {
|
||||
# "n_docs": 5,
|
||||
# "n_exact_dup": 2, # 3 copias del primer texto => 2 repeticiones
|
||||
# "exact_dup_pct": 40.0, # 2 / 5 * 100
|
||||
# "n_unique": 3, # 3 textos normalizados distintos
|
||||
# "near_dup": {"available": False, "n_near_dup_docs": 0}, # datasketch ausente
|
||||
# }
|
||||
|
||||
# Corpus vacío: contrato estable, exact_dup_pct None, sin excepción.
|
||||
compute_text_duplicates([])
|
||||
# {"n_docs": 0, "n_exact_dup": 0, "exact_dup_pct": None, "n_unique": 0,
|
||||
# "near_dup": {"available": False, "n_near_dup_docs": 0}}
|
||||
```
|
||||
|
||||
## Cuándo usarla
|
||||
|
||||
Úsala en la fase de calidad de un EDA de texto, cuando quieras saber cuánto de
|
||||
tu corpus es ruido duplicado antes de entrenar, vectorizar o muestrear: te da
|
||||
el porcentaje de duplicados exactos (`exact_dup_pct`), el número de documentos
|
||||
únicos (`n_unique`) y, si tienes `datasketch` instalada, una estimación de
|
||||
casi-duplicados (paráfrasis, copias con pequeñas ediciones) vía MinHash + LSH.
|
||||
Pásale directamente la columna/lista de textos crudos; la función filtra None y
|
||||
no-str por ti y nunca lanza, así que es segura para encadenar en pipelines de
|
||||
perfilado.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Near-dup requiere `datasketch` (opcional).** Si la librería no está
|
||||
instalada, `near_dup` degrada a `{"available": False, "n_near_dup_docs": 0}`
|
||||
(sin clave `threshold`) y el resto del resultado se calcula igual. Los
|
||||
duplicados **exactos** funcionan siempre porque solo usan la stdlib (hash).
|
||||
- **Normalización de exactos.** Dos textos cuentan como el mismo duplicado
|
||||
exacto si coinciden tras `" ".join(doc.split()).strip().lower()`: se colapsan
|
||||
espacios/tabuladores/saltos, se recortan extremos y se ignora el caso. Cambios
|
||||
de puntuación o acentos SÍ los distinguen (no se eliminan).
|
||||
- **`n_exact_dup` cuenta repeticiones, no grupos.** Con 3 copias de un mismo
|
||||
texto, `n_exact_dup` es 2 (las dos copias extra), no 1. Equivale a
|
||||
`n_docs - n_unique`.
|
||||
- **`exact_dup_pct` es `None` con corpus vacío** (no `ZeroDivisionError`); en
|
||||
cualquier otro caso es un float redondeado a 2 decimales.
|
||||
- **`sample_max` solo limita el near-dup.** El conteo de duplicados exactos
|
||||
recorre todo el corpus; el near-dup muestrea los primeros `sample_max`
|
||||
documentos para acotar memoria. Si el corpus está ordenado, considera barajar
|
||||
antes para que la muestra sea representativa.
|
||||
- **Elementos no-str se descartan.** `True`/`False` no cuentan como str y se
|
||||
ignoran igual que `None`; `n_docs` refleja solo los documentos válidos.
|
||||
@@ -0,0 +1,128 @@
|
||||
"""Detección de documentos duplicados en un corpus de texto.
|
||||
|
||||
Función pura, estilo dict-no-throw del grupo `eda`: nunca lanza, siempre
|
||||
devuelve el mismo contrato de claves. Los duplicados EXACTOS se calculan
|
||||
siempre con la stdlib (normalización + hash SHA-1). Los CASI-duplicados
|
||||
(near-dup) requieren la dependencia opcional `datasketch`; si no está
|
||||
instalada, esa parte degrada limpiamente a ``available: False`` sin afectar
|
||||
al resto del cálculo.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
|
||||
|
||||
def _compute_near_dup(valid, near_threshold, sample_max):
    """Count documents that have at least one near-duplicate (MinHash + LSH).

    ``datasketch`` is imported lazily. If the import or any later step
    fails, the result degrades to
    ``{"available": False, "n_near_dup_docs": 0}`` and nothing is raised.

    Args:
        valid: list of str, already filtered (no None, no non-str items).
        near_threshold: similarity threshold handed to ``MinHashLSH``.
        sample_max: only the first ``sample_max`` documents are examined.

    Returns:
        dict with ``available`` (bool) and ``n_near_dup_docs`` (int). When
        ``available`` is True it also carries ``threshold``.
    """
    try:
        from datasketch import MinHash, MinHashLSH
    except Exception:
        # Deliberately broad: the optional dependency must never break the
        # caller, whatever the reason the import fails.
        return {"available": False, "n_near_dup_docs": 0}

    try:
        docs = valid[:sample_max]
        num_perm = 128
        lsh = MinHashLSH(threshold=near_threshold, num_perm=num_perm)
        minhashes = {}
        # Compile once instead of resolving the pattern for every document.
        find_tokens = re.compile(r"\w+").findall

        for i, doc in enumerate(docs):
            tokens = find_tokens(doc.lower())
            # Word 3-shingles: every run of three consecutive tokens.
            shingles = {
                " ".join(tokens[j:j + 3]) for j in range(len(tokens) - 2)
            }
            if not shingles:
                # Fewer than 3 tokens yields no 3-shingles: fall back to the
                # individual tokens so short documents are not lost.
                shingles = set(tokens)
            if not shingles:
                # No tokens at all (empty string / symbols only): skip.
                continue
            m = MinHash(num_perm=num_perm)
            for sh in shingles:
                m.update(sh.encode("utf-8"))
            key = "d{}".format(i)
            minhashes[key] = m
            lsh.insert(key, m)

        # A document counts when its query returns more than one key.
        # NOTE(review): presumably the query result includes the document's
        # own key, hence "> 1" -- confirm against the datasketch docs.
        n_near = sum(1 for m in minhashes.values() if len(lsh.query(m)) > 1)

        return {
            "available": True,
            "n_near_dup_docs": int(n_near),
            "threshold": near_threshold,
        }
    except Exception:
        return {"available": False, "n_near_dup_docs": 0}


def compute_text_duplicates(texts, near_threshold=0.85, sample_max=2000) -> dict:
    """Detect exact and near-duplicate documents in a text corpus.

    Exact duplicates are found with the standard library only: each document
    is normalised (whitespace runs collapsed to one space, ends trimmed,
    lower-cased) and hashed with SHA-1. Near-duplicates are delegated to
    ``_compute_near_dup``.

    Args:
        texts: iterable of documents. Items that are not ``str`` (including
            None) are discarded; ``None`` as the argument means no documents.
        near_threshold: similarity threshold for the near-duplicate step.
        sample_max: cap on the documents examined by the near-duplicate step.
            It does not affect the exact-duplicate count.

    Returns:
        dict with the keys ``n_docs``, ``n_exact_dup``, ``exact_dup_pct``
        (float rounded to 2 decimals, or None when there are no valid
        documents), ``n_unique`` and ``near_dup`` (the sub-dict returned by
        ``_compute_near_dup``). Failures inside each step are caught and
        degrade to empty/zero results instead of raising.
    """
    # Defensive filtering: a non-iterable ``texts`` yields an empty corpus.
    try:
        valid = [t for t in texts if isinstance(t, str)] if texts is not None else []
    except Exception:
        valid = []

    n_docs = len(valid)

    try:
        seen = set()
        n_exact_dup = 0
        for doc in valid:
            # split()/join already removes leading and trailing whitespace,
            # so no extra strip() is needed.
            norm = " ".join(doc.split()).lower()
            # "surrogatepass" lets strings holding lone surrogates be hashed.
            # With the strict default a single such document raises
            # UnicodeEncodeError and the handler below would zero the counts
            # for the entire corpus.
            digest = hashlib.sha1(norm.encode("utf-8", "surrogatepass")).hexdigest()
            if digest in seen:
                n_exact_dup += 1
            else:
                seen.add(digest)
        n_unique = len(seen)
    except Exception:
        n_exact_dup = 0
        n_unique = 0

    exact_dup_pct = round(n_exact_dup / n_docs * 100, 2) if n_docs > 0 else None

    # Near-duplicates: optional, degrades on its own.
    near_dup = _compute_near_dup(valid, near_threshold, sample_max)

    return {
        "n_docs": n_docs,
        "n_exact_dup": n_exact_dup,
        "exact_dup_pct": exact_dup_pct,
        "n_unique": n_unique,
        "near_dup": near_dup,
    }
|
||||
@@ -0,0 +1,77 @@
|
||||
"""Tests para compute_text_duplicates.
|
||||
|
||||
Importa el modulo hoja directamente (`datascience.compute_text_duplicates`)
|
||||
para no depender de que el paquete reexporte la funcion en su __init__.
|
||||
datasketch normalmente NO esta instalada en el venv, asi que near_dup
|
||||
degrada a available=False; los tests no requieren la libreria.
|
||||
"""
|
||||
|
||||
from datascience.compute_text_duplicates import compute_text_duplicates
|
||||
|
||||
|
||||
EXPECTED_KEYS = {"n_docs", "n_exact_dup", "exact_dup_pct", "n_unique", "near_dup"}
|
||||
|
||||
|
||||
def test_duplicados_exactos():
    """Three copies of one text plus two unique texts: n_exact_dup=2, pct>0."""
    repeated = "El gato come pescado"
    corpus = [
        repeated,
        repeated,
        "el GATO come pescado",  # identical once normalised
        "Un perro ladra",
        "La luna brilla",
    ]
    out = compute_text_duplicates(corpus)

    assert set(out.keys()) == EXPECTED_KEYS
    assert out["n_docs"] == 5
    # Three copies of the first text (two of them repeats) + two unique texts.
    assert out["n_exact_dup"] == 2
    assert out["n_unique"] == 3
    pct = out["exact_dup_pct"]
    assert pct is not None
    assert pct > 0
    # 2 / 5 * 100 = 40.0
    assert abs(pct - 40.0) < 1e-9
|
||||
|
||||
|
||||
def test_sin_duplicados():
    """Corpus without repeats: n_exact_dup=0 and n_unique equals n_docs."""
    corpus = [
        f"{ordinal} documento distinto"
        for ordinal in ("primero", "segundo", "tercero")
    ]
    out = compute_text_duplicates(corpus)

    assert out["n_docs"] == 3
    assert out["n_exact_dup"] == 0
    assert out["n_unique"] == 3
    assert abs(out["exact_dup_pct"] - 0.0) < 1e-9
|
||||
|
||||
|
||||
def test_vacio():
    """Empty corpus: n_docs is 0, exact_dup_pct is None, nothing is raised."""
    out = compute_text_duplicates([])

    assert set(out.keys()) == EXPECTED_KEYS
    for counter in ("n_docs", "n_exact_dup"):
        assert out[counter] == 0
    assert out["exact_dup_pct"] is None
    assert out["n_unique"] == 0
    assert out["near_dup"]["n_near_dup_docs"] == 0
|
||||
|
||||
|
||||
def test_near_dup_degrada():
    """near_dup exposes a boolean 'available' and never raises, even without datasketch."""
    corpus = ["uno dos tres cuatro", "uno dos tres cuatro cinco", "algo distinto"]
    near_info = compute_text_duplicates(corpus)["near_dup"]

    for field, expected_type in (("available", bool), ("n_near_dup_docs", int)):
        assert field in near_info
        assert isinstance(near_info[field], expected_type)

    # None and non-str entries are also tolerated without raising.
    with_noise = compute_text_duplicates(["hola", None, 123, "hola"])
    assert with_noise["n_docs"] == 2
    assert with_noise["n_exact_dup"] == 1
|
||||
@@ -0,0 +1,86 @@
|
||||
---
|
||||
id: compute_text_length_stats_py_datascience
|
||||
name: compute_text_length_stats
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_text_length_stats(texts, n_bins=20) -> dict"
|
||||
description: "Profiles the length distribution of a corpus of text documents for EDA: per-document characters, words (unicode \\w+ tokens) and sentences (segments split on .!?… with a minimum of 1 per non-empty doc), each summarized with mean/p50/p90/p99/min/max (nearest-rank percentiles), plus an equal-width histogram of per-document word counts. None and non-str items are discarded. Dict-no-throw: never raises. Stdlib only (re, math)."
|
||||
tags: [eda, datascience, text, nlp, length, statistics, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, math]
|
||||
example: |
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
result = compute_text_length_stats(["Hola mundo.", "Una frase mas larga aqui."], n_bins=5)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_basico"
|
||||
- "test_vacio"
|
||||
- "test_descarta_none"
|
||||
- "test_un_documento"
|
||||
test_file_path: "python/functions/datascience/compute_text_length_stats_test.py"
|
||||
file_path: "python/functions/datascience/compute_text_length_stats.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "List of text documents (str). None entries and any non-str items (ints, floats, etc.) are discarded before any computation. An empty string \"\" is kept (chars 0, words 0, sentences 0)."
|
||||
- name: n_bins
|
||||
desc: "Number of equal-width bins for the per-document word-count histogram. Default 20. When all docs have the same word count, there are <2 docs, or n_bins < 1, a single covering bin is returned instead."
|
||||
output: "Dict with keys n_docs (int), chars, words, sentences and word_hist. Each of the three axis sub-dicts has the exact keys mean (float, 2 decimals), p50, p90, p99, min, max (ints). When there are no valid documents, n_docs is 0, every axis statistic is None and word_hist is []. word_hist is a list of {lo: float, hi: float, count: int} bins; the sum of all bin counts equals n_docs."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
|
||||
compute_text_length_stats(
|
||||
[
|
||||
"Hola mundo.",
|
||||
"Una frase mas larga con varias palabras aqui.",
|
||||
"Esto. Tiene. Tres frases distintas!",
|
||||
],
|
||||
n_bins=5,
|
||||
)
|
||||
# {
|
||||
# "n_docs": 3,
|
||||
# "chars": {"mean": 30.33, "p50": 35, "p90": 45, "p99": 45, "min": 11, "max": 45},
|
||||
# "words": {"mean": 5.0, "p50": 5, "p90": 8, "p99": 8, "min": 2, "max": 8},
|
||||
# "sentences": {"mean": 1.67, "p50": 1, "p90": 3, "p99": 3, "min": 1, "max": 3},
|
||||
# "word_hist": [
|
||||
# {"lo": 2.0, "hi": 3.2, "count": 1},
|
||||
# {"lo": 3.2, "hi": 4.4, "count": 0},
|
||||
# {"lo": 4.4, "hi": 5.6, "count": 1},
|
||||
# {"lo": 5.6, "hi": 6.8, "count": 0},
|
||||
# {"lo": 6.8, "hi": 8.0, "count": 1},
|
||||
# ],
|
||||
# }
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala al perfilar una columna o corpus de texto libre en un EDA: cuando
|
||||
necesites saber lo largos que son los documentos (en caracteres, palabras y
|
||||
frases) y cómo se reparte esa longitud antes de tokenizar, vectorizar o decidir
|
||||
truncados/ventanas para un modelo. Pásale la lista de strings crudos de la
|
||||
columna; `None` y valores no-texto se descartan solos. Encaja en el grupo `eda`
|
||||
como bloque de longitud junto a `summarize_categorical`.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Función pura, solo stdlib (`re` y `math`). No usa numpy, pandas ni sklearn.
|
||||
- Percentiles por método **nearest-rank** (devuelven un valor real de la lista,
|
||||
no interpolan); por eso p50/p90/p99/min/max son enteros y `mean` es el único
|
||||
float (redondeado a 2 decimales).
|
||||
- El conteo de frases es una **aproximación** por puntuación (`.!?…`): un texto
|
||||
sin esa puntuación cuenta como 1 frase si no está vacío; abreviaturas o
|
||||
elipsis pueden inflar o reducir el conteo.
|
||||
- `word_hist` es equal-width entre min y max de palabras: con todos los docs
|
||||
del mismo tamaño, menos de 2 docs, o `n_bins < 1`, devuelve un único bin.
|
||||
- Dict-no-throw: ante input inesperado devuelve la forma vacía
|
||||
(`n_docs` 0, ejes `None`, `word_hist` []) en vez de lanzar.
|
||||
@@ -0,0 +1,168 @@
|
||||
"""Pure EDA helper: document length distribution for the `eda` group.
|
||||
|
||||
Given a list of text documents, computes the length distribution along three
|
||||
axes (characters, words and sentences) plus an equal-width histogram of the
|
||||
per-document word counts. Stdlib only (``re`` and ``math``), with percentiles
|
||||
computed by a hand-rolled nearest-rank method. No numpy, no sklearn.
|
||||
|
||||
The function is dict-no-throw: it never raises. On any unexpected input it
|
||||
degrades to the empty-shape result.
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
|
||||
_WORD_RE = re.compile(r"\w+", re.UNICODE)
|
||||
_SENT_RE = re.compile(r"[.!?…]+")
|
||||
|
||||
|
||||
def _empty_axis() -> dict:
|
||||
"""Return an axis sub-dict with every statistic set to ``None``."""
|
||||
return {"mean": None, "p50": None, "p90": None, "p99": None, "min": None, "max": None}
|
||||
|
||||
|
||||
def _pct(sorted_vals, q):
|
||||
"""Nearest-rank percentile of an already-sorted list.
|
||||
|
||||
Args:
|
||||
sorted_vals: List of numbers sorted ascending.
|
||||
q: Percentile in the 0..100 range.
|
||||
|
||||
Returns:
|
||||
The value at the nearest rank, or ``None`` for an empty list.
|
||||
"""
|
||||
n = len(sorted_vals)
|
||||
if n == 0:
|
||||
return None
|
||||
if q <= 0:
|
||||
return sorted_vals[0]
|
||||
rank = math.ceil(q / 100.0 * n)
|
||||
if rank < 1:
|
||||
rank = 1
|
||||
if rank > n:
|
||||
rank = n
|
||||
return sorted_vals[rank - 1]
|
||||
|
||||
|
||||
def _axis_stats(values) -> dict:
    """Summarize a list of integer counts as mean/p50/p90/p99/min/max.

    The mean is rounded to 2 decimals; the remaining statistics are cast to
    ``int``. An empty list produces the all-``None`` axis.
    """
    if not values:
        return _empty_axis()

    ordered = sorted(values)
    summary = {"mean": round(sum(ordered) / len(ordered), 2)}
    for label, q in (("p50", 50), ("p90", 90), ("p99", 99)):
        summary[label] = int(_pct(ordered, q))
    summary["min"] = int(ordered[0])
    summary["max"] = int(ordered[-1])
    return summary
|
||||
|
||||
|
||||
def _word_hist(word_counts, n_bins) -> list:
|
||||
"""Equal-width histogram of per-document word counts.
|
||||
|
||||
Builds ``n_bins`` bins between ``min`` and ``max`` of the word counts. When
|
||||
every document has the same number of words, there are fewer than 2
|
||||
documents, or ``n_bins`` is not at least 1, a single covering bin is
|
||||
returned. With no documents the result is ``[]``. The sum of bin ``count``
|
||||
always equals ``len(word_counts)``.
|
||||
"""
|
||||
if not word_counts:
|
||||
return []
|
||||
wmin = min(word_counts)
|
||||
wmax = max(word_counts)
|
||||
if wmax == wmin or len(word_counts) < 2 or n_bins < 1:
|
||||
return [{"lo": float(wmin), "hi": float(wmax), "count": len(word_counts)}]
|
||||
|
||||
width = (wmax - wmin) / n_bins
|
||||
bins = []
|
||||
for i in range(n_bins):
|
||||
lo = wmin + i * width
|
||||
hi = wmin + (i + 1) * width
|
||||
bins.append({"lo": float(lo), "hi": float(hi), "count": 0})
|
||||
# Pin the last upper edge to the real maximum to avoid float drift.
|
||||
bins[-1]["hi"] = float(wmax)
|
||||
|
||||
for wc in word_counts:
|
||||
if wc >= wmax:
|
||||
idx = n_bins - 1
|
||||
else:
|
||||
idx = int((wc - wmin) / width)
|
||||
if idx < 0:
|
||||
idx = 0
|
||||
elif idx >= n_bins:
|
||||
idx = n_bins - 1
|
||||
bins[idx]["count"] += 1
|
||||
return bins
|
||||
|
||||
|
||||
def compute_text_length_stats(texts, n_bins=20) -> dict:
    """Summarize the length distribution of a corpus of text documents.

    Three lengths are measured per document: characters (``len(doc)``), words
    (count of ``\\w+`` unicode tokens) and sentences (non-empty segments after
    splitting on ``.!?…``, with a minimum of 1 for any non-empty document).
    Each axis reports mean, p50, p90, p99, min and max, and an equal-width
    histogram of the per-document word counts is added.

    ``None`` entries and any non-``str`` items in ``texts`` are discarded.
    The function never raises: on ``None``/empty input or any internal error
    it returns the empty-shape result (``n_docs`` 0, all-``None`` axes, ``[]``
    histogram).

    Args:
        texts: Iterable of text documents (``str``). ``None`` and non-``str``
            items are dropped.
        n_bins: Number of equal-width bins for the word-count histogram.
            Default 20.

    Returns:
        Dict with keys ``n_docs``, ``chars``, ``words``, ``sentences`` and
        ``word_hist``. Each of the three axes is a sub-dict with ``mean``
        (float, 2 decimals), ``p50``, ``p90``, ``p99``, ``min`` and ``max``
        (ints), all ``None`` when there are no documents. ``word_hist`` is a
        list of ``{lo, hi, count}`` bins whose ``count`` sums to ``n_docs``.
    """
    fallback = {
        "n_docs": 0,
        "chars": _empty_axis(),
        "words": _empty_axis(),
        "sentences": _empty_axis(),
        "word_hist": [],
    }
    try:
        # Reject only a missing corpus explicitly. A truthiness test
        # (``not texts``) can raise "ambiguous truth value" for array-like
        # containers such as multi-element numpy arrays or pandas Series, and
        # the blanket handler below would silently report an empty corpus.
        if texts is None:
            return fallback

        docs = [t for t in texts if isinstance(t, str)]
        n_docs = len(docs)
        if n_docs == 0:
            return fallback

        char_counts = [len(d) for d in docs]
        word_counts = [len(_WORD_RE.findall(d)) for d in docs]

        sent_counts = []
        for d in docs:
            n = sum(1 for s in _SENT_RE.split(d) if s.strip())
            if d and n == 0:
                # Non-empty document with no detectable sentence: count as 1.
                n = 1
            sent_counts.append(n)

        return {
            "n_docs": n_docs,
            "chars": _axis_stats(char_counts),
            "words": _axis_stats(word_counts),
            "sentences": _axis_stats(sent_counts),
            "word_hist": _word_hist(word_counts, n_bins),
        }
    except Exception:
        # Dict-no-throw contract: any unexpected failure degrades to the empty shape.
        return fallback
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Tests para compute_text_length_stats.
|
||||
|
||||
Inserta `python/functions` en sys.path (relativo a este archivo) para importar
|
||||
el modulo hoja por su paquete `datascience`, sin depender de que el paquete lo
|
||||
reexporte en su __init__.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from datascience.compute_text_length_stats import compute_text_length_stats
|
||||
|
||||
|
||||
def test_basico():
    """Texts of different lengths produce coherent statistics and histogram."""
    corpus = [
        "Hola mundo.",  # 2 words, 1 sentence
        "Una frase mas larga con varias palabras aqui.",  # 8 words, 1 sentence
        "Corto.",  # 1 word, 1 sentence
        "Esto. Tiene. Tres frases distintas!",  # 5 words, 3 sentences
    ]
    stats = compute_text_length_stats(corpus)
    word_axis = stats["words"]
    histogram = stats["word_hist"]

    assert stats["n_docs"] == 4
    # Different word lengths -> max strictly greater than min.
    assert word_axis["max"] > word_axis["min"]
    # The word histogram is not empty.
    assert histogram != []
    # The histogram counts cover every document.
    assert sum(bucket["count"] for bucket in histogram) == stats["n_docs"]
    # mean is a rounded float; min/max are integers.
    assert isinstance(word_axis["mean"], float)
    assert isinstance(word_axis["min"], int)
    assert isinstance(word_axis["max"], int)
    # The three-sentence document pushes the sentences max to >= 3.
    assert stats["sentences"]["max"] >= 3
|
||||
|
||||
|
||||
def test_vacio():
    """Empty list: n_docs is 0, every axis statistic is None, word_hist is []."""
    stats = compute_text_length_stats([])
    stat_names = ("mean", "p50", "p90", "p99", "min", "max")

    assert stats["n_docs"] == 0
    for axis_name in ("chars", "words", "sentences"):
        axis = stats[axis_name]
        assert all(axis[name] is None for name in stat_names)
    assert stats["word_hist"] == []
|
||||
|
||||
|
||||
def test_descarta_none():
    """None and non-str values are dropped from the computation."""
    noisy_corpus = ["hello world", None, 123, 4.5, "foo bar baz"]
    stats = compute_text_length_stats(noisy_corpus)
    word_axis = stats["words"]

    # Only two valid strings remain.
    assert stats["n_docs"] == 2
    assert word_axis["min"] == 2  # "hello world"
    assert word_axis["max"] == 3  # "foo bar baz"
    assert sum(bucket["count"] for bucket in stats["word_hist"]) == 2
|
||||
|
||||
|
||||
def test_un_documento():
    """A single document yields exactly one histogram bin with count 1."""
    stats = compute_text_length_stats(["solo un documento aqui"])
    histogram = stats["word_hist"]

    assert stats["n_docs"] == 1
    assert len(histogram) == 1
    assert histogram[0]["count"] == 1
    # With one document, min == max == p50 == its word count (4).
    for stat_name in ("min", "max", "p50"):
        assert stats["words"][stat_name] == 4
|
||||
@@ -0,0 +1,88 @@
|
||||
---
|
||||
id: compute_text_readability_py_datascience
|
||||
name: compute_text_readability
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_text_readability(texts, sample_max=500) -> dict"
|
||||
description: "Calcula la legibilidad Flesch Reading Ease de un corpus de texto usando textstat con import perezoso y degradación. Filtra None/no-str/vacíos, muestrea hasta sample_max documentos (los primeros) y agrega los scores Flesch en {mean, p50, min, max}. Si textstat no está instalada devuelve available=False sin lanzar. Estilo dict-no-throw del grupo eda — nunca lanza."
|
||||
tags: [eda, datascience, text, nlp, readability, flesch, textstat, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [math, textstat]
|
||||
example: |
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
out = compute_text_readability(["The cat sat on the mat. It was warm and sunny."])
|
||||
# {"available": True, "n_scored": 1, "flesch": {"mean": 109.0, "p50": 109.0, "min": 108.96..., "max": 108.96...}}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_prosa_ingles"
|
||||
- "test_vacio"
|
||||
- "test_degradacion"
|
||||
test_file_path: "python/functions/datascience/compute_text_readability_test.py"
|
||||
file_path: "python/functions/datascience/compute_text_readability.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista de str (documentos del corpus). Los elementos None, no-str o vacíos tras strip() se descartan silenciosamente. El orden se respeta: el muestreo toma los primeros documentos válidos."
|
||||
- name: sample_max
|
||||
desc: "Número máximo de documentos válidos a puntuar (los primeros). Default 500. Acota el coste en corpus grandes. Valores no convertibles a int caen a 500; negativos se tratan como 0."
|
||||
output: "Dict con exactamente 3 claves siempre presentes: available (bool: True si textstat se pudo importar), n_scored (int: nº de documentos efectivamente puntuados), flesch (dict con mean, p50, min, max). mean y p50 redondeados a 1 decimal; p50 por nearest-rank sobre los scores ordenados; min/max son los scores extremos sin redondear. Todos los valores de flesch son None cuando n_scored es 0. La función nunca lanza: cualquier excepción global (incluida ImportError de textstat) degrada a available=False, n_scored=0 y flesch todo None."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
|
||||
textos = [
|
||||
"The cat sat on the mat. It was a warm and sunny day in the park.",
|
||||
"Reading is a wonderful habit. Books open doors to new worlds and ideas.",
|
||||
"He ran quickly to the store to buy some fresh bread and a bottle of milk.",
|
||||
]
|
||||
|
||||
compute_text_readability(textos)
|
||||
# {
|
||||
# "available": True,
|
||||
# "n_scored": 3,
|
||||
# "flesch": {"mean": 91.4, "p50": 95.4, "min": 70.08..., "max": 108.83...}
|
||||
# }
|
||||
|
||||
# Corpus vacío (textstat presente): available True pero nada que puntuar.
|
||||
compute_text_readability([])
|
||||
# {"available": True, "n_scored": 0,
|
||||
# "flesch": {"mean": None, "p50": None, "min": None, "max": None}}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en un EDA de texto cuando necesites una métrica única y comparable de
|
||||
**lo fácil que es de leer** un corpus de documentos (descripciones, reviews,
|
||||
artículos, tickets). Devuelve el resumen Flesch Reading Ease agregado
|
||||
(`mean`/`p50`/`min`/`max`) listo para un report o un bloque del notebook, sin
|
||||
tener que iterar `textstat` a mano. Pásale la lista de textos crudos y, si el
|
||||
corpus es grande, limita el coste con `sample_max`. El estilo dict-no-throw
|
||||
permite incrustarla en pipelines del grupo `eda` sin envolver en try/except.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **`textstat` es una dependencia opcional.** Si no está instalada (o falla al
|
||||
importar) la función NO lanza: devuelve `available=False`, `n_scored=0` y
|
||||
`flesch` todo `None`. Comprueba `available` antes de interpretar los números.
|
||||
- **Flesch Reading Ease está pensado para prosa en inglés.** Aplicado a otros
|
||||
idiomas o a texto no-prosa (código, listas, tablas, cadenas muy cortas) los
|
||||
scores no son interpretables, aunque se calculen sin error.
|
||||
- **Escala Flesch:** valores **altos** = más fácil de leer (≈90–100 muy fácil),
|
||||
valores **bajos** = más difícil (puede ser negativo en texto muy denso). No
|
||||
se recortan a ningún rango: se reportan tal cual los devuelve `textstat`.
|
||||
- **`available=True` con `n_scored=0`** significa que `textstat` está presente
|
||||
pero el corpus no aportó documentos puntuables (vacío, solo None/no-str, o
|
||||
todos los docs fallaron al puntuar). Es distinto de `available=False`.
|
||||
- **Muestreo = los primeros `sample_max`**, no aleatorio. Si el orden del corpus
|
||||
está sesgado, el resumen reflejará ese sesgo.
|
||||
- **`mean` y `p50` redondean a 1 decimal**; `min`/`max` se devuelven sin
|
||||
redondear (los scores extremos reales).
|
||||
@@ -0,0 +1,121 @@
|
||||
"""Legibilidad Flesch Reading Ease de un corpus de texto.
|
||||
|
||||
Función pura del grupo `eda`, estilo dict-no-throw: nunca lanza. Usa la
|
||||
librería `textstat` con import perezoso y degradación: si `textstat` no está
|
||||
instalada (o falla al importar), devuelve un resultado con `available=False`
|
||||
en lugar de propagar el error.
|
||||
"""
|
||||
|
||||
|
||||
import math


def _percentile_nearest_rank(sorted_values, pct):
    """Return the nearest-rank percentile of an ascending-sorted list.

    The 1-based rank is ``ceil(pct / 100 * n)`` clamped to ``[1, n]``.
    Returns ``None`` when the list is empty.
    """
    n = len(sorted_values)
    if n == 0:
        return None
    rank = math.ceil((pct / 100.0) * n)
    rank = max(1, min(rank, n))
    return sorted_values[rank - 1]


def _readability_result(available, n_scored=0, mean=None, p50=None, lo=None, hi=None):
    """Assemble a fresh result dict in the fixed shape returned to callers."""
    return {
        "available": available,
        "n_scored": n_scored,
        "flesch": {"mean": mean, "p50": p50, "min": lo, "max": hi},
    }


def compute_text_readability(texts, sample_max=500) -> dict:
    """Compute the Flesch Reading Ease summary of a corpus.

    Args:
        texts: Iterable of ``str``. ``None``, non-``str`` and blank (after
            ``strip()``) items are discarded. The first ``sample_max`` valid
            documents are scored.
        sample_max: Maximum number of documents to score (the first ones).
            Values not convertible to ``int`` fall back to 500; zero or
            negative values score no documents.

    Returns:
        Dict with the exact shape::

            {"available": bool, "n_scored": int,
             "flesch": {"mean": float|None, "p50": float|None,
                        "min": float|None, "max": float|None}}

        ``available`` is True when ``textstat`` could be imported. ``mean``
        and ``p50`` are rounded to 1 decimal; ``min``/``max`` are raw scores.
        The function never raises: any unexpected exception degrades to
        ``available=False`` with ``n_scored`` 0 and an all-``None`` summary.
    """
    try:
        # Lazy import with degradation: textstat is an optional dependency.
        try:
            import textstat
        except Exception:
            return _readability_result(available=False)

        try:
            limit = int(sample_max)
        except Exception:
            limit = 500

        # Filter and sample the first ``limit`` valid documents. The limit is
        # checked before collecting anything so that a limit of zero (or a
        # negative one) really scores no document.
        docs = []
        if texts is not None and limit > 0:
            for item in texts:
                if not isinstance(item, str) or item.strip() == "":
                    continue
                docs.append(item)
                if len(docs) >= limit:
                    break

        scores = []
        for doc in docs:
            try:
                score = float(textstat.flesch_reading_ease(doc))
            except Exception:
                # A document that cannot be scored is skipped, not fatal.
                continue
            # NaN/inf would poison the mean and the ordering; treat them as
            # documents that failed to score.
            if math.isfinite(score):
                scores.append(score)

        n_scored = len(scores)
        if n_scored == 0:
            # textstat is present but nothing could be scored.
            return _readability_result(available=True)

        ordered = sorted(scores)
        return _readability_result(
            available=True,
            n_scored=n_scored,
            mean=round(sum(scores) / n_scored, 1),
            p50=round(_percentile_nearest_rank(ordered, 50), 1),
            lo=ordered[0],
            hi=ordered[-1],
        )
    except Exception:
        return _readability_result(available=False)
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Tests para compute_text_readability."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import builtins
|
||||
|
||||
# Make the parent of this file's directory importable (`python/functions` for
# this test file) so the `datascience` package resolves; two levels up would
# point above the package root.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from datascience.compute_text_readability import compute_text_readability
|
||||
|
||||
|
||||
EXPECTED_KEYS = {"available", "n_scored", "flesch"}
|
||||
FLESCH_KEYS = {"mean", "p50", "min", "max"}
|
||||
|
||||
|
||||
def test_prosa_ingles():
    """English prose: available is True, n_scored > 0 and the summary is coherent."""
    texts = [
        "The cat sat on the mat. It was a warm and sunny day in the park.",
        "She sells sea shells by the sea shore. The shells she sells are surely sea shells.",
        "Reading is a wonderful habit. Books open doors to new worlds and ideas.",
        "He ran quickly to the store to buy some fresh bread and a bottle of milk.",
    ]
    out = compute_text_readability(texts)

    assert set(out.keys()) == EXPECTED_KEYS
    assert out["available"] is True
    assert out["n_scored"] > 0

    flesch = out["flesch"]
    assert set(flesch.keys()) == FLESCH_KEYS
    for key in ("mean", "p50", "min", "max"):
        assert flesch[key] is not None

    # Coherence: min <= mean/p50 <= max. mean and p50 are rounded to 1 decimal
    # while min/max are raw scores, so allow half a rounding step of slack.
    assert flesch["min"] <= flesch["max"]
    tolerance = 0.051
    for key in ("mean", "p50"):
        assert flesch["min"] - tolerance <= flesch[key] <= flesch["max"] + tolerance
|
||||
|
||||
|
||||
def test_vacio():
    """Empty corpus with textstat present: available True, n_scored 0, summary all None."""
    empty_out = compute_text_readability([])

    assert set(empty_out.keys()) == EXPECTED_KEYS
    assert empty_out["available"] is True
    assert empty_out["n_scored"] == 0
    for stat_name in ("mean", "p50", "min", "max"):
        assert empty_out["flesch"][stat_name] is None

    # Non-str and blank items are discarded as well -> n_scored 0.
    junk_out = compute_text_readability([None, "", " ", 123])
    assert junk_out["available"] is True
    assert junk_out["n_scored"] == 0
|
||||
|
||||
|
||||
def test_degradacion(monkeypatch):
    """With textstat unimportable (forced ImportError) the result degrades without raising."""
    import datascience.compute_text_readability as module_under_test

    original_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        # Only the textstat package (and its submodules) is made unimportable.
        is_textstat = name == "textstat" or name.startswith("textstat.")
        if is_textstat:
            raise ImportError("simulado")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", blocking_import)
    out = module_under_test.compute_text_readability(
        ["The cat sat on the mat. It was happy and warm."]
    )

    assert out["available"] is False
    assert out["n_scored"] == 0
    for stat_name in ("mean", "p50", "min", "max"):
        assert out["flesch"][stat_name] is None
|
||||
@@ -0,0 +1,103 @@
|
||||
---
|
||||
id: compute_top_ngrams_py_datascience
|
||||
name: compute_top_ngrams
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict"
|
||||
description: "Calcula los n-gramas de palabras más frecuentes de un corpus de texto (n=1 unigramas, 2 bigramas, 3 trigramas...). Tokeniza a minúsculas con re.findall(r'\\w+', ...), descarta tokens numéricos y, si remove_stopwords=True, elimina stopwords ES+EN ANTES de formar los n-gramas (n-gramas contiguos sobre la secuencia de tokens de contenido, sin cruzar documentos). Pura y autocontenida con collections.Counter, sin sklearn. Estilo dict-no-throw del grupo eda: nunca lanza."
|
||||
tags: [eda, datascience, text, nlp, ngrams, bigrams, trigrams, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, collections]
|
||||
example: |
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
texts = ["machine learning rocks", "we love machine learning"]
|
||||
compute_top_ngrams(texts, n=2, top_k=5)
|
||||
# {"n": 2, "top": [{"ngram": "machine learning", "count": 2}, ...]}
|
||||
tested: true
|
||||
tests:
|
||||
- "test_bigramas"
|
||||
- "test_trigramas"
|
||||
- "test_vacio"
|
||||
- "test_stopwords"
|
||||
test_file_path: "python/functions/datascience/compute_top_ngrams_test.py"
|
||||
file_path: "python/functions/datascience/compute_top_ngrams.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista (o tupla) de cadenas. Los elementos None o que no sean str se descartan silenciosamente. Cada documento se tokeniza por separado; los n-gramas no cruzan la frontera entre documentos."
|
||||
- name: n
|
||||
desc: "Tamaño del n-grama: 1 unigramas, 2 bigramas, 3 trigramas, etc. Valores < 1 o no enteros producen top vacío (se conserva tal cual en la clave 'n' del retorno)."
|
||||
- name: top_k
|
||||
desc: "Número máximo de n-gramas a devolver, ordenados por frecuencia descendente con desempate alfabético determinista. Default 15. Valores negativos se tratan como 0."
|
||||
- name: remove_stopwords
|
||||
desc: "Si True (default) elimina las stopwords ES+EN de una lista inline (~130 términos de altísima frecuencia) ANTES de formar los n-gramas, de modo que los n-gramas se construyen sobre la secuencia de tokens de contenido."
|
||||
output: "Dict con exactamente 2 claves: n (el n recibido, sin normalizar) y top (lista de dicts {'ngram': str, 'count': int} ordenada por count descendente, longitud <= top_k). ngram es la unión de los tokens del n-grama por un espacio. Corpus vacío, tokens insuficientes para formar n-gramas o cualquier excepción interna degradan a {'n': n, 'top': []}. La función nunca lanza."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
|
||||
texts = [
|
||||
"machine learning rocks",
|
||||
"machine learning is fun",
|
||||
"we love machine learning",
|
||||
]
|
||||
|
||||
# Bigramas (n=2): "machine learning" aparece en los 3 documentos.
|
||||
compute_top_ngrams(texts, n=2, top_k=5)
|
||||
# {
|
||||
# "n": 2,
|
||||
# "top": [
|
||||
# {"ngram": "machine learning", "count": 3},
|
||||
# {"ngram": "learning fun", "count": 1},
|
||||
# {"ngram": "learning rocks", "count": 1},
|
||||
# {"ngram": "love machine", "count": 1},
|
||||
# ],
|
||||
# }
|
||||
|
||||
# Unigramas con stopwords fuera (default): solo palabras de contenido.
|
||||
compute_top_ngrams(["the cat sat on the mat"], n=1, top_k=3)
|
||||
# {"n": 1, "top": [{"ngram": "cat", "count": 1},
|
||||
# {"ngram": "mat", "count": 1},
|
||||
# {"ngram": "sat", "count": 1}]}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en la fase de EDA de texto cuando, además del vocabulario suelto, necesites
|
||||
ver qué **combinaciones de palabras contiguas** dominan un corpus: colocaciones,
|
||||
frases técnicas recurrentes ("machine learning", "data analyst"), o patrones de
|
||||
trigramas en titulares/descripciones. Es el complemento natural de un perfil de
|
||||
vocabulario: pasa de "qué palabras aparecen" a "qué secuencias aparecen". Llámala
|
||||
con `n=1` para unigramas, `n=2` para bigramas y `n=3` para trigramas, y ajusta
|
||||
`top_k` al tamaño de la tabla que vas a renderizar. Deja `remove_stopwords=True`
|
||||
para que los n-gramas reflejen contenido y no conectores gramaticales.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Las stopwords se eliminan ANTES de formar los n-gramas.** Con
|
||||
`remove_stopwords=True` la frase "data of analysis" produce el bigrama
|
||||
"data analysis" (el "of" intermedio desaparece y los tokens de contenido se
|
||||
vuelven contiguos), no "data of" ni "of analysis". Si quieres preservar la
|
||||
adyacencia literal del texto original, pasa `remove_stopwords=False`.
|
||||
- **Los n-gramas NO cruzan documentos.** Cada elemento de `texts` se tokeniza y
|
||||
recorre por separado; el último token de un documento nunca se combina con el
|
||||
primero del siguiente.
|
||||
- **Tokens puramente numéricos se descartan** (`tok.isdigit()`), pero los
|
||||
alfanuméricos mixtos no: "3d" o "covid19" sí cuentan como tokens. Un decimal
|
||||
como "3.5" se parte en "3" y "5" por `\w+` y ambos se descartan por numéricos.
|
||||
- **La lista de stopwords es inline ES+EN**, pensada para textos generales en
|
||||
esos dos idiomas. Para otros idiomas o jerga específica de dominio puede dejar
|
||||
pasar conectores; en ese caso filtra el corpus aguas arriba o usa
|
||||
`remove_stopwords=False` y posfiltra.
|
||||
- **`top` puede tener menos de `top_k` elementos** si el corpus no tiene tantos
|
||||
n-gramas distintos. A igual frecuencia, el desempate es alfabético (determinista),
|
||||
no por orden de aparición.
|
||||
@@ -0,0 +1,94 @@
|
||||
"""Top n-gramas de palabras más frecuentes de un corpus de texto.
|
||||
|
||||
Función pura, autocontenida (solo stdlib: re + collections.Counter). No depende
|
||||
de scikit-learn ni de ninguna otra librería externa. Estilo dict-no-throw del
|
||||
grupo `eda`: ante cualquier entrada degenerada o excepción interna devuelve
|
||||
``{"n": n, "top": []}`` en vez de lanzar.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
# Inline Spanish + English stopword list (roughly 150 very high-frequency terms).
# Stopwords are removed BEFORE n-grams are formed: n-grams are built over the
# sequence of content tokens, not over the original text.
# Accented and unaccented spellings are listed separately because tokens are
# only lowercased, never accent-stripped.
_STOPWORDS = frozenset({
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "más",
    "mas", "pero", "sus", "le", "ya", "o", "este", "sí", "si", "porque", "esta",
    "entre", "cuando", "muy", "sin", "sobre", "también", "tambien", "me",
    "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes",
    "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas",
    "algo", "nosotros",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "are", "was", "be", "this", "that", "by", "an", "or", "at", "from", "but",
    "not", "have", "has", "had", "they", "you", "we", "he", "she", "his",
    "her", "their", "its", "i", "my", "me", "our", "us", "do", "does", "did",
    "will", "would", "can", "could", "should", "there", "which", "who", "what",
    "when", "where", "how", "all", "if", "so", "than", "then", "out", "up",
})
|
||||
|
||||
|
||||
def compute_top_ngrams(texts, n=2, top_k=15, remove_stopwords=True) -> dict:
    """Return the most frequent word n-grams of a text corpus.

    Each document is lowercased and tokenised with ``\\w+``; purely numeric
    tokens are dropped. N-grams are built per document, so they never span
    two documents.

    Args:
        texts: Iterable of documents (list, tuple, generator, ...). Items that
            are not ``str`` (including ``None``) are skipped silently.
            ``None`` or a bare ``str``/``bytes`` value is not a corpus and
            yields an empty ``top``.
        n: N-gram size (1 = unigrams, 2 = bigrams, 3 = trigrams, ...).
            Non-integer values, booleans and values below 1 yield an empty
            ``top``.
        top_k: Maximum number of n-grams to return. Coerced with ``int()``;
            values that cannot be coerced, or negative values, count as 0.
        remove_stopwords: When true, the ES+EN stopwords are removed BEFORE
            the n-grams are formed, so n-grams are built over the sequence
            of content tokens.

    Returns:
        ``{"n": n, "top": [{"ngram": "w1 w2", "count": int}, ...]}`` sorted by
        descending count, ties broken alphabetically. An empty corpus, too
        few tokens or any internal exception degrades to
        ``{"n": n, "top": []}``. Never raises.
    """
    try:
        # bool is a subclass of int; True/False are not meaningful sizes.
        if isinstance(n, bool) or not isinstance(n, int) or n < 1:
            return {"n": n, "top": []}

        try:
            limit = max(int(top_k), 0)
        except (TypeError, ValueError, OverflowError):
            limit = 0

        # A bare string would be walked character by character, so it is
        # rejected explicitly; every other iterable is accepted.
        if texts is None or isinstance(texts, (str, bytes)):
            return {"n": n, "top": []}

        counter = Counter()
        for doc in texts:
            if not isinstance(doc, str):
                continue
            tokens = [
                tok
                for tok in re.findall(r"\w+", doc.lower(), re.UNICODE)
                if not tok.isdigit()
            ]
            if remove_stopwords:
                tokens = [tok for tok in tokens if tok not in _STOPWORDS]
            # The range is empty when the document has fewer than n tokens.
            counter.update(
                " ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
            )

        # Negated count first, then the n-gram text: deterministic ordering.
        ordered = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))
        top = [{"ngram": ngram, "count": count} for ngram, count in ordered[:limit]]
        return {"n": n, "top": top}
    except Exception:
        # Dict-no-throw contract: any failure degrades to the empty result.
        return {"n": n, "top": []}
|
||||
@@ -0,0 +1,65 @@
|
||||
"""Tests para compute_top_ngrams."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# sys.path estándar: añade `python/functions/` para importar por paquete raíz.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from datascience.compute_top_ngrams import compute_top_ngrams
|
||||
|
||||
|
||||
def test_bigramas():
    """The bigram present in every document must rank first with count 3."""
    corpus = [
        "machine learning rocks",
        "machine learning is fun",
        "we love machine learning",
    ]
    outcome = compute_top_ngrams(corpus, n=2, top_k=5)
    ranking = outcome["top"]

    assert outcome["n"] == 2
    assert ranking, "esperaba al menos un bigrama"

    leader = ranking[0]
    assert leader["ngram"] == "machine learning"
    assert leader["count"] == 3

    # Every entry honours the {"ngram": str, "count": int} contract.
    for entry in ranking:
        assert isinstance(entry["ngram"], str)
        assert isinstance(entry["count"], int)
|
||||
|
||||
|
||||
def test_trigramas():
    """Trigram counts: one shared by both documents, one unique to each."""
    corpus = [
        "alpha beta gamma delta",
        "alpha beta gamma omega",
    ]
    # Stopword filtering is switched off so that no token is discarded.
    outcome = compute_top_ngrams(corpus, n=3, top_k=5, remove_stopwords=False)
    assert outcome["n"] == 3

    tally = {}
    for entry in outcome["top"]:
        tally[entry["ngram"]] = entry["count"]

    # Shared by both documents.
    assert tally.get("alpha beta gamma") == 2
    # Unique to a single document each.
    assert tally.get("beta gamma delta") == 1
    assert tally.get("beta gamma omega") == 1
|
||||
|
||||
|
||||
def test_vacio():
    """Empty or effectively empty corpora degrade to an empty ranking."""
    expected = {"n": 2, "top": []}
    assert compute_top_ngrams([], n=2) == expected
    # Non-str / None documents are skipped, leaving nothing to count.
    assert compute_top_ngrams([None, 123, {"a": 1}], n=2) == expected
|
||||
|
||||
|
||||
def test_stopwords():
    """Filtering stopwords removes "the cat" ("the" is an English stopword)."""
    corpus = ["the cat the cat the cat"]

    def bigram_set(drop_stopwords):
        outcome = compute_top_ngrams(
            corpus, n=2, top_k=10, remove_stopwords=drop_stopwords
        )
        return {entry["ngram"] for entry in outcome["top"]}

    filtered = bigram_set(True)
    unfiltered = bigram_set(False)

    # Without filtering the dominant bigram is "the cat".
    assert "the cat" in unfiltered
    # With filtering only "cat cat" remains.
    assert "the cat" not in filtered
    assert filtered != unfiltered
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
id: compute_vocabulary_stats_py_datascience
|
||||
name: compute_vocabulary_stats
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def compute_vocabulary_stats(texts: list, top_k: int = 20, remove_stopwords: bool = True) -> dict"
|
||||
description: "Profiles the vocabulary of a text corpus for EDA: tokenises a list of documents, counts term frequencies and derives lexical-richness measures — total tokens, unique types, type-token ratio (TTR), hapax legomena and the top-k most frequent terms. Pure, stdlib only (re + collections.Counter); no nltk, no sklearn. Inline ES+EN stopword list, opt-out via remove_stopwords. Never raises: empty/degenerate input returns the zeroed result."
|
||||
tags: [eda, datascience, text, nlp, vocabulary, ttr, hapax, pure, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [re, collections]
|
||||
example: |
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
result = compute_vocabulary_stats(["el gato y el perro", "gato veloz"], top_k=5)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_basico"
|
||||
- "test_vacio"
|
||||
- "test_stopwords_quitadas"
|
||||
- "test_stopwords_conservadas"
|
||||
test_file_path: "python/functions/datascience/compute_vocabulary_stats_test.py"
|
||||
file_path: "python/functions/datascience/compute_vocabulary_stats.py"
|
||||
params:
|
||||
- name: texts
|
||||
desc: "List of documents (strings) forming the corpus. Entries that are None or not a str are silently discarded. Tokens are extracted per document with re.findall(r'\\w+', doc.lower(), re.UNICODE); purely numeric tokens (tok.isdigit()) are dropped."
|
||||
- name: top_k
|
||||
desc: "Maximum number of most-frequent terms to return in top_terms. Default 20. Does not affect n_tokens/n_types/ttr/hapax — only the length of the top_terms list."
|
||||
- name: remove_stopwords
|
||||
desc: "When True (default) common Spanish+English stopwords from the inline _STOPWORDS set (~150 entries) are removed from the token stream before any counting. Set False to keep every word (raw lexical profile)."
|
||||
output: "Dict with the exact keys n_tokens (int), n_types (int), ttr (float|None, n_types/n_tokens rounded to 4 dp), n_hapax (int, terms occurring exactly once), hapax_pct (float|None, n_hapax/n_types*100 rounded to 2 dp) and top_terms (list of {term, count, pct} sorted by count descending, pct = count/n_tokens*100 rounded to 2 dp). For an empty corpus (no tokens after filtering): n_tokens=0, n_types=0, ttr=None, n_hapax=0, hapax_pct=None, top_terms=[]. Any exception degrades to that same empty result — the function never throws."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
|
||||
compute_vocabulary_stats(
|
||||
["el gato y el perro", "gato veloz corre", "perro perro perro"],
|
||||
top_k=5,
|
||||
)
|
||||
# {
#     "n_tokens": 8,        # stopwords (el, y) eliminadas por defecto
#     "n_types": 4,         # gato, perro, veloz, corre -> tras quitar stopwords
#     "ttr": 0.5,           # n_types / n_tokens
#     "n_hapax": 2,         # veloz, corre (1 aparicion cada uno)
#     "hapax_pct": 50.0,    # n_hapax / n_types * 100
#     "top_terms": [
#         {"term": "perro", "count": 4, "pct": 50.0},
#         {"term": "gato", "count": 2, "pct": 25.0},
#         ...
#     ],
# }
|
||||
|
||||
# Perfil lexico crudo (sin filtrar stopwords):
|
||||
compute_vocabulary_stats(["the cat and the dog"], remove_stopwords=False)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala al perfilar una columna o corpus de texto libre en un EDA del grupo `eda`:
|
||||
cuando necesites medir la riqueza léxica (cuántos tokens y cuántas palabras
|
||||
distintas, type-token ratio, porcentaje de palabras que solo aparecen una vez) y
|
||||
ver qué términos dominan el vocabulario (top-k frecuencias). Pásale la lista de
|
||||
documentos crudos (filas de la columna); `None` y valores no-string se ignoran
|
||||
solos. Es el equivalente para texto largo de `summarize_categorical`, que perfila
|
||||
categorías cortas.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Función pura y stdlib-only, pero el resultado depende del **idioma**: la lista
|
||||
`_STOPWORDS` cubre español e inglés. Para otros idiomas pon
|
||||
`remove_stopwords=False` o filtra fuera, o el perfil mezclará stopwords no
|
||||
reconocidas en `top_terms`.
|
||||
- La tokenización es `\w+` con `re.UNICODE`: separa por puntuación y conserva
|
||||
acentos/ñ, pero NO hace stemming ni lematización — "gato" y "gatos" cuentan
|
||||
como tipos distintos. Tampoco hace stripping de acentos, así que "más" (con
|
||||
tilde) y "mas" son tokens diferentes (ambos están en la stoplist).
|
||||
- Los tokens **puramente numéricos** (`"123"`) se descartan siempre; un token
|
||||
alfanumérico mixto (`"covid19"`) se conserva.
|
||||
- `ttr` baja artificialmente en corpus grandes (más texto, más repetición): no
|
||||
compares TTR entre corpus de tamaños muy distintos sin normalizar.
|
||||
- Nunca lanza: entrada vacía, `None`, o cualquier excepción interna devuelven el
|
||||
resultado con ceros/`None`/`[]`. Comprueba `n_tokens == 0` para detectar el
|
||||
caso degenerado.
|
||||
@@ -0,0 +1,99 @@
|
||||
"""Profile the vocabulary of a text corpus for EDA (pure, stdlib only).
|
||||
|
||||
Tokenises a list of documents, counts term frequencies and derives lexical
|
||||
richness measures (type-token ratio, hapax legomena) plus the top-k terms.
|
||||
No external NLP dependencies (no nltk, no sklearn) — only ``re`` and
|
||||
``collections`` from the standard library.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
# Common Spanish + English stopwords. Inline and lowercase. Tokens are never
# accent-stripped, so accented and unaccented spellings ("más"/"mas",
# "qué"/"que", "él"/"el") are listed separately.
# Filtering is on by default; callers opt out with remove_stopwords=False.
_STOPWORDS = frozenset({
    # Spanish
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por",
    "un", "para", "con", "no", "una", "su", "al", "es", "lo", "como", "mas",
    "más", "pero", "sus", "le", "ya", "o", "este", "si", "sí", "porque",
    "esta", "entre", "cuando", "muy", "sin", "sobre", "tambien", "también",
    "me", "hasta", "hay", "donde", "quien", "desde", "todo", "nos", "durante",
    "todos", "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante",
    "ellos", "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo",
    "otro", "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "nada",
    "muchos",
    # English
    "the", "of", "and", "to", "in", "is", "it", "for", "on", "with", "as",
    "was", "but", "are", "this", "that", "an", "be", "by", "or", "not", "at",
    "from", "my", "i", "you", "he", "she", "we", "they", "his", "her", "its",
    "our", "their", "what", "which", "who", "whom", "has", "have", "had", "do",
    "does", "did", "will", "would", "can", "could", "should", "may", "might",
    "must", "if", "then", "than", "so", "too", "very", "just", "also", "were",
    "been", "being", "there", "here", "all", "any", "some", "more", "most",
    "out", "up", "down", "into", "over", "such", "only", "own", "same",
})
|
||||
|
||||
|
||||
def compute_vocabulary_stats(texts, top_k=20, remove_stopwords=True) -> dict:
    """Profile the vocabulary of a corpus of documents.

    Each document is lowercased and tokenised with ``\\w+``; purely numeric
    tokens are dropped, and stopwords too when ``remove_stopwords`` is true.

    Args:
        texts: Iterable of documents (list, tuple, generator, ...). Items
            that are ``None`` or not a ``str`` are skipped silently. ``None``
            or a bare ``str``/``bytes`` value is not a corpus and produces
            the empty-corpus result.
        top_k: Maximum number of most-frequent terms in ``top_terms``.
            ``None`` returns every term. Other values are coerced with
            ``int()``; values that cannot be coerced, or negative values,
            give an empty ``top_terms``. It never affects the other measures.
        remove_stopwords: When true (default), common ES+EN stopwords are
            dropped from the token stream before any counting.

    Returns:
        A dict with the keys ``n_tokens``, ``n_types``, ``ttr``
        (n_types / n_tokens, 4 dp), ``n_hapax`` (terms seen exactly once),
        ``hapax_pct`` (n_hapax / n_types * 100, 2 dp) and ``top_terms``
        (list of ``{term, count, pct}`` by descending count). With no tokens
        after filtering: n_tokens=0, n_types=0, ttr=None, n_hapax=0,
        hapax_pct=None, top_terms=[]. Never raises: any exception degrades
        to that empty-corpus result.
    """
    empty = {
        "n_tokens": 0,
        "n_types": 0,
        "ttr": None,
        "n_hapax": 0,
        "hapax_pct": None,
        "top_terms": [],
    }
    try:
        # A bare string would be walked character by character and produce
        # meaningless statistics, so it is treated as degenerate input.
        # Testing `is None` instead of truthiness keeps containers whose
        # bool() raises (ambiguous truth value) usable.
        if texts is None or isinstance(texts, (str, bytes)):
            return dict(empty)

        # Counting in place keeps first-seen order, which is the order
        # most_common() uses to break ties.
        counts = Counter()
        for doc in texts:
            if not isinstance(doc, str):
                continue
            for tok in re.findall(r"\w+", doc.lower(), re.UNICODE):
                if tok.isdigit():
                    continue
                if remove_stopwords and tok in _STOPWORDS:
                    continue
                counts[tok] += 1

        n_tokens = sum(counts.values())
        if n_tokens == 0:
            return dict(empty)

        # A malformed top_k must only affect top_terms, never the measures.
        if top_k is None:
            limit = None
        else:
            try:
                limit = max(int(top_k), 0)
            except (TypeError, ValueError, OverflowError):
                limit = 0

        n_types = len(counts)
        n_hapax = sum(1 for c in counts.values() if c == 1)

        top_terms = [
            {"term": term, "count": count, "pct": round(count / n_tokens * 100, 2)}
            for term, count in counts.most_common(limit)
        ]

        return {
            "n_tokens": n_tokens,
            "n_types": n_types,
            "ttr": round(n_types / n_tokens, 4),
            "n_hapax": n_hapax,
            "hapax_pct": round(n_hapax / n_types * 100, 2),
            "top_terms": top_terms,
        }
    except Exception:
        # Dict-no-throw contract: any failure degrades to the empty result.
        return dict(empty)
|
||||
@@ -0,0 +1,74 @@
|
||||
"""Tests para compute_vocabulary_stats."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.join(os.path.dirname(__file__), "..", "..", "functions")
|
||||
)
|
||||
|
||||
from datascience.compute_vocabulary_stats import compute_vocabulary_stats
|
||||
|
||||
|
||||
def test_basico():
    """Corpus with repeated terms and hapax; stopwords kept for exact counts."""
    corpus = ["gato gato perro", "perro perro raton", "elefante"]
    stats = compute_vocabulary_stats(corpus, top_k=10, remove_stopwords=False)
    total = stats["n_tokens"]
    distinct = stats["n_types"]

    # Repetitions make the number of types smaller than the number of tokens.
    assert distinct < total
    assert total == 7
    assert distinct == 4  # gato, perro, raton, elefante

    # The type-token ratio lies in (0, 1].
    assert 0 < stats["ttr"] <= 1
    assert stats["ttr"] == round(4 / 7, 4)

    # top_terms is ordered by descending count.
    ranking = stats["top_terms"]
    frequencies = [entry["count"] for entry in ranking]
    assert frequencies == sorted(frequencies, reverse=True)
    leader = ranking[0]
    assert leader["term"] == "perro"
    assert leader["count"] == 3

    # Hapax: raton and elefante occur exactly once.
    assert stats["n_hapax"] == 2
    assert stats["hapax_pct"] == round(2 / 4 * 100, 2)

    # pct agrees with count / n_tokens.
    assert leader["pct"] == round(3 / 7 * 100, 2)
|
||||
|
||||
|
||||
def test_vacio():
    """No valid documents -> zeros / None / []."""
    degenerate_inputs = ([], None, [None, 123, ""], ["123 456"])
    for corpus in degenerate_inputs:
        stats = compute_vocabulary_stats(corpus)
        assert stats["n_tokens"] == 0
        assert stats["n_types"] == 0
        assert stats["n_hapax"] == 0
        assert stats["ttr"] is None
        assert stats["hapax_pct"] is None
        assert stats["top_terms"] == []
|
||||
|
||||
|
||||
def test_stopwords_quitadas():
    """With filtering on, ES+EN stopwords vanish and content words remain."""
    corpus = ["the gato the perro", "de la casa azul"]
    stats = compute_vocabulary_stats(corpus, remove_stopwords=True)
    seen = set()
    for entry in stats["top_terms"]:
        seen.add(entry["term"])

    # Stopwords from both languages must be absent.
    for stopword in ("the", "de", "la"):
        assert stopword not in seen
    # Content words must be present.
    for content_word in ("gato", "casa"):
        assert content_word in seen
|
||||
|
||||
|
||||
def test_stopwords_conservadas():
    """With filtering off, stopwords are kept in the profile."""
    corpus = ["the gato the perro", "de la casa azul"]
    stats = compute_vocabulary_stats(corpus, remove_stopwords=False)
    seen = {entry["term"] for entry in stats["top_terms"]}

    for stopword in ("the", "de", "la"):
        assert stopword in seen
|
||||
@@ -0,0 +1,80 @@
|
||||
---
|
||||
name: detect_corpus_language
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def detect_corpus_language(texts, top_k=10, sample_max=1000) -> dict"
|
||||
description: "Estima la distribucion de idiomas de un corpus de textos con la libreria langdetect (import perezoso). Funcion pura y defensiva del grupo eda: filtra documentos None/no-str/vacios, muestrea hasta sample_max docs, clasifica cada uno con detect() ignorando los que langdetect no puede resolver (LangDetectException), y devuelve la distribucion top_k por frecuencia mas el idioma dominante. Si langdetect no esta instalada o algo falla, degrada a {available: False, ...} y NUNCA lanza (dict-no-throw). Seed fija (DetectorFactory.seed=0) para deteccion determinista."
|
||||
tags: [eda, datascience, text, nlp, language-detection, langdetect, pure, python]
|
||||
params:
|
||||
- name: texts
|
||||
desc: "Lista de strings (documentos). Los elementos None, no-str o vacios tras strip se descartan antes de clasificar."
|
||||
- name: top_k
|
||||
desc: "Numero maximo de idiomas a devolver en distribution, ordenados por count descendente (desempate por codigo ISO ascendente). Default 10."
|
||||
- name: sample_max
|
||||
desc: "Numero maximo de documentos a clasificar (se toman los primeros del corpus) para acotar el coste. Default 1000."
|
||||
output: >
|
||||
Dict con forma fija (dict-no-throw, nunca lanza):
|
||||
{"available": bool, "n_detected": int,
|
||||
"distribution": [{"lang": str, "count": int, "pct": float}, ...],
|
||||
"dominant": str|None}.
|
||||
available=True si langdetect es importable; lang son codigos ISO 639-1 ("es","en","fr",...);
|
||||
pct = count/n_detected*100 redondeado a 2 decimales; n_detected = docs clasificados con exito;
|
||||
dominant = idioma mas frecuente (None si no hubo detecciones). Corpus vacio con langdetect
|
||||
presente -> available True, n_detected 0, distribution [], dominant None. Sin langdetect (o
|
||||
fallo global) -> available False y el resto de campos a su valor vacio.
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: [langdetect]
|
||||
tested: true
|
||||
tests: ["test_mixto_es_en", "test_vacio", "test_degradacion"]
|
||||
test_file_path: "python/functions/datascience/detect_corpus_language_test.py"
|
||||
file_path: "python/functions/datascience/detect_corpus_language.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.detect_corpus_language import detect_corpus_language
|
||||
|
||||
corpus = [
|
||||
"este es un texto bastante largo en español para detectar el idioma correctamente",
|
||||
"la inteligencia artificial transforma la manera en que trabajamos cada dia",
|
||||
"this is a fairly long english text to detect the language correctly without issues",
|
||||
]
|
||||
out = detect_corpus_language(corpus)
|
||||
# {"available": True, "n_detected": 3,
|
||||
# "distribution": [{"lang": "es", "count": 2, "pct": 66.67},
|
||||
# {"lang": "en", "count": 1, "pct": 33.33}],
|
||||
# "dominant": "es"}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando perfiles una columna o corpus de texto en un EDA y necesites saber en
|
||||
que idioma(s) esta escrito antes de elegir tokenizadores, stopwords, modelos
|
||||
NLP o stemmers. Util tambien como check de calidad: detectar corpus mezclados
|
||||
o un idioma inesperado. Llamala con la lista de textos crudos; la funcion
|
||||
limpia, muestrea y resume sola.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- `langdetect` es **opcional**: si no esta instalada, la funcion no lanza —
|
||||
devuelve `{"available": False, "n_detected": 0, "distribution": [], "dominant": None}`.
|
||||
Comprueba `out["available"]` antes de usar la distribucion.
|
||||
- **Textos cortos** (pocas palabras o sin features lingüisticas) pueden no
|
||||
detectarse: langdetect lanza `LangDetectException`, que se ignora y el doc no
|
||||
cuenta en `n_detected`. Pasa frases razonablemente largas para resultados fiables.
|
||||
- **Determinismo**: se fija `DetectorFactory.seed = 0` en cada llamada para que la
|
||||
deteccion sea reproducible; sin esa semilla langdetect puede dar resultados
|
||||
ligeramente distintos entre ejecuciones.
|
||||
- `distribution` esta truncada a `top_k`; si el corpus tiene mas idiomas que
|
||||
`top_k`, la suma de los `count` mostrados puede ser menor que `n_detected`
|
||||
(pero `dominant` siempre refleja el idioma mas frecuente del corpus completo).
|
||||
@@ -0,0 +1,91 @@
|
||||
"""Detecta la distribucion de idiomas de un corpus de textos.
|
||||
|
||||
Funcion pura y defensiva: el computo es determinista y local (sin I/O de red).
|
||||
La libreria opcional `langdetect` se importa de forma perezosa dentro de la
|
||||
funcion; si no esta instalada (o cualquier paso falla), la funcion degrada
|
||||
limpiamente a `available=False` y NUNCA lanza excepciones.
|
||||
"""
|
||||
|
||||
|
||||
def detect_corpus_language(texts, top_k=10, sample_max=1000) -> dict:
    """Estimate the language distribution of a corpus with ``langdetect``.

    ``langdetect`` is imported lazily inside the function. On every call the
    function assigns ``DetectorFactory.seed = 0`` so detection is repeatable;
    the assignment is made on the imported class, so it is visible outside
    this function.

    Args:
        texts: Iterable of documents. Items that are ``None``, not a ``str``
            or empty after ``strip()`` are discarded. ``None`` or a bare
            ``str``/``bytes`` value is treated as an empty corpus.
        top_k: Maximum number of languages in ``distribution``. ``None`` or
            a negative value returns every detected language.
        sample_max: Maximum number of valid documents to classify (the first
            ones are taken). ``None`` or a negative value disables the cap.

    Returns:
        A dict with a fixed shape (dict-no-throw)::

            {
                "available": bool,     # True when langdetect was importable
                "n_detected": int,     # documents classified successfully
                "distribution": [{"lang": str, "count": int, "pct": float}, ...],
                "dominant": str | None,
            }

        ``distribution`` is sorted by descending count, ties broken by
        language code. ``dominant`` is taken before the ``top_k`` cut. If
        ``langdetect`` cannot be imported, or any other step fails, every
        field takes its empty value with ``available`` False.
    """
    degraded = {
        "available": False,
        "n_detected": 0,
        "distribution": [],
        "dominant": None,
    }
    try:
        # Lazy import with graceful degradation when the package is missing.
        try:
            from langdetect import detect, DetectorFactory

            # Fixed seed -> deterministic detection across runs.
            DetectorFactory.seed = 0
        except Exception:
            return dict(degraded)

        # A bare string would be walked character by character, so it is
        # treated as an empty corpus. Testing `is None` instead of truthiness
        # avoids raising on containers whose bool() is ambiguous, which would
        # otherwise be misreported as "langdetect unavailable".
        if texts is None or isinstance(texts, (str, bytes)):
            corpus = ()
        else:
            corpus = texts

        # Stop collecting once the cap is reached instead of cleaning the
        # whole corpus and slicing afterwards; the kept documents are the same.
        capped = sample_max is not None and sample_max >= 0
        docs = []
        for item in corpus:
            if capped and len(docs) >= sample_max:
                break
            if isinstance(item, str):
                cleaned = item.strip()
                if cleaned:
                    docs.append(cleaned)

        # Per-language tally. detect() raises on texts it cannot classify;
        # those documents are skipped and do not count.
        counts: dict = {}
        for doc in docs:
            try:
                lang = detect(doc)
            except Exception:
                continue
            counts[lang] = counts.get(lang, 0) + 1

        n_detected = sum(counts.values())

        # Stable order: descending count, then language code.
        ordered = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))

        k = top_k if (top_k is not None and top_k >= 0) else len(ordered)
        distribution = [
            {
                "lang": lang,
                "count": count,
                "pct": round(count / n_detected * 100, 2) if n_detected else 0.0,
            }
            for lang, count in ordered[:k]
        ]

        return {
            "available": True,
            "n_detected": n_detected,
            "distribution": distribution,
            "dominant": ordered[0][0] if ordered else None,
        }
    except Exception:
        # Any global failure degrades to available=False without raising.
        return dict(degraded)
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Tests para detect_corpus_language."""
|
||||
|
||||
import builtins
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Anade python/functions a sys.path para importar el paquete `datascience`.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from datascience.detect_corpus_language import detect_corpus_language
|
||||
|
||||
# Fixture sentences, long enough for language detection to be reliable.
_ES = [
    "este es un texto bastante largo en español para detectar el idioma correctamente sin problemas",
    "la inteligencia artificial transforma la manera en que trabajamos cada dia en muchos sectores",
]
_EN = [
    "this is a fairly long english text to detect the language correctly without any length issues",
    "machine learning models can classify documents into many different categories quite reliably",
]
|
||||
|
||||
|
||||
def test_mixto_es_en():
    """Golden: clear ES+EN mix -> available, at least 2 languages, coherent counts."""
    report = detect_corpus_language(_ES + _EN)
    breakdown = report["distribution"]

    assert report["available"] is True
    assert report["dominant"] in {"es", "en"}
    assert len(breakdown) >= 2
    # The per-language counts must add up to the number of classified docs.
    assert sum(row["count"] for row in breakdown) == report["n_detected"]
    assert report["n_detected"] == 4
|
||||
|
||||
|
||||
def test_vacio():
    """Edge: empty list with langdetect present -> available, no detections."""
    report = detect_corpus_language([])

    assert report["available"] is True
    assert report["n_detected"] == 0
    assert report["distribution"] == []
    assert report["dominant"] is None
|
||||
|
||||
|
||||
def test_degradacion(monkeypatch):
    """Error path: langdetect not importable -> available False, no raise."""
    import datascience.detect_corpus_language as module_under_test

    original_import = builtins.__import__

    def blocking_import(name, *args, **kwargs):
        # Only the langdetect package (and its submodules) is made to fail.
        is_langdetect = name == "langdetect" or name.startswith("langdetect.")
        if is_langdetect:
            raise ImportError("simulado")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", blocking_import)

    report = module_under_test.detect_corpus_language(["hola mundo", "hello world"])

    assert report["available"] is False
    assert report["n_detected"] == 0
    assert report["distribution"] == []
    assert report["dominant"] is None
|
||||
@@ -0,0 +1,107 @@
|
||||
---
|
||||
name: detect_declared_keys_duckdb
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def detect_declared_keys_duckdb(db_path: str, table: str = None) -> dict"
|
||||
description: "Detecta las claves DECLARADAS (constraints reales) de un schema DuckDB leyendo la table function duckdb_constraints(): extrae PRIMARY KEY, FOREIGN KEY y UNIQUE (ignora NOT NULL y CHECK) y las devuelve normalizadas con sus columnas, y para las FK con su tabla y columnas referenciadas. Con table=None procesa todas las tablas; con table='X' filtra a PK/UNIQUE de X y a FK cuyo origen es X (case-sensitive). A diferencia de infer_fk_containment_duckdb (que INFIERE FKs candidatas por containment de valores cuando el schema no las declara), esta funcion devuelve las relaciones de clave REALES del schema. Estilo dict-no-throw: nunca lanza. Parte del grupo eda (relaciones de clave)."
|
||||
tags: [eda, duckdb, datascience, relations, primary-key, foreign-key, schema, exploratory-data-analysis]
|
||||
params:
|
||||
- name: db_path
|
||||
desc: "Ruta al archivo DuckDB. Debe existir (lectura read-only via duckdb_query_readonly; no se crea). Un path inexistente devuelve {status:'error', ...}."
|
||||
- name: table
|
||||
desc: "Si se pasa, filtra los resultados a esa tabla: incluye PRIMARY KEY y UNIQUE cuya tabla sea `table`, y FOREIGN KEY cuya tabla ORIGEN sea `table` (no la referenciada). None (default) devuelve los constraints de todas las tablas. La comparacion es case-sensitive (nombres tal cual los devuelve DuckDB)."
|
||||
output: "dict dict-no-throw. En exito {status:'ok', primary_keys:[{table:str, columns:[str,...]}, ...], foreign_keys:[{table:str, columns:[str,...], referenced_table:str, referenced_columns:[str,...]}, ...], unique:[{table:str, columns:[str,...]}, ...], tables:[str,...]} donde tables es la lista ordenada de tablas (origen) que poseen al menos un constraint PK/FK/UNIQUE emitido. Solo se emiten constraints de clave: NOT NULL y CHECK se ignoran. En error {status:'error', error:str}."
|
||||
uses_functions: [duckdb_query_readonly_py_infra]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["test_golden_detecta_pks_y_fk", "test_golden_ignora_not_null_y_check", "test_edge_filtra_por_tabla_orders", "test_edge_filtra_por_tabla_customers", "test_edge_unique_declarado", "test_edge_sin_constraints_listas_vacias", "test_error_db_inexistente_no_lanza", "test_shape_resultado"]
|
||||
test_file_path: "python/functions/datascience/detect_declared_keys_duckdb_test.py"
|
||||
file_path: "python/functions/datascience/detect_declared_keys_duckdb.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os, duckdb
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience import detect_declared_keys_duckdb
|
||||
|
||||
# Base de ejemplo en /tmp: orders.customer_id -> customers.id (FK declarada)
|
||||
path = "/tmp/declared_keys_demo.duckdb"
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
con = duckdb.connect(path)
|
||||
con.execute("CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)")
|
||||
con.execute(
|
||||
"CREATE TABLE orders("
|
||||
" id INTEGER PRIMARY KEY,"
|
||||
" customer_id INTEGER REFERENCES customers(id),"
|
||||
" amt DOUBLE)"
|
||||
)
|
||||
con.close()
|
||||
|
||||
res = detect_declared_keys_duckdb(path)
|
||||
if res["status"] == "ok":
|
||||
for pk in res["primary_keys"]:
|
||||
print(f"PK {pk['table']}({', '.join(pk['columns'])})")
|
||||
for fk in res["foreign_keys"]:
|
||||
print(f"FK {fk['table']}({', '.join(fk['columns'])}) -> "
|
||||
f"{fk['referenced_table']}({', '.join(fk['referenced_columns'])})")
|
||||
# PK customers(id)
|
||||
# PK orders(id)
|
||||
# FK orders(customer_id) -> customers(id)
|
||||
else:
|
||||
print("error:", res["error"])
|
||||
|
||||
# Filtrar a una tabla concreta (PK/UNIQUE de orders + FK con origen orders):
|
||||
solo_orders = detect_declared_keys_duckdb(path, table="orders")
|
||||
print(solo_orders["tables"]) # ['orders']
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
- Cuando exploras un esquema DuckDB y quieres mostrar las relaciones de clave REALES (PK/FK/UNIQUE) que el schema ha declarado, sin inferir nada.
|
||||
- Como paso del capitulo RELACIONES del grupo `eda`: primero mira las claves declaradas con esta funcion; si el schema no declara FKs, complementa con `infer_fk_containment_duckdb` (inferencia por containment).
|
||||
- Antes de documentar o migrar un esquema, para listar el contrato de integridad referencial que el motor ya conoce.
|
||||
- Para validar que las constraints que esperas (esa FK que creaste con `REFERENCES`) realmente estan declaradas en la base materializada.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura**: lee de disco via la primitiva read-only `duckdb_query_readonly` (no crea ni modifica la base). El `db_path` debe existir; un path inexistente devuelve `{status:'error'}` (read_only NO crea la base).
|
||||
- **Requiere `duckdb_constraints()`**: usa la table function `duckdb_constraints()`, disponible en DuckDB modernos (verificado en 1.5.2). En versiones antiguas sin esa funcion, la query falla y se devuelve `{status:'error'}`.
|
||||
- **Solo claves DECLARADAS**: devuelve lo que el schema declaro con `PRIMARY KEY` / `FOREIGN KEY (... REFERENCES ...)` / `UNIQUE`. Una tabla materializada con `CREATE TABLE AS SELECT` NO lleva constraints — para esos casos no habra claves que mostrar y hay que INFERIRLAS (`infer_fk_containment_duckdb`).
|
||||
- **NOT NULL y CHECK se ignoran**: `duckdb_constraints()` tambien emite filas `NOT NULL` (DuckDB genera una por cada columna PK) y `CHECK`; esta funcion las descarta y solo conserva PK/FK/UNIQUE.
|
||||
- **Nombres case-sensitive**: el filtro `table='Orders'` no casa con una tabla `orders`. Se comparan los nombres tal cual los devuelve DuckDB.
|
||||
- **FK atribuida al origen**: una FOREIGN KEY se atribuye a su tabla ORIGEN (el `table` de la entrada), no a la referenciada. El filtro `table='X'` trae las FK cuyo origen es X, no las que apuntan a X.
|
||||
- **`tables` = tablas dueñas de constraints emitidos**: la lista `tables` contiene solo las tablas que poseen al menos un PK/FK/UNIQUE en el resultado (su campo `table`), ordenadas. No incluye tablas referenciadas que no tengan constraint propio en la salida.
|
||||
- **Columnas como listas**: `constraint_column_names` y `referenced_column_names` son columnas LIST de DuckDB; en 1.5.2 llegan como listas Python. La funcion las normaliza a listas de strings con una red de seguridad por si llegaran como string.
|
||||
|
||||
## Notas
|
||||
|
||||
`duckdb_constraints()` devuelve una fila por constraint con los campos
|
||||
`table_name`, `constraint_type`, `constraint_column_names`, `referenced_table`,
|
||||
`referenced_column_names`. Mapeo a la salida:
|
||||
|
||||
```text
|
||||
PRIMARY KEY -> primary_keys[]: {table, columns}
|
||||
UNIQUE -> unique[]: {table, columns}
|
||||
FOREIGN KEY -> foreign_keys[]: {table, columns, referenced_table, referenced_columns}
|
||||
NOT NULL -> ignorado
|
||||
CHECK -> ignorado
|
||||
```
|
||||
|
||||
Para una FK, `referenced_table` y `referenced_column_names` vienen poblados; para
|
||||
PK/UNIQUE, `referenced_table` es NULL y `referenced_column_names` una lista vacia.
|
||||
|
||||
Complementa a `infer_fk_containment_duckdb`: esta funcion devuelve las relaciones
|
||||
de clave REALES del schema (declaradas); la otra INFIERE FKs candidatas por
|
||||
containment de valores cuando el schema no las declaro. En el capitulo RELACIONES
|
||||
de AutomaticEDA se usan en orden: primero las declaradas, luego la inferencia como
|
||||
respaldo.
|
||||
@@ -0,0 +1,127 @@
|
||||
"""detect_declared_keys_duckdb — lee las claves DECLARADAS de un schema DuckDB.
|
||||
|
||||
Funcion impura: lee de disco a traves de la primitiva read-only del grupo
|
||||
`duckdb` (duckdb_query_readonly). Pertenece al grupo de capacidad `eda`
|
||||
(relaciones de clave): a diferencia de infer_fk_containment_duckdb, que INFIERE
|
||||
FOREIGN KEYs candidatas por containment de valores, esta funcion devuelve las
|
||||
constraints REALES que el schema ha declarado (PRIMARY KEY / FOREIGN KEY /
|
||||
UNIQUE) leyendo la table function `duckdb_constraints()`.
|
||||
|
||||
Es la pieza del capitulo RELACIONES de AutomaticEDA que muestra las relaciones de
|
||||
clave reales cuando existen — frente a la inferencia, que se usa cuando el schema
|
||||
no las declaro.
|
||||
|
||||
Estilo dict-no-throw del grupo duckdb: nunca lanza; captura cualquier error y
|
||||
devuelve {status:'error', error:str}.
|
||||
"""
|
||||
|
||||
from infra import duckdb_query_readonly
|
||||
|
||||
|
||||
def _as_list(value) -> list:
    """Normalize a DuckDB LIST column value into a list of strings.

    The column-name fields are expected to arrive as Python lists already; the
    string branch is only a safety net for a textual rendering such as
    ``[id, customer_id]``.

    Args:
        value: ``None``, a list/tuple, a string, or any other scalar.

    Returns:
        ``[]`` for ``None``; ``str()`` of each item for a list/tuple; the
        comma-separated, unquoted, non-empty parts for a string (one optional
        surrounding ``[...]`` pair is removed first); ``[str(value)]``
        otherwise.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    if isinstance(value, str):
        text = value.strip()
        if text.startswith("[") and text.endswith("]"):
            text = text[1:-1]
        # Splitting an empty/blank string yields only blank parts, which the
        # filter below drops, so no separate "empty" check is needed.
        names = (part.strip().strip("'\"") for part in text.split(","))
        return [name for name in names if name]
    return [str(value)]


def detect_declared_keys_duckdb(db_path: str, table: str = None) -> dict:
    """Report the PRIMARY KEY / FOREIGN KEY / UNIQUE constraints declared in a DuckDB file.

    Runs a single query against the ``duckdb_constraints()`` table function
    through ``duckdb_query_readonly`` and keeps only key constraints; every
    other constraint type (e.g. NOT NULL, CHECK) is skipped.

    Args:
        db_path: path handed to ``duckdb_query_readonly``.
        table: when given, only rows whose ``table_name`` equals it are kept
            (exact, case-sensitive ``!=`` comparison). A FOREIGN KEY therefore
            belongs to its source table, not to the referenced one. ``None``
            keeps every table.

    Returns:
        Never raises. On success::

            {"status": "ok",
             "primary_keys": [{"table": str, "columns": [str, ...]}, ...],
             "foreign_keys": [{"table": str, "columns": [str, ...],
                               "referenced_table": ...,
                               "referenced_columns": [str, ...]}, ...],
             "unique": [{"table": str, "columns": [str, ...]}, ...],
             "tables": [str, ...]}   # sorted owners of the emitted constraints

        On any failure: ``{"status": "error", "error": str}``.
    """
    try:
        sql = (
            "SELECT table_name, constraint_type, constraint_column_names, "
            "referenced_table, referenced_column_names FROM duckdb_constraints()"
        )
        res = duckdb_query_readonly(db_path, sql)

        # Tolerate malformed backend replies the same way extract_null_mask
        # does, so the caller gets a readable message instead of the repr of a
        # KeyError/TypeError raised while reading the reply.
        if not isinstance(res, dict):
            return {
                "status": "error",
                "error": "duckdb_query_readonly no devolvio un dict",
            }
        if res.get("status") != "ok":
            return {
                "status": "error",
                "error": str(res.get("error") or "duckdb_query_readonly fallo"),
            }

        primary_keys = []
        foreign_keys = []
        unique = []
        # PRIMARY KEY and UNIQUE share the same output shape.
        simple_buckets = {"PRIMARY KEY": primary_keys, "UNIQUE": unique}
        tables = set()

        for row in res.get("rows") or []:
            ctype = row["constraint_type"]
            owner = row["table_name"]

            # The constraint owner is `table_name`, so this filter attributes a
            # FOREIGN KEY to its source table.
            if table is not None and owner != table:
                continue

            cols = _as_list(row["constraint_column_names"])

            if ctype in simple_buckets:
                simple_buckets[ctype].append({"table": owner, "columns": cols})
            elif ctype == "FOREIGN KEY":
                foreign_keys.append(
                    {
                        "table": owner,
                        "columns": cols,
                        "referenced_table": row["referenced_table"],
                        "referenced_columns": _as_list(
                            row["referenced_column_names"]
                        ),
                    }
                )
            else:
                # Anything else (NOT NULL, CHECK, ...) is not a key relation.
                continue
            tables.add(owner)

        return {
            "status": "ok",
            "primary_keys": primary_keys,
            "foreign_keys": foreign_keys,
            "unique": unique,
            "tables": sorted(tables),
        }
    except Exception as e:  # noqa: BLE001 - dict-no-throw: degrade, never raise
        return {"status": "error", "error": str(e)}
|
||||
@@ -0,0 +1,167 @@
|
||||
"""Tests para detect_declared_keys_duckdb."""
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
from .detect_declared_keys_duckdb import detect_declared_keys_duckdb
|
||||
|
||||
|
||||
@pytest.fixture
def db(tmp_path):
    """Create a temporary DuckDB file with declared keys and return its path.

    Schema:
      - customers(id PRIMARY KEY, name)
      - orders(id PRIMARY KEY, customer_id REFERENCES customers(id), amt)

    That declares two PRIMARY KEYs (customers.id, orders.id) and one FOREIGN
    KEY (orders.customer_id -> customers.id). Per the original author's note,
    DuckDB also emits NOT NULL constraints for the PK columns, which the
    function under test is expected to ignore.
    """
    db_file = str(tmp_path / "keys_test.duckdb")
    ddl_statements = (
        "CREATE TABLE customers(id INTEGER PRIMARY KEY, name TEXT)",
        (
            "CREATE TABLE orders("
            " id INTEGER PRIMARY KEY,"
            " customer_id INTEGER REFERENCES customers(id),"
            " amt DOUBLE"
            ")"
        ),
    )
    conn = duckdb.connect(db_file)
    for ddl in ddl_statements:
        conn.execute(ddl)
    conn.close()
    return db_file
|
||||
|
||||
|
||||
def _pk_for(res, table):
    """Return the first `primary_keys` entry whose table is `table`, or None."""
    matches = (entry for entry in res["primary_keys"] if entry["table"] == table)
    return next(matches, None)
|
||||
|
||||
|
||||
def test_golden_detecta_pks_y_fk(db):
    """Golden path: both PKs and the single FK are reported with exact values."""
    result = detect_declared_keys_duckdb(db)
    assert result["status"] == "ok"

    # One PRIMARY KEY per table, each on its `id` column.
    customers_pk = _pk_for(result, "customers")
    orders_pk = _pk_for(result, "orders")
    assert customers_pk is not None
    assert customers_pk["columns"] == ["id"]
    assert orders_pk is not None
    assert orders_pk["columns"] == ["id"]

    # Exactly one FOREIGN KEY: orders.customer_id -> customers.id.
    foreign_keys = result["foreign_keys"]
    assert len(foreign_keys) == 1
    foreign_key = foreign_keys[0]
    assert foreign_key["table"] == "orders"
    assert foreign_key["columns"] == ["customer_id"]
    assert foreign_key["referenced_table"] == "customers"
    assert foreign_key["referenced_columns"] == ["id"]

    # Both tables own at least one emitted constraint.
    assert result["tables"] == ["customers", "orders"]
|
||||
|
||||
|
||||
def test_golden_ignora_not_null_y_check(db):
    """NOT NULL rows (auto-generated for PK columns) are not reported as keys."""
    result = detect_declared_keys_duckdb(db)
    assert result["status"] == "ok"

    # Only the two real PKs, not the NOT NULL rows generated per PK column.
    pk_count = len(result["primary_keys"])
    assert pk_count == 2

    # The fixture schema declares no UNIQUE constraint.
    assert result["unique"] == []
|
||||
|
||||
|
||||
def test_edge_filtra_por_tabla_orders(db):
    """table='orders': the orders PK plus its FK; the customers PK is excluded."""
    result = detect_declared_keys_duckdb(db, table="orders")
    assert result["status"] == "ok"

    # Only the orders PRIMARY KEY survives the filter.
    pks = result["primary_keys"]
    assert len(pks) == 1
    assert pks[0]["table"] == "orders"
    assert pks[0]["columns"] == ["id"]
    assert _pk_for(result, "customers") is None

    # The FK is kept because its source table is orders.
    fks = result["foreign_keys"]
    assert len(fks) == 1
    assert fks[0]["table"] == "orders"
    assert fks[0]["referenced_table"] == "customers"

    # `tables` lists only the owner of the emitted constraints.
    assert result["tables"] == ["orders"]
|
||||
|
||||
|
||||
def test_edge_filtra_por_tabla_customers(db):
    """table='customers': only its PK; no FK, since the FK's source is orders."""
    result = detect_declared_keys_duckdb(db, table="customers")
    assert result["status"] == "ok"

    pks = result["primary_keys"]
    assert len(pks) == 1
    assert pks[0]["table"] == "customers"

    assert result["foreign_keys"] == []
    assert result["tables"] == ["customers"]
|
||||
|
||||
|
||||
def test_edge_unique_declarado(tmp_path):
    """A declared UNIQUE constraint shows up under `unique`."""
    db_file = str(tmp_path / "unique_test.duckdb")
    conn = duckdb.connect(db_file)
    conn.execute("CREATE TABLE products(sku INTEGER UNIQUE, name TEXT)")
    conn.close()

    result = detect_declared_keys_duckdb(db_file)
    assert result["status"] == "ok"

    unique_entries = result["unique"]
    assert len(unique_entries) == 1
    assert unique_entries[0]["table"] == "products"
    assert unique_entries[0]["columns"] == ["sku"]

    assert result["primary_keys"] == []
    assert result["foreign_keys"] == []
    assert result["tables"] == ["products"]
|
||||
|
||||
|
||||
def test_edge_sin_constraints_listas_vacias(tmp_path):
    """A table without PK/FK/UNIQUE yields status ok and only empty lists."""
    db_file = str(tmp_path / "no_keys.duckdb")
    conn = duckdb.connect(db_file)
    conn.execute("CREATE TABLE log(a INTEGER, b INTEGER)")
    conn.close()

    result = detect_declared_keys_duckdb(db_file)
    assert result["status"] == "ok"
    for list_key in ("primary_keys", "foreign_keys", "unique", "tables"):
        assert result[list_key] == []
|
||||
|
||||
|
||||
def test_error_db_inexistente_no_lanza(tmp_path):
    """A missing db_path gives status error with a non-empty message, no raise."""
    missing_file = str(tmp_path / "does_not_exist.duckdb")
    result = detect_declared_keys_duckdb(missing_file)

    assert result["status"] == "error"
    message = result["error"]
    assert isinstance(message, str)
    assert message != ""
|
||||
|
||||
|
||||
def test_shape_resultado(db):
    """The result exposes exactly the expected keys at every checked level."""
    top_level_keys = {"status", "primary_keys", "foreign_keys", "unique", "tables"}
    pk_entry_keys = {"table", "columns"}
    fk_entry_keys = {"table", "columns", "referenced_table", "referenced_columns"}

    result = detect_declared_keys_duckdb(db)
    assert set(result.keys()) == top_level_keys
    for pk_entry in result["primary_keys"]:
        assert set(pk_entry.keys()) == pk_entry_keys
    for fk_entry in result["foreign_keys"]:
        assert set(fk_entry.keys()) == fk_entry_keys
|
||||
@@ -0,0 +1,97 @@
|
||||
---
|
||||
name: extract_null_mask
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_null_mask(query_fn, table: str, columns: list, max_rows: int = 5000) -> dict"
|
||||
description: "Extrae la mascara de nulos (1=falta / 0=presente) de una muestra de filas de una tabla, una lista 0/1 por columna alineada por fila, para alimentar el capitulo de calidad / patron de nulos de AutomaticEDA sin que el capitulo toque la base de datos. Recibe un lector read-only inyectado `query_fn(sql) -> dict` (mismo contrato que duckdb_query_readonly / pg_query / el `_q` de profile_table) y NO abre ninguna conexion por su cuenta. Construye UNA sola query que proyecta por cada columna `CASE WHEN \"col\" IS NULL THEN 1 ELSE 0 END` con identificadores escapados y LIMIT. Devuelve dict dict-no-throw: columns (efectivamente leidas, en orden), mask (lista int 0/1 por columna, misma longitud todas) y n. Una celda None se cuenta defensivamente como 1 (falta)."
|
||||
tags: [eda, nulls, missing, datascience, automatic-eda, extraction, read-only, duckdb, postgres, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
params:
|
||||
- name: query_fn
|
||||
desc: "callable lector read-only del backend activo. Recibe un string SQL y devuelve un dict {'status':'ok','rows':[{col:val,...},...]} (mismo contrato que duckdb_query_readonly o el `_q` de profile_table). NO se abre ninguna conexion dentro de la funcion: toda la lectura pasa por query_fn. Si es None -> error."
|
||||
- name: table
|
||||
desc: "nombre de la tabla de la que muestrear la mascara de nulos. Se escapa con comillas dobles en la query. Vacio o None -> status error."
|
||||
- name: columns
|
||||
desc: "lista de nombres de columna a evaluar. Cada una produce una entrada en `mask` con una lista 0/1 paralela por fila (1=IS NULL, 0=presente). Cada nombre se escapa con comillas dobles. Vacia o None -> status error."
|
||||
- name: max_rows
|
||||
desc: "limite de filas a muestrear (clausula LIMIT). Default 5000. Protege frente a tablas enormes; con LIMIT obtienes el primer tramo, no un muestreo uniforme."
|
||||
output: "dict (nunca lanza). En exito: {'status':'ok','table':str,'columns':[str,...] (en orden),'mask':{col:[int 0/1,...],...} (1=falta/IS NULL, 0=presente; todas las listas con misma longitud = n),'n':int}. En error (sin lanzar): {'status':'error','error':str,'table':str,'columns':[],'mask':{},'n':0}. Errores: query_fn None, table vacia, columns vacia, o query_fn devuelve status!='ok' (se propaga su error)."
|
||||
tested: true
|
||||
tests: ["test_golden_mask_alineada", "test_celda_none_cuenta_como_falta", "test_columns_vacia_status_error", "test_query_fn_status_error_propaga", "test_query_fn_none_da_error_sin_reventar", "test_sql_contiene_case_y_limit"]
|
||||
test_file_path: "python/functions/datascience/extract_null_mask_test.py"
|
||||
file_path: "python/functions/datascience/extract_null_mask.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.extract_null_mask import extract_null_mask
|
||||
from infra import duckdb_query_readonly
|
||||
|
||||
# El lector read-only se inyecta como closure (igual que el `_q` de profile_table).
|
||||
db = "data/clientes.duckdb"
|
||||
def _q(sql):
|
||||
return duckdb_query_readonly(db, sql)
|
||||
|
||||
res = extract_null_mask(_q, "clientes", ["email", "telefono", "edad"])
|
||||
# res == {
|
||||
# "status": "ok",
|
||||
# "table": "clientes",
|
||||
# "columns": ["email", "telefono", "edad"],
|
||||
# "mask": {
|
||||
# "email": [0, 0, 1, 0, ...], # fila 2 sin email
|
||||
# "telefono": [1, 0, 1, 0, ...],
|
||||
# "edad": [0, 0, 0, 1, ...],
|
||||
# },
|
||||
# "n": 5000,
|
||||
# }
|
||||
|
||||
# % de nulos por columna a partir de la muestra:
|
||||
pct = {c: 100 * sum(bits) / max(res["n"], 1) for c, bits in res["mask"].items()}
|
||||
|
||||
# Se entrega al capitulo de calidad sin que este toque la BD:
|
||||
ctx = {"null_mask": res}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando el capitulo de calidad / patron de nulos de AutomaticEDA necesita saber
|
||||
DONDE faltan los valores (no solo cuantos) y NO debe abrir la base de datos por
|
||||
su cuenta: extraes aqui la mascara 0/1 por columna alineada por fila y se la pasas
|
||||
en `ctx['null_mask']`. Usala siempre que quieras detectar co-ocurrencia de nulos
|
||||
(filas que fallan en varias columnas a la vez), calcular el % de nulos sobre una
|
||||
muestra, o pintar un heatmap de missingness reutilizando un unico lector read-only
|
||||
inyectado, en vez de hacer N `COUNT(*) WHERE col IS NULL` por separado.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura**: lee de la base de datos a traves de `query_fn`. No abre conexiones
|
||||
por su cuenta — depende por completo del lector inyectado. Sigue el estilo
|
||||
dict-no-throw del grupo `eda`: nunca lanza; ante cualquier fallo devuelve
|
||||
`{"status":"error","error":...}` con `columns=[]`, `mask={}`, `n=0`.
|
||||
- **`error_type` en el frontmatter es `error_go_core` por convencion del registry**
|
||||
(toda funcion impura debe declararlo y el indexer lo exige), pero el codigo
|
||||
NO lanza esa excepcion: degrada al dict de error. Es metadata, no comportamiento.
|
||||
- **Muestra, no censo**: con `LIMIT max_rows` obtienes el primer tramo de filas que
|
||||
devuelva el backend, no un muestreo uniforme ni la tabla entera. El % de nulos
|
||||
derivado es una estimacion sobre esa muestra; para el conteo exacto usa un
|
||||
agregado `COUNT(*)`/`COUNT(col)` aparte.
|
||||
- **Alineacion por fila**: `mask[col][i]` corresponde a la misma fila `i` que
|
||||
`mask[otra_col][i]`. Todas las listas tienen longitud `n`, asi que puedes cruzar
|
||||
columnas por indice (co-ocurrencia de nulos) sin re-alinear.
|
||||
- **Defensa None -> 1**: el SQL ya devuelve 0/1, pero si una celda llega como `None`
|
||||
(CASE no aplicado, columna ausente en la fila, backend que nulifica) se cuenta
|
||||
como 1 (falta). Un valor inesperado no convertible a int se trata como presente (0).
|
||||
- **No loguear los datos crudos**: aunque `mask` es solo 0/1, los nombres de columna
|
||||
pueden revelar el esquema. En trazas usa `n` y el numero de columnas, no el dict
|
||||
completo.
|
||||
@@ -0,0 +1,101 @@
|
||||
"""extract_null_mask — extrae la mascara de nulos (1=falta / 0=presente) de una tabla.
|
||||
|
||||
Lector read-only inyectado: recibe `query_fn(sql) -> dict` con el mismo contrato
|
||||
que duckdb_query_readonly / pg_query (y que el `_q` de profile_table):
|
||||
`{"status": "ok", "rows": [{col: val, ...}, ...]}`. Esta funcion NO abre ninguna
|
||||
conexion por su cuenta — solo usa `query_fn`. Construye UNA sola query que, por
|
||||
cada columna pedida, evalua `CASE WHEN "col" IS NULL THEN 1 ELSE 0 END` y devuelve
|
||||
una muestra de filas con esos bits. El resultado es un dict `mask` con una lista
|
||||
0/1 por columna, alineada por fila (1 = el valor falta / IS NULL, 0 = presente),
|
||||
listo para alimentar el capitulo de calidad / patron de nulos de AutomaticEDA sin
|
||||
que el capitulo toque la base de datos.
|
||||
|
||||
Estilo dict-no-throw del grupo `eda`: nunca lanza; captura cualquier excepcion y
|
||||
degrada a `{"status": "error", "error": str, ...}`.
|
||||
"""
|
||||
|
||||
|
||||
def _to_bit(value):
    """Coerce one cell of the CASE projection to a 0/1 int without raising.

    Args:
        value: the cell as returned by the backend (normally 0 or 1).

    Returns:
        1 when ``value`` is ``None`` (counted as missing) or converts to a
        non-zero int; 0 when it converts to zero or cannot be converted at all.
    """
    if value is None:
        return 1
    try:
        return 1 if int(value) != 0 else 0
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); like any other
        # non-convertible value it is treated as "present".
        return 0


def _quote_ident(name):
    """Render `name` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def extract_null_mask(query_fn, table, columns, max_rows=5000):
    """Extract a per-column null mask (1 = missing, 0 = present) from a row sample.

    Builds one query that projects, for every requested column,
    ``CASE WHEN <col> IS NULL THEN 1 ELSE 0 END`` aliased to the column name,
    and runs it through the injected reader. This function opens no connection
    of its own.

    Args:
        query_fn: callable taking an SQL string and returning a dict shaped
            like ``{"status": "ok", "rows": [{col: val, ...}, ...]}``.
            ``None`` yields an error result.
        table: table name; quoted as an identifier. Empty/None yields an error.
        columns: column names to evaluate; each is quoted as an identifier.
            Repeated names are evaluated once (first occurrence wins) so every
            mask list keeps length ``n``. Empty/None yields an error.
        max_rows: value for the ``LIMIT`` clause (passed through ``int``).

    Returns:
        Never raises. On success ``{"status": "ok", "table": table,
        "columns": [...], "mask": {col: [0/1, ...]}, "n": row_count}`` where
        all mask lists are row-aligned and have length ``n``. On failure the
        same keys with ``status="error"``, an ``error`` message, empty
        ``columns``/``mask`` and ``n=0``.
    """
    base = {"status": "ok", "table": table, "columns": [], "mask": {}, "n": 0}
    try:
        if query_fn is None:
            return {**base, "status": "error", "error": "query_fn es None"}
        if not table:
            return {**base, "status": "error", "error": "table es obligatorio"}
        if not columns:
            return {**base, "status": "error", "error": "columns vacío"}

        # De-duplicate while preserving order: a repeated column would get two
        # appends per row and a duplicated alias in the SELECT list.
        cols = list(dict.fromkeys(columns))

        # Quote every identifier (doubling embedded quotes) so names with
        # upper case, spaces, reserved words or a literal '"' stay valid SQL.
        select_sql = ", ".join(
            f"(CASE WHEN {_quote_ident(c)} IS NULL THEN 1 ELSE 0 END) AS {_quote_ident(c)}"
            for c in cols
        )
        sql = f"SELECT {select_sql} FROM {_quote_ident(table)} LIMIT {int(max_rows)}"

        q = query_fn(sql)
        if not isinstance(q, dict):
            return {**base, "status": "error", "error": "query_fn no devolvio un dict"}
        if q.get("status") != "ok":
            return {**base, "status": "error", "error": q.get("error", "query_fn fallo")}

        rows = q.get("rows", []) or []
        mask = {c: [] for c in cols}
        for row in rows:
            # A non-dict row, or a row lacking the column, counts as missing.
            cells = row if isinstance(row, dict) else {}
            for c in cols:
                mask[c].append(_to_bit(cells.get(c)))

        return {
            "status": "ok",
            "table": table,
            "columns": cols,
            "mask": mask,
            "n": len(rows),
        }
    except Exception as e:  # noqa: BLE001 - dict-no-throw: degrade, never raise
        return {**base, "status": "error", "error": str(e)}
|
||||
@@ -0,0 +1,116 @@
|
||||
"""Tests para extract_null_mask.
|
||||
|
||||
No usa DuckDB real: inyecta un query_fn FAKE (closure) que devuelve filas
|
||||
predefinidas (simulando el SELECT de bits 0/1) y, opcionalmente, captura el SQL
|
||||
recibido para verificar la query generada (CASE WHEN ... IS NULL + LIMIT). Asi el
|
||||
test es autocontenido y no depende de ningun backend.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from extract_null_mask import extract_null_mask
|
||||
|
||||
|
||||
def _fake_query(rows, captured=None, status="ok", error=None):
    """Build a fake `query_fn` for the tests.

    Args:
        rows: rows returned under ``"rows"`` on success.
        captured: optional list; every SQL string received is appended to it.
        status: any value other than ``"ok"`` makes the fake return an error.
        error: error message for the failing case (defaults to ``"boom"``).
    """
    should_fail = status != "ok"

    def _q(sql):
        if captured is not None:
            captured.append(sql)
        if should_fail:
            return {"status": "error", "error": error or "boom"}
        return {"status": "ok", "rows": rows}

    return _q
|
||||
|
||||
|
||||
def test_golden_mask_alineada():
    """Golden path: row-aligned 0/1 mask per column, correct n, status ok."""
    column_names = ["email", "telefono", "edad"]
    # Each row mimics SELECT (CASE WHEN col IS NULL THEN 1 ELSE 0 END) AS col.
    backend_rows = [
        {"email": 0, "telefono": 1, "edad": 0},
        {"email": 0, "telefono": 0, "edad": 1},
        {"email": 1, "telefono": 1, "edad": 0},
    ]
    result = extract_null_mask(_fake_query(backend_rows), "clientes", column_names)

    assert result["status"] == "ok"
    assert result["table"] == "clientes"
    assert result["columns"] == ["email", "telefono", "edad"]
    assert result["n"] == 3

    mask = result["mask"]
    assert mask["email"] == [0, 0, 1]
    assert mask["telefono"] == [1, 0, 1]
    assert mask["edad"] == [0, 1, 0]

    # Every list has the same length as the number of sampled rows.
    assert all(len(bits) == result["n"] for bits in mask.values())
|
||||
|
||||
|
||||
def test_celda_none_cuenta_como_falta():
    """A None cell is defensively counted as 1 (missing)."""
    backend_rows = [
        {"email": 0, "telefono": None},
        {"email": None, "telefono": 1},
        {"email": 1, "telefono": 0},
    ]
    query_fn = _fake_query(backend_rows)
    result = extract_null_mask(query_fn, "clientes", ["email", "telefono"])

    assert result["status"] == "ok"
    mask = result["mask"]
    assert mask["email"] == [0, 1, 1]
    assert mask["telefono"] == [1, 1, 0]
    assert result["n"] == 3
|
||||
|
||||
|
||||
def test_columns_vacia_status_error():
    """Empty `columns` gives status error with empty columns/mask and n == 0."""
    no_columns = []
    result = extract_null_mask(_fake_query([]), "clientes", no_columns)

    assert result["status"] == "error"
    assert "columns" in result["error"]
    assert result["table"] == "clientes"
    assert result["columns"] == []
    assert result["mask"] == {}
    assert result["n"] == 0
|
||||
|
||||
|
||||
def test_query_fn_status_error_propaga():
    """A query_fn reply with status != ok is propagated as an error, mask {}."""
    failing_query = _fake_query([], status="error", error="db locked")
    result = extract_null_mask(failing_query, "clientes", ["email"])

    assert result["status"] == "error"
    assert "db locked" in result["error"]
    assert result["mask"] == {}
    assert result["n"] == 0
|
||||
|
||||
|
||||
def test_query_fn_none_da_error_sin_reventar():
    """query_fn=None degrades to an error result without raising."""
    result = extract_null_mask(None, "clientes", ["email"])

    assert result["status"] == "error"
    # The payload fields are reset to their empty values.
    assert result["columns"] == []
    assert result["mask"] == {}
    assert result["n"] == 0
|
||||
|
||||
|
||||
def test_sql_contiene_case_y_limit():
    """The query has one CASE WHEN IS NULL per quoted column plus LIMIT on the table."""
    seen_sql = []
    query_fn = _fake_query([{"email": 0}], seen_sql)
    extract_null_mask(query_fn, "clientes_tbl", ["email"], max_rows=123)

    # Exactly one query is issued.
    assert len(seen_sql) == 1
    sql = seen_sql[0]
    expected_fragments = (
        'CASE WHEN "email" IS NULL THEN 1 ELSE 0 END',
        'AS "email"',
        'FROM "clientes_tbl"',
        "LIMIT 123",
    )
    for fragment in expected_fragments:
        assert fragment in sql
|
||||
@@ -0,0 +1,102 @@
|
||||
---
|
||||
name: extract_text_sample
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def extract_text_sample(db_path: str, table: str, columns: list, backend: str = 'duckdb', sample: int = 2000) -> dict"
|
||||
description: "Muestrea columnas de texto de una tabla DuckDB/Postgres con push-down SQL (LIMIT sample), SIN traer la tabla entera a RAM. Funcion impura del grupo de capacidad `eda`: la usan los capitulos de texto/NLP del AutomaticEDA que necesitan valores crudos de texto (longitudes, tokens, ejemplos) sobre una muestra acotada. Construye el lector read-only query_fn(sql)->dict igual que build_eda_render_ctx (closure sobre duckdb_query_readonly / pg_query importados perezosamente desde infra). Escapa los identificadores con comillas dobles y lanza una sola query SELECT \"c1\", \"c2\" FROM \"table\" LIMIT n. Por columna, la lista de strings solo contiene valores NO None y NO vacios: cada celda no nula se convierte con str(...) y se descarta si queda cadena vacia. Estilo dict-no-throw del grupo eda: NUNCA lanza; ante cualquier fallo (query, conversion, backend desconocido) devuelve {status:'error', error:str, columns:{}, n:0}. La clave n reporta el numero de FILAS leidas por la query (antes de filtrar None/vacios)."
|
||||
tags: [eda, datascience, text, nlp, extraction, read-only, duckdb, postgres, python]
|
||||
uses_functions: [duckdb_query_readonly_py_infra, pg_query_py_infra]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: []
|
||||
params:
|
||||
- name: db_path
|
||||
desc: "ruta al archivo DuckDB, o DSN PostgreSQL si backend='postgres'. Se inyecta en el closure query_fn. No se valida aqui: si la base no existe o el DSN es invalido, la query devuelve status error y el resultado es {status:'error', ...} (no lanza)."
|
||||
- name: table
|
||||
desc: "nombre de la tabla. Se escapa con comillas dobles en la query (SELECT ... FROM \"table\")."
|
||||
- name: columns
|
||||
desc: "lista de nombres de columna de texto a muestrear. Se filtra a las entradas que sean str no vacio; cada nombre se escapa con comillas dobles. Si tras filtrar queda vacia -> {status:'ok', columns:{}, n:0} sin tocar la base."
|
||||
- name: backend
|
||||
desc: "'duckdb' (default) o 'postgres'. Selecciona el lector read-only del registry (duckdb_query_readonly / pg_query). Cualquier otro valor -> {status:'error', error:'backend desconocido: <valor>', columns:{}, n:0}."
|
||||
- name: sample
|
||||
desc: "maximo de filas a muestrear (clausula LIMIT). Default 2000. Acota memoria y tiempo: con tablas grandes obtienes el primer tramo por orden fisico (sin ORDER BY), no un muestreo uniforme."
|
||||
output: "dict (estilo dict-no-throw, NUNCA lanza): {status:'ok'|'error', columns:{col_name:[str,...]}, n:int, error:str}. En exito (status='ok') columns mapea cada columna pedida a la lista de sus valores de texto NO None y NO vacios (cada celda convertida con str(...)); n es el numero de FILAS leidas por la query (antes de filtrar None/vacios). columns vacio -> {status:'ok', columns:{}, n:0}. En error (backend desconocido, query con status!='ok', o cualquier excepcion) -> {status:'error', error:str, columns:{}, n:0}; la clave error solo aparece en este caso."
|
||||
tested: true
|
||||
tests: ["test_extract_basic", "test_backend_desconocido", "test_columns_vacio", "test_sample_limit"]
|
||||
test_file_path: "python/functions/datascience/extract_text_sample_test.py"
|
||||
file_path: "python/functions/datascience/extract_text_sample.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
# Import directo del submodulo (no requiere export en datascience/__init__.py).
|
||||
from datascience.extract_text_sample import extract_text_sample
|
||||
|
||||
# Muestrea hasta 2000 filas de dos columnas de texto de una tabla DuckDB.
|
||||
res = extract_text_sample(
|
||||
"data/reviews.duckdb", "reviews", ["title", "body"],
|
||||
backend="duckdb", sample=2000,
|
||||
)
|
||||
# res == {
|
||||
# "status": "ok",
|
||||
# "columns": {
|
||||
# "title": ["Gran producto", "No funciona", ...], # solo no-None, no-""
|
||||
# "body": ["Lo uso a diario...", ...],
|
||||
# },
|
||||
# "n": 2000, # filas leidas por la query (antes de filtrar None/vacios)
|
||||
# }
|
||||
|
||||
# Postgres: db_path es el DSN.
|
||||
res_pg = extract_text_sample(
|
||||
"postgresql://user:pass@localhost:5433/trends", "comentarios", ["texto"],
|
||||
backend="postgres", sample=500,
|
||||
)
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando necesites valores CRUDOS de texto de una o varias columnas para analisis
|
||||
NLP/texto (distribucion de longitudes, conteo de tokens, ejemplos representativos,
|
||||
deteccion de idioma) pero NO quieras cargar la tabla entera en memoria. Es el
|
||||
muestreador de texto del grupo `eda`: una sola llamada con push-down `LIMIT`
|
||||
devuelve listas de strings por columna, limpias de None y vacios, listas para
|
||||
alimentar un capitulo de texto del AutomaticEDA o cualquier rutina de tokenizado.
|
||||
Usala junto a `profile_table` / `build_eda_render_ctx` cuando el perfil agregado
|
||||
no basta y hace falta el texto real.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura**: lee de la base de datos a traves de `query_fn` (closure sobre
|
||||
`duckdb_query_readonly` / `pg_query`). No abre conexiones fuera de esos wrappers
|
||||
del registry. Estilo dict-no-throw del grupo `eda`: NUNCA lanza; ante cualquier
|
||||
fallo devuelve `{status:'error', error:str, columns:{}, n:0}`.
|
||||
- **`error_type` en el frontmatter es `error_go_core` por convencion del registry**
|
||||
(toda funcion impura debe declararlo y el indexer lo exige), pero el codigo NO
|
||||
lanza esa excepcion: degrada al dict de error. Es metadata, no comportamiento.
|
||||
- **Backend desconocido**: con un `backend` que no sea `duckdb` ni `postgres`
|
||||
devuelve `{status:'error', error:'backend desconocido: <valor>', columns:{},
|
||||
n:0}` sin tocar la base.
|
||||
- **Las listas NO incluyen None ni cadenas vacias**: cada celda no nula se pasa
|
||||
por `str(...)` y se descarta si queda `""`. Por eso `len(columns[col])` puede ser
|
||||
menor que `n` (que cuenta las filas leidas). Si necesitas alineacion por fila
|
||||
(una entrada por fila aunque sea None), usa `build_eda_render_ctx` (raw_numeric),
|
||||
no esta funcion.
|
||||
- **`LIMIT sample` sin `ORDER BY`**: con tablas grandes obtienes el primer tramo
|
||||
por orden fisico del backend, no un muestreo uniforme ni reproducible. Sube
|
||||
`sample` para mas cobertura, o pre-ordena/aleatoriza la tabla si necesitas
|
||||
representatividad.
|
||||
- **DuckDB en sandbox por defecto**: `duckdb_query_readonly` abre la conexion con
|
||||
`enable_external_access=False`, asi que la query solo puede leer la propia base
|
||||
(no `read_csv`/`httpfs`/`ATTACH` a paths externos). Lee tablas ya existentes en
|
||||
el archivo DuckDB sin problema.
|
||||
- **No loguear los datos crudos**: las listas de `columns` pueden contener texto
|
||||
sensible (reviews, comentarios, PII). En trazas usa solo conteos (`n`,
|
||||
`len(columns[col])`) y nombres de columna, no el dict completo.
|
||||
@@ -0,0 +1,112 @@
|
||||
"""extract_text_sample — muestrea columnas de texto de una tabla sin cargarla en RAM.
|
||||
|
||||
Funcion impura (lee de la base de datos) del grupo de capacidad `eda`. Dado un
|
||||
``db_path`` + ``table`` (DuckDB o PostgreSQL) y una lista de ``columns`` de texto,
|
||||
trae una MUESTRA de esas columnas con push-down SQL (``LIMIT sample``), nunca la
|
||||
tabla entera. La usan los capitulos de texto/NLP del AutomaticEDA que necesitan
|
||||
valores crudos de texto (longitudes, tokens, ejemplos) sin materializar millones
|
||||
de filas en memoria.
|
||||
|
||||
El lector read-only ``query_fn(sql) -> dict`` se construye igual que en
|
||||
``build_eda_render_ctx`` / ``profile_table``: un closure sobre el wrapper del
|
||||
registry (``duckdb_query_readonly`` / ``pg_query``), importado perezosamente
|
||||
dentro de la funcion para no crear ciclos al cargar el ``__init__`` del paquete
|
||||
``datascience``. Nunca abre conexiones fuera de esos wrappers.
|
||||
|
||||
Estilo dict-no-throw del grupo `eda`: la funcion NUNCA lanza. Captura cualquier
|
||||
excepcion (query, conversion) y devuelve ``{"status":"error", "error":str(e),
|
||||
"columns":{}, "n":0}``. Si la query subyacente devuelve ``status != "ok"``, se
|
||||
propaga como error con el mensaje del wrapper.
|
||||
|
||||
Por columna, la lista de strings solo contiene valores NO nulos y NO vacios:
|
||||
cada celda no-None se convierte con ``str(...)`` y se descarta si queda ``""``.
|
||||
La clave ``n`` reporta el numero de FILAS leidas por la query (antes de filtrar
|
||||
los None/vacios), util para saber cuanto se muestreo realmente.
|
||||
"""
|
||||
|
||||
|
||||
def extract_text_sample(db_path, table, columns, backend="duckdb", sample=2000):
    """Sample text columns of a DuckDB/PostgreSQL table with SQL push-down.

    Issues a single ``SELECT <cols> FROM <table> LIMIT <sample>`` through the
    read-only query wrapper of the selected backend and collects, per column,
    the non-empty string form of every non-NULL cell.

    Args:
        db_path: Passed verbatim as first argument to the backend wrapper
            (``duckdb_query_readonly`` or ``pg_query``). It is not validated
            here; a failing wrapper surfaces as an error result.
        table: Table name. Quoted as an SQL identifier (embedded double quotes
            are doubled).
        columns: List/tuple of column names. Entries that are not non-empty
            ``str`` are dropped and duplicates are collapsed (first occurrence
            wins), so each cell is read exactly once. Any other container type
            is treated as "no columns".
        backend: ``"duckdb"`` (default) or ``"postgres"``. Any other value
            returns an error result without running a query.
        sample: Row cap used in the ``LIMIT`` clause (coerced with ``int``).
            There is no ``ORDER BY``, so which rows are returned is up to the
            backend.

    Returns:
        A dict, never an exception:
        ``{"status": "ok", "columns": {name: [str, ...]}, "n": int}`` on
        success, where ``n`` is the number of rows returned by the query
        (before dropping ``None``/empty values); or
        ``{"status": "error", "error": str, "columns": {}, "n": 0}`` on any
        failure (unknown backend, wrapper status other than ``"ok"``, or any
        exception raised along the way).
    """

    def _quote_ident(name):
        # Standard SQL identifier quoting: wrap in double quotes and double any
        # embedded quote so the name cannot terminate the identifier early.
        return '"' + str(name).replace('"', '""') + '"'

    try:
        # 1) Read-only reader for the chosen backend. The imports are lazy so
        #    that loading this module does not import ``infra`` at module level.
        if backend == "duckdb":
            from infra import duckdb_query_readonly

            def query_fn(sql):
                return duckdb_query_readonly(db_path, sql)

        elif backend == "postgres":
            from infra import pg_query

            def query_fn(sql):
                return pg_query(db_path, sql)

        else:
            return {
                "status": "error",
                "error": f"backend desconocido: {backend}",
                "columns": {},
                "n": 0,
            }

        # 2) Keep valid column names only. ``dict.fromkeys`` removes duplicates
        #    while preserving order; without it a repeated name would append
        #    every cell once per repetition to the same output list.
        cols = []
        if isinstance(columns, (list, tuple)):
            cols = list(
                dict.fromkeys(
                    c for c in columns if isinstance(c, str) and c != ""
                )
            )
        if not cols:
            return {"status": "ok", "columns": {}, "n": 0}

        # 3) Push-down: one query, bounded by LIMIT.
        cols_sql = ", ".join(_quote_ident(c) for c in cols)
        sql = f"SELECT {cols_sql} FROM {_quote_ident(table)} LIMIT {int(sample)}"
        q = query_fn(sql)
        if not isinstance(q, dict) or q.get("status") != "ok":
            err = q.get("error") if isinstance(q, dict) else "query sin resultado"
            return {"status": "error", "error": str(err), "columns": {}, "n": 0}

        # 4) Collect non-NULL, non-empty values per column. Rows that are not
        #    dicts are skipped but still counted in ``n``.
        rows = q.get("rows") or []
        out = {c: [] for c in cols}
        for row in rows:
            if not isinstance(row, dict):
                continue
            for c in cols:
                value = row.get(c)
                if value is None:
                    continue
                s = str(value)
                if s:
                    out[c].append(s)

        return {"status": "ok", "columns": out, "n": len(rows)}
    except Exception as exc:  # noqa: BLE001 - dict-no-throw contract: never raise
        return {"status": "error", "error": str(exc), "columns": {}, "n": 0}
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Tests para extract_text_sample.
|
||||
|
||||
Self-contained: crea un DuckDB temporal pequeño con una columna de texto (algunas
|
||||
filas con NULL) y una numerica, y verifica que la muestra de texto trae solo los
|
||||
valores no nulos, que el backend desconocido y la lista de columnas vacia se
|
||||
manejan dict-no-throw, y que sample acota el numero de filas leidas.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_FUNCTIONS = os.path.abspath(os.path.join(_HERE, "..")) # python/functions
|
||||
if _FUNCTIONS not in sys.path:
|
||||
sys.path.insert(0, _FUNCTIONS)
|
||||
|
||||
import duckdb # noqa: E402
|
||||
|
||||
from datascience.extract_text_sample import extract_text_sample # noqa: E402
|
||||
|
||||
_TABLE = "t"
|
||||
# 6 filas: txt VARCHAR con dos NULL, other INT siempre presente.
|
||||
_ROWS = [
|
||||
("alpha", 1),
|
||||
("beta", 2),
|
||||
(None, 3),
|
||||
("gamma", 4),
|
||||
(None, 5),
|
||||
("delta", 6),
|
||||
]
|
||||
_TXT_NON_NULL = {"alpha", "beta", "gamma", "delta"}
|
||||
|
||||
|
||||
def _make_db(tmp_path):
    """Create a temporary DuckDB file holding the fixture table and return its path."""
    target = os.path.join(str(tmp_path), "text_sample.duckdb")
    connection = duckdb.connect(target)
    try:
        create_stmt = f'CREATE TABLE "{_TABLE}" (txt VARCHAR, other INTEGER)'
        insert_stmt = f'INSERT INTO "{_TABLE}" VALUES (?, ?)'
        connection.execute(create_stmt)
        connection.executemany(insert_stmt, _ROWS)
    finally:
        # Always release the connection, even if table creation fails.
        connection.close()
    return target
|
||||
|
||||
|
||||
def test_extract_basic(tmp_path):
    """Sampling ``txt`` returns only its non-NULL strings while ``n`` counts every row read."""
    outcome = extract_text_sample(_make_db(tmp_path), _TABLE, ["txt"])
    assert outcome["status"] == "ok"
    # n is the number of rows the query returned, NULL rows included.
    assert outcome["n"] == len(_ROWS)
    sampled = outcome["columns"]
    assert "txt" in sampled
    values = sampled["txt"]
    # The two NULL cells must have been dropped.
    assert set(values) == _TXT_NON_NULL
    assert len(values) == len(_TXT_NON_NULL)
    # Columns that were not requested must not show up.
    assert "other" not in sampled
|
||||
|
||||
|
||||
def test_backend_desconocido(tmp_path):
    """An unsupported backend name produces an error result with empty payload."""
    outcome = extract_text_sample(
        _make_db(tmp_path), _TABLE, ["txt"], backend="mysql"
    )
    assert outcome["status"] == "error"
    assert "backend desconocido" in outcome["error"]
    for field, empty_value in (("columns", {}), ("n", 0)):
        assert outcome[field] == empty_value
|
||||
|
||||
|
||||
def test_columns_vacio(tmp_path):
    """An empty column list is a successful no-op: ok status, no columns, zero rows."""
    outcome = extract_text_sample(_make_db(tmp_path), _TABLE, [])
    expected_fields = (("status", "ok"), ("columns", {}), ("n", 0))
    for field, expected in expected_fields:
        assert outcome[field] == expected
|
||||
|
||||
|
||||
def test_sample_limit(tmp_path):
    """``sample`` caps the number of rows the query reads."""
    row_cap = 2
    outcome = extract_text_sample(
        _make_db(tmp_path), _TABLE, ["txt"], sample=row_cap
    )
    assert outcome["status"] == "ok"
    # With sample=2 the query reads exactly two of the six fixture rows.
    assert outcome["n"] == 2
    assert len(outcome["columns"]["txt"]) <= 2
|
||||
@@ -0,0 +1,103 @@
|
||||
---
|
||||
id: missingness_corr_heatmap_figure_py_datascience
|
||||
name: missingness_corr_heatmap_figure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def missingness_corr_heatmap_figure(matrix, labels, title=\"Co-ocurrencia de ausencias\") -> \"matplotlib.figure.Figure\""
|
||||
description: "Construye una figura matplotlib (heatmap) de la matriz NxN de correlación de ausencias entre columnas: +1 = dos columnas suelen ser nulas a la vez, -1 = cuando una falta la otra está presente, 0 = ausencias independientes. Usa ax.imshow con coolwarm fijado a [-1,1], ticks con los labels truncados (X rotados 45º), colorbar y anota el valor de cada celda si N<=12. Devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA (capítulo de datos faltantes). Backend Agg sin pyplot global; defensivo ante matrix/labels vacíos o celdas no numéricas (nunca lanza)."
|
||||
tags: [eda, missing, missingness, correlation, heatmap, matplotlib, figure, visualization, datascience, impure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [matplotlib]
|
||||
example: |
|
||||
from datascience.missingness_corr_heatmap_figure import missingness_corr_heatmap_figure
|
||||
matrix = [
|
||||
[1.0, 0.82, -0.10],
|
||||
[0.82, 1.0, 0.05],
|
||||
[-0.10, 0.05, 1.0],
|
||||
]
|
||||
labels = ["telefono", "movil", "email"]
|
||||
fig = missingness_corr_heatmap_figure(matrix, labels, title="Co-ocurrencia de ausencias")
|
||||
tested: true
|
||||
tests:
|
||||
- "test_returns_figure_with_axes"
|
||||
- "test_empty_matrix_does_not_raise_and_returns_figure"
|
||||
- "test_empty_labels_returns_message_figure"
|
||||
- "test_large_matrix_omits_annotations"
|
||||
- "test_ragged_and_non_numeric_cells_are_handled"
|
||||
test_file_path: "python/functions/datascience/missingness_corr_heatmap_figure_test.py"
|
||||
file_path: "python/functions/datascience/missingness_corr_heatmap_figure.py"
|
||||
params:
|
||||
- name: matrix
|
||||
desc: "Lista de listas (NxN) de floats en [-1,1]: la correlación de ausencias por pares de columnas. Puede venir vacía. Filas de longitud desigual se toleran (se rellenan/recortan a N); celdas None, NaN o no numéricas se coercen a 0.0. No se muta el original."
|
||||
- name: labels
|
||||
desc: "Lista de N nombres de columna, paralela a matrix. Puede venir vacía (devuelve figura \"sin columnas con ausencia variable\"). Se truncan a ~14 chars con elipsis para los ticks; los originales no se mutan."
|
||||
- name: title
|
||||
desc: "Título de la figura. Se trunca a ~60 chars con elipsis si es muy largo. Default \"Co-ocurrencia de ausencias\"."
|
||||
output: "Un matplotlib.figure.Figure (figsize 6.4x5.2, dpi 150) con un Axes heatmap (imshow vmin=-1, vmax=1, cmap coolwarm) más una colorbar etiquetada \"correlación de ausencias\". Ticks en ambos ejes con los labels truncados (X rotados 45º). Si N<=12 cada celda lleva su valor numérico anotado (texto blanco sobre celdas saturadas, oscuro sobre pálidas); con N grande se omiten las anotaciones para no saturar. Si matrix o labels vienen vacíos devuelve una Figure con texto centrado \"sin columnas con ausencia variable\"; cualquier error inesperado se captura y devuelve una Figure con el mensaje de error (nunca lanza). El caller rasteriza/cierra la figura; la función no la muestra ni la guarda."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.missingness_corr_heatmap_figure import missingness_corr_heatmap_figure
|
||||
|
||||
# Correlación de ausencias entre 3 columnas de contacto:
|
||||
# telefono y movil tienden a faltar juntos (0.82); email es casi independiente.
|
||||
matrix = [
|
||||
[1.00, 0.82, -0.10],
|
||||
[0.82, 1.00, 0.05],
|
||||
[-0.10, 0.05, 1.00],
|
||||
]
|
||||
labels = ["telefono", "movil", "email"]
|
||||
|
||||
fig = missingness_corr_heatmap_figure(
|
||||
matrix,
|
||||
labels,
|
||||
title="Co-ocurrencia de ausencias",
|
||||
)
|
||||
|
||||
# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
|
||||
fig.savefig("/tmp/missingness_heatmap.png")
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala en el capítulo de datos faltantes de un informe EDA cuando quieras ver de
|
||||
un vistazo qué columnas faltan juntas (mismo formulario sin rellenar, mismo
|
||||
proceso roto) frente a columnas cuyas ausencias son independientes. Pásale la
|
||||
matriz de correlación de ausencias (calculada sobre la máscara de nulos, p. ej.
|
||||
`df.isnull().corr()`) restringida a las columnas que de verdad tienen ausencia
|
||||
variable, junto con sus nombres. Es la pareja "estructura" del ranking de % de
|
||||
nulos: las barras dicen *cuánto* falta cada columna, este heatmap dice *si las
|
||||
ausencias están relacionadas* entre columnas.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
|
||||
y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
|
||||
para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
|
||||
es thread-safe; esta función evita ese riesgo construyendo el `Figure`
|
||||
directamente, así que es segura de llamar en bucle desde el renderer.
|
||||
- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo
|
||||
guarda. Quien la consume debe rasterizarla y luego liberarla
|
||||
(`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes.
|
||||
- **Escala de color fija en [-1, 1].** `vmin=-1`, `vmax=1` están fijados a
|
||||
propósito para que el color sea comparable entre informes y entre columnas. No
|
||||
se autoescala al rango real de la matriz; valores fuera de `[-1, 1]` se
|
||||
saturan al extremo del colormap.
|
||||
- **Anotaciones solo con N<=12.** Por encima de 12 columnas el grid de números
|
||||
se vuelve ilegible y se omite; queda solo el color + la colorbar. Filtra a las
|
||||
columnas con ausencia variable antes de llamar para no llegar a matrices
|
||||
enormes.
|
||||
- **Defensiva, nunca lanza.** `matrix=[]`, `labels=[]`, filas cortas, celdas
|
||||
`None`/`NaN`/no numéricas o cualquier error inesperado se manejan sin propagar:
|
||||
en el peor caso devuelve una `Figure` con "sin columnas con ausencia variable"
|
||||
o con el texto del error. No envuelvas la llamada en try/except por miedo a un
|
||||
raise — no lo hay.
|
||||
@@ -0,0 +1,158 @@
|
||||
"""Impure EDA helper: heatmap of missingness co-occurrence (`eda` group).
|
||||
|
||||
Builds a matplotlib heatmap of the pairwise missingness correlation matrix of a
|
||||
dataset: a value near ``+1`` means two columns tend to be null together, near
|
||||
``-1`` means when one is null the other tends to be present, and ``0`` means
|
||||
their absences are independent. Returns a ready-to-rasterize
|
||||
``matplotlib.figure.Figure``; it never shows nor saves it.
|
||||
|
||||
Impure because it touches matplotlib's rendering machinery. It uses the headless
|
||||
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
|
||||
global state and is safe to call repeatedly from a report renderer.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
# Muted gray for secondary text (no-data / fallback messages).
|
||||
_MUTED_TEXT = "#5f6b7a"
|
||||
# Soft red for the error fallback message (kept readable, not alarming).
|
||||
_ERROR_TEXT = "#b00020"
|
||||
|
||||
|
||||
def _truncate(text, width: int = 14) -> str:
    """Render ``text`` as a string of at most ``width`` characters.

    ``None`` renders as the empty string. When the rendered text is longer
    than ``width`` it is cut and, if ``width`` leaves room for it (more than
    one character), the last kept position is replaced by an ellipsis.
    """
    rendered = str(text) if text is not None else ""
    if len(rendered) > width:
        if width > 1:
            return rendered[: width - 1] + "…"
        # No room for an ellipsis: plain cut.
        return rendered[:width]
    return rendered
|
||||
|
||||
|
||||
def _message_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
    """Build a fallback ``Figure`` that shows only ``message``, centered, with axes hidden."""
    figure = Figure(figsize=(6.4, 4.0), dpi=150)
    axes = figure.add_subplot(111)
    axes.axis("off")
    # Position is given in axes coordinates, so (0.5, 0.5) is the center.
    text_style = dict(
        ha="center",
        va="center",
        fontsize=12,
        color=color,
        wrap=True,
        transform=axes.transAxes,
    )
    axes.text(0.5, 0.5, message, **text_style)
    figure.tight_layout()
    return figure
|
||||
|
||||
|
||||
def missingness_corr_heatmap_figure(
    matrix,
    labels,
    title: str = "Co-ocurrencia de ausencias",
) -> "matplotlib.figure.Figure":
    """Build a heatmap figure of a missingness correlation matrix.

    The grid drawn is ``N x N`` with ``N = len(labels)``. It is rendered with
    ``imshow`` using the ``coolwarm`` colormap and a fixed ``vmin=-1`` /
    ``vmax=1`` range. Both axes carry the labels truncated to 14 characters
    (X labels rotated 45 degrees) and a colorbar is attached. Each cell is
    annotated with its value (two decimals) only when ``N <= 12``.

    Args:
        matrix: List/tuple of rows. Rows beyond ``N`` and cells beyond ``N``
            are ignored; missing rows, rows that are not a list/tuple and
            missing cells are filled with ``0.0``. Cells for which ``float()``
            raises ``TypeError``/``ValueError``, and NaN cells, become ``0.0``.
        labels: List/tuple of column names; its length fixes ``N``.
        title: Axes title, truncated to 60 characters. Skipped when falsy.

    Returns:
        A ``Figure``. If ``matrix`` or ``labels`` is empty or is not a
        list/tuple, the figure only carries the centered message
        "sin columnas con ausencia variable". If any exception is raised while
        drawing, the figure carries the error text instead. The figure is
        neither shown nor saved here.
    """

    def _to_number(cells, col):
        # Short rows are padded with 0.0; unreadable cells fall back to 0.0.
        raw = cells[col] if col < len(cells) else 0.0
        try:
            number = float(raw)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that is not equal to itself.
        return 0.0 if number != number else number

    try:
        both_sequences = isinstance(matrix, (list, tuple)) and isinstance(
            labels, (list, tuple)
        )
        if not both_sequences or len(matrix) == 0 or len(labels) == 0:
            return _message_figure("sin columnas con ausencia variable")

        size = len(labels)

        # Normalize the input into a clean size x size grid of floats so that
        # ragged input cannot break imshow.
        grid = []
        for r in range(size):
            cells = matrix[r] if r < len(matrix) else []
            if not isinstance(cells, (list, tuple)):
                cells = []
            grid.append([_to_number(cells, c) for c in range(size)])

        figure = Figure(figsize=(6.4, 5.2), dpi=150)
        axes = figure.add_subplot(111)

        image = axes.imshow(grid, vmin=-1, vmax=1, cmap="coolwarm", aspect="equal")

        tick_names = [_truncate(name, 14) for name in labels]
        axes.set_xticks(range(size))
        axes.set_yticks(range(size))
        axes.set_xticklabels(tick_names, rotation=45, ha="right", fontsize=8)
        axes.set_yticklabels(tick_names, fontsize=8)

        # Per-cell numbers are only drawn for small grids.
        if size <= 12:
            for r, row in enumerate(grid):
                for c, value in enumerate(row):
                    # Dark ink on pale cells, white ink on strongly colored ones.
                    ink = "#202020" if abs(value) < 0.55 else "white"
                    axes.text(
                        c,
                        r,
                        f"{value:.2f}",
                        ha="center",
                        va="center",
                        fontsize=7,
                        color=ink,
                    )

        colorbar = figure.colorbar(image, ax=axes, fraction=0.046, pad=0.04)
        colorbar.ax.tick_params(labelsize=8)
        colorbar.set_label("correlación de ausencias", fontsize=8)

        if title:
            axes.set_title(_truncate(title, 60), fontsize=12, loc="center", pad=10)

        figure.tight_layout()
        return figure
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _message_figure(f"error al dibujar heatmap: {exc}", color=_ERROR_TEXT)
|
||||
@@ -0,0 +1,62 @@
|
||||
"""Tests para missingness_corr_heatmap_figure (heatmap de ausencias, grupo eda).
|
||||
|
||||
Usa el backend Agg sin pyplot; no muestra ni guarda figuras. Cada test cierra
|
||||
explícitamente la Figure construida (matplotlib.pyplot.close) para no acumular
|
||||
estado entre tests.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
from missingness_corr_heatmap_figure import missingness_corr_heatmap_figure
|
||||
|
||||
|
||||
def _identity_matrix(n):
    """Return an ``n x n`` list-of-lists with 1.0 on the diagonal and 0.0 elsewhere."""
    rows = []
    for diagonal in range(n):
        row = [0.0] * n
        row[diagonal] = 1.0
        rows.append(row)
    return rows
|
||||
|
||||
|
||||
def test_returns_figure_with_axes():
    """A valid 3x3 matrix produces a ``Figure`` holding at least one Axes."""
    names = ["edad", "ingresos", "ciudad"]
    values = [[1.0, 0.3, -0.2], [0.3, 1.0, 0.5], [-0.2, 0.5, 1.0]]
    figure = missingness_corr_heatmap_figure(values, names, title="ausencias")
    assert isinstance(figure, Figure)
    # The heatmap Axes is always present (the colorbar adds one of its own).
    assert len(figure.axes) >= 1
    plt.close(figure)
|
||||
|
||||
|
||||
def test_empty_matrix_does_not_raise_and_returns_figure():
    """Empty matrix and labels still return a ``Figure`` with an Axes instead of raising."""
    figure = missingness_corr_heatmap_figure([], [], title="vacía")
    assert isinstance(figure, Figure)
    assert len(figure.axes) >= 1
    plt.close(figure)
|
||||
|
||||
|
||||
def test_empty_labels_returns_message_figure():
    """A non-empty matrix with no labels still returns a ``Figure``."""
    figure = missingness_corr_heatmap_figure([[1.0]], [], title="sin labels")
    assert isinstance(figure, Figure)
    plt.close(figure)
|
||||
|
||||
|
||||
def test_large_matrix_omits_annotations():
    """A 16x16 matrix (above the 12-column cutoff) is drawn without per-cell text."""
    n = 16
    fig = missingness_corr_heatmap_figure(
        _identity_matrix(n), [f"col_{i}" for i in range(n)]
    )
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    # The heatmap Axes is created first, so it is fig.axes[0]. Cell annotations
    # are added with ax.text and would show up in its ``texts`` list; this also
    # fails if the function fell back to a message-only figure.
    assert len(fig.axes[0].texts) == 0
    plt.close(fig)
|
||||
|
||||
|
||||
def test_ragged_and_non_numeric_cells_are_handled():
    """Rows of unequal length plus ``None`` and string cells do not raise."""
    # First row has a None cell; second row is too long and starts with a string.
    ragged = [[1.0, None], ["x", 1.0, 0.5]]
    figure = missingness_corr_heatmap_figure(ragged, ["a", "b"])
    assert isinstance(figure, Figure)
    plt.close(figure)
|
||||
@@ -0,0 +1,68 @@
|
||||
---
|
||||
name: missingness_correlation
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def missingness_correlation(null_mask: dict, top_k: int = 20) -> dict"
|
||||
description: "Co-ocurrencia de ausencias: nucleo del capitulo de missingness del grupo eda. Recibe la mascara binaria de nulos de una tabla (1 = falta, 0 = presente, alineada por fila) y mide hasta que punto las columnas faltan juntas. Calcula la matriz de correlacion de Pearson entre los vectores binarios de ausencia de las columnas con varianza (al menos un 1 y un 0), mas las cifras de solapamiento de conjuntos por par (co-missing, either-missing, Jaccard). Excluye las columnas constantes en su ausencia (correlacion indefinida) y reporta cuantas. Compone la funcion atomica pearson del registry; no la reimplementa. Lectura defensiva; NUNCA lanza."
|
||||
tags: [eda, missingness, correlation, pearson, co-occurrence, jaccard, datascience]
|
||||
params:
|
||||
- name: null_mask
|
||||
desc: "dict {col: [int 0/1, ...]} con la mascara de ausencias de la tabla, alineada por fila: 1 = el valor falta en esa fila, 0 = presente. Todas las listas se asumen de la misma longitud (numero de filas). Cualquier valor truthy (no solo 1) se trata como ausencia; las entradas que no son lista/tupla y las listas vacias se ignoran sin romper."
|
||||
- name: top_k
|
||||
desc: "Numero maximo de pares a devolver en `pairs`, ordenados por valor absoluto de correlacion descendente. Default 20. Solo limita la lista de pares; la matriz cubre siempre todas las columnas con varianza."
|
||||
output: "dict con: columns (columnas con varianza en la ausencia, en orden de entrada); matrix (len(columns) x len(columns) de correlacion de Pearson entre las mascaras binarias, diagonal 1.0); pairs (hasta top_k pares i<j ordenados por |corr| desc, cada uno {a, b, corr, co_missing, either_missing, jaccard} donde co_missing = filas en que ambas faltan, either_missing = filas en que al menos una falta, jaccard = co_missing/either_missing o 0.0 si either_missing=0); n_excluded (nº de columnas con algun nulo pero sin varianza, constantes en la ausencia); excluded_cols (esas columnas en orden de entrada). Si hay <2 columnas con varianza, columns/matrix/pairs van vacios pero n_excluded/excluded_cols se rellenan. NUNCA lanza."
|
||||
uses_functions: [pearson_py_datascience]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["test_co_ocurrencia_fuerte_corr_uno_jaccard_uno", "test_ausencias_disjuntas_corr_negativa_jaccard_cero", "test_columna_sin_varianza_se_excluye", "test_menos_de_dos_columnas_con_varianza_vacio_pero_cuenta_excluidas", "test_mask_vacio_todo_vacio", "test_top_k_limita_pares", "test_no_lanza_con_entradas_raras"]
|
||||
test_file_path: "python/functions/datascience/missingness_correlation_test.py"
|
||||
file_path: "python/functions/datascience/missingness_correlation.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.missingness_correlation import missingness_correlation
|
||||
|
||||
# Mascara de ausencias de 6 filas. 1 = falta, 0 = presente.
|
||||
mask = {
|
||||
"ingresos": [1, 0, 1, 0, 1, 0], # falta junto a "deducciones"
|
||||
"deducciones": [1, 0, 1, 0, 1, 0], # mismas filas que "ingresos"
|
||||
"telefono": [0, 0, 0, 1, 0, 0], # casi siempre presente
|
||||
"verificado": [1, 1, 1, 1, 1, 1], # siempre ausente -> constante, excluida
|
||||
}
|
||||
out = missingness_correlation(mask, top_k=10)
|
||||
|
||||
print(out["columns"]) # ['ingresos', 'deducciones', 'telefono']
|
||||
print(out["n_excluded"]) # 1
|
||||
print(out["excluded_cols"]) # ['verificado']
|
||||
|
||||
# El par mas fuerte: ingresos y deducciones faltan siempre juntas.
|
||||
top = out["pairs"][0]
|
||||
print(top["a"], top["b"], round(top["corr"], 3)) # ingresos deducciones 1.0
|
||||
print(top["co_missing"], top["either_missing"], top["jaccard"]) # 3 3 1.0
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
- Usala en el capitulo de **missingness** de `AutomaticEDA` cuando ya tengas la mascara binaria de nulos por columna y quieras detectar **patrones de ausencia conjunta**: que columnas faltan siempre juntas (posible misma fuente/proceso roto) y cuales faltan de forma independiente.
|
||||
- Cuando necesites ordenar los pares de columnas por fuerza de co-ocurrencia (|corr|) para priorizar que bloques de ausencia investigar o imputar juntos.
|
||||
- Cuando quieras la cifra de solapamiento de conjuntos (Jaccard, co-missing) ademas de la correlacion lineal, para distinguir "faltan juntas" de "estan presentes juntas".
|
||||
- Antes de elegir una estrategia de imputacion: dos columnas con corr de ausencia ~1.0 no aportan informacion independiente sobre por que falta la otra.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Funcion pura, sin I/O y determinista. Lectura defensiva: entradas no-dict, columnas no-lista o vacias se ignoran sin lanzar.
|
||||
- Solo entran al calculo las columnas con **varianza en la ausencia** (al menos un 1 y al menos un 0). Una columna siempre-presente (todo 0) no aporta ausencia y **no** se cuenta como excluida; una columna siempre-ausente o constante con nulos (todo 1) tiene correlacion indefinida y se excluye, sumando a `n_excluded` / `excluded_cols`.
|
||||
- Con menos de 2 columnas con varianza, `columns`/`matrix`/`pairs` quedan vacios pero `n_excluded`/`excluded_cols` se rellenan igual — el caller debe contemplar el caso "sin pares".
|
||||
- La correlacion es la de Pearson sobre vectores binarios (equivale al coeficiente phi). El signo importa: corr negativa = las ausencias tienden a ser **complementarias** (cuando una falta, la otra suele estar presente).
|
||||
- Asume todas las listas alineadas por fila y de la misma longitud. Si vienen de longitudes distintas, `pearson` opera sobre el solapamiento que permita `zip` y degrada a 0.0 cuando no hay varianza efectiva; alinea la mascara antes de llamar.
|
||||
@@ -0,0 +1,120 @@
|
||||
"""Co-ocurrencia de ausencias: matriz de correlacion de Pearson entre mascaras de nulos.
|
||||
|
||||
Funcion pura del grupo eda, nucleo del capitulo de missingness. Recibe la mascara
|
||||
binaria de ausencias de una tabla (1 = falta, 0 = presente, alineada por fila) y
|
||||
mide hasta que punto las columnas faltan juntas. Para cada par de columnas con
|
||||
varianza en su ausencia calcula la correlacion de Pearson entre los vectores
|
||||
binarios, mas las cifras de solapamiento de conjuntos (co-missing, either-missing,
|
||||
Jaccard). Compone la funcion atomica `pearson` del registry; no reimplementa la
|
||||
correlacion. Lectura defensiva; NUNCA lanza.
|
||||
"""
|
||||
|
||||
from datascience import pearson
|
||||
|
||||
|
||||
def missingness_correlation(null_mask, top_k=20) -> dict:
    """Measure how strongly columns go missing together.

    Args:
        null_mask: dict ``{col: [0/1, ...]}`` aligned by row, where a truthy
            cell means the value is missing on that row. All lists are
            assumed to have the same length. Values that are not a list or
            tuple, and empty sequences, are skipped.
        top_k: maximum number of pairs returned in ``pairs``. ``None`` or a
            negative value returns every pair. Non-integer values are
            coerced with ``int()``; a value that cannot be coerced is treated
            as "no limit" instead of raising.

    Returns:
        dict with:
          - columns: columns whose mask has variance (at least one 1 and at
            least one 0), in input order.
          - matrix: ``len(columns) x len(columns)`` matrix with the value
            returned by ``pearson`` for each pair of masks; diagonal 1.0.
          - pairs: up to ``top_k`` pairs (i < j) sorted by ``|corr|``
            descending, each ``{a, b, corr, co_missing, either_missing,
            jaccard}``.
          - n_excluded: number of columns that are missing on every row
            (they carry nulls but no variance). Columns that are never
            missing are NOT counted here.
          - excluded_cols: those always-missing columns, in input order.

        With fewer than two varying columns, ``columns``/``matrix``/``pairs``
        stay empty while ``n_excluded``/``excluded_cols`` are still filled.
    """
    # Base output; also what a non-dict or empty input gets back.
    result = {
        "columns": [],
        "matrix": [],
        "pairs": [],
        "n_excluded": 0,
        "excluded_cols": [],
    }

    if not isinstance(null_mask, dict) or not null_mask:
        return result

    varying = []        # columns with both missing and present rows
    varying_vecs = []   # their sanitized masks as floats 0.0 / 1.0
    excluded_cols = []  # columns missing on every row

    for col, raw in null_mask.items():
        if not isinstance(raw, (list, tuple)):
            continue
        # Any truthy cell counts as missing.
        vec = [1 if v else 0 for v in raw]
        if not vec:
            continue
        ones = sum(vec)
        if 0 < ones < len(vec):
            varying.append(col)
            varying_vecs.append([float(v) for v in vec])
        elif ones == len(vec):
            # Missing everywhere: correlation is undefined, so exclude it.
            excluded_cols.append(col)
        # ones == 0: always present, neither analysed nor reported as excluded.

    result["n_excluded"] = len(excluded_cols)
    result["excluded_cols"] = excluded_cols

    n = len(varying)
    if n < 2:
        return result

    result["columns"] = list(varying)

    # Single pass over the upper triangle: fill the symmetric matrix and
    # collect the set-overlap figures for each pair.
    matrix = [[1.0 if i == j else 0.0 for j in range(n)] for i in range(n)]
    pairs = []
    for i in range(n):
        vi = varying_vecs[i]
        for j in range(i + 1, n):
            vj = varying_vecs[j]
            r = pearson(vi, vj)
            matrix[i][j] = r
            matrix[j][i] = r

            co_missing = 0
            either_missing = 0
            for a, b in zip(vi, vj):
                if a and b:
                    co_missing += 1
                if a or b:
                    either_missing += 1
            jaccard = co_missing / either_missing if either_missing > 0 else 0.0

            pairs.append({
                "a": varying[i],
                "b": varying[j],
                "corr": r,
                "co_missing": co_missing,
                "either_missing": either_missing,
                "jaccard": jaccard,
            })
    result["matrix"] = matrix

    pairs.sort(key=lambda p: abs(p["corr"]), reverse=True)

    # Coerce top_k defensively: a float or numeric string must not raise at
    # the comparison or the slice.
    limit = None
    if top_k is not None:
        try:
            limit = int(top_k)
        except (TypeError, ValueError, OverflowError):
            limit = None
    result["pairs"] = pairs[:limit] if limit is not None and limit >= 0 else pairs

    return result
|
||||
@@ -0,0 +1,115 @@
|
||||
"""Tests para missingness_correlation."""
|
||||
|
||||
from datascience.missingness_correlation import missingness_correlation
|
||||
|
||||
|
||||
def test_co_ocurrencia_fuerte_corr_uno_jaccard_uno():
    """Two columns missing on exactly the same rows give corr 1.0 and Jaccard 1.0."""
    shared = [1, 0, 1, 0, 1, 0]
    result = missingness_correlation({"a": list(shared), "b": list(shared)})

    assert result["columns"] == ["a", "b"]
    assert result["n_excluded"] == 0

    # Diagonal is exactly 1.0; the off-diagonal cell is 1.0 up to rounding.
    grid = result["matrix"]
    assert grid[0][0] == 1.0
    assert grid[1][1] == 1.0
    assert abs(grid[0][1] - 1.0) < 1e-9

    assert len(result["pairs"]) == 1
    only = result["pairs"][0]
    assert {only["a"], only["b"]} == {"a", "b"}
    assert abs(only["corr"] - 1.0) < 1e-9
    assert only["co_missing"] == 3  # rows 0, 2, 4
    assert only["either_missing"] == 3  # the same rows
    assert abs(only["jaccard"] - 1.0) < 1e-9
|
||||
|
||||
|
||||
def test_ausencias_disjuntas_corr_negativa_jaccard_cero():
    """Columns never missing on the same row: zero overlap and negative correlation."""
    result = missingness_correlation({"a": [1, 1, 0, 0], "b": [0, 0, 1, 1]})
    assert result["columns"] == ["a", "b"]

    first = result["pairs"][0]
    assert first["co_missing"] == 0
    assert first["either_missing"] == 4
    assert first["jaccard"] == 0.0
    # No overlap and complementary absences -> the correlation must be negative
    # and must match the matrix cell for the same pair.
    assert first["corr"] < 0.0
    assert abs(first["corr"] - result["matrix"][0][1]) < 1e-12
|
||||
|
||||
|
||||
def test_columna_sin_varianza_se_excluye():
    """Constant columns are dropped; only the always-missing one is reported as excluded."""
    # "c" is always present (all 0): it is neither analysed nor excluded.
    # "d" is always missing (all 1): it has nulls but no variance, so it is
    # excluded and counted. "a" and "b" both have variance.
    result = missingness_correlation(
        {
            "a": [1, 0, 1, 0],
            "b": [1, 0, 0, 0],
            "c": [0, 0, 0, 0],  # always present
            "d": [1, 1, 1, 1],  # always missing, constant
        }
    )
    assert result["columns"] == ["a", "b"]

    dropped = result["excluded_cols"]
    assert "d" in dropped
    assert "c" not in dropped
    assert result["n_excluded"] == 1

    # The matrix only covers the columns with variance.
    grid = result["matrix"]
    assert len(grid) == 2
    assert len(grid[0]) == 2
|
||||
|
||||
|
||||
def test_menos_de_dos_columnas_con_varianza_vacio_pero_cuenta_excluidas():
    """With a single varying column the analysis is empty but exclusions are still reported."""
    # One column with variance ("a") plus one always-missing column ("d").
    result = missingness_correlation({"a": [1, 0, 1, 0], "d": [1, 1, 1, 1]})

    for empty_key in ("columns", "matrix", "pairs"):
        assert result[empty_key] == []
    assert result["n_excluded"] == 1
    assert result["excluded_cols"] == ["d"]
|
||||
|
||||
|
||||
def test_mask_vacio_todo_vacio():
    """An empty mask returns the base output with every field empty or zero."""
    expected = {
        "columns": [],
        "matrix": [],
        "pairs": [],
        "n_excluded": 0,
        "excluded_cols": [],
    }
    assert missingness_correlation({}) == expected
|
||||
|
||||
|
||||
def test_top_k_limita_pares():
    """top_k caps the number of returned pairs and keeps them sorted by |corr|."""
    # Four varying columns make six pairs; top_k=2 keeps two of them.
    four_cols = {
        "a": [1, 0, 1, 0, 0],
        "b": [1, 0, 0, 1, 0],
        "c": [0, 1, 1, 0, 1],
        "d": [1, 1, 0, 0, 1],
    }
    result = missingness_correlation(four_cols, top_k=2)
    assert len(result["columns"]) == 4

    kept = result["pairs"]
    assert len(kept) == 2
    # Sorted by absolute correlation, strongest first.
    assert abs(kept[0]["corr"]) >= abs(kept[1]["corr"])
|
||||
|
||||
|
||||
def test_no_lanza_con_entradas_raras():
    """Non-dict input and non-list or empty columns are tolerated without raising."""
    # A non-dict mask degrades to the empty output.
    assert missingness_correlation(None)["columns"] == []

    # A string column and an empty list are skipped; the valid columns remain.
    noisy = {
        "a": [1, 0, 1, 0],
        "b": [1, 0, 1, 0],
        "bad": "not a list",
        "empty": [],
    }
    result = missingness_correlation(noisy)
    assert result["columns"] == ["a", "b"]
|
||||
@@ -0,0 +1,99 @@
|
||||
---
|
||||
id: missingness_overview_py_datascience
|
||||
name: missingness_overview
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def missingness_overview(null_mask) -> dict"
|
||||
description: "Resumen de ausencias a nivel de dataset a partir de una máscara de nulos 0/1 por columna ({col: [1=falta, 0=presente]} alineada por fila). Calcula celdas y porcentaje de datos faltantes, cuántas columnas tienen algún nulo y cuántas filas son completas vs. incompletas. Estilo dict-no-throw del grupo eda: nunca lanza. Lectura defensiva — no-dict o dict vacío devuelve todo a 0; columnas no-lista se tratan como vacías; listas de longitud distinta se alinean a la longitud máxima rellenando la cola corta como presente (0); valores None/no-int cuentan como presente; sin ZeroDivisionError."
|
||||
tags: [eda, missing, missingness, nulls, profiling, datascience, pure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
example: |
|
||||
from datascience.missingness_overview import missingness_overview
|
||||
mask = {
|
||||
"a": [1, 0, 0, 0, 1],
|
||||
"b": [1, 0, 1, 0, 0],
|
||||
"c": [0, 0, 0, 0, 1],
|
||||
}
|
||||
missingness_overview(mask)
|
||||
# n_missing_cells=5, missing_cell_pct≈33.33, complete_rows=2, incomplete_rows=3
|
||||
tested: true
|
||||
tests:
|
||||
- "test_cooccurrence_three_cols_exact"
|
||||
- "test_empty_dict_all_zero"
|
||||
- "test_output_keys_contract"
|
||||
- "test_not_a_dict_returns_zero"
|
||||
- "test_no_nulls_all_complete"
|
||||
- "test_none_values_treated_as_present"
|
||||
- "test_unequal_lengths_pad_with_max"
|
||||
- "test_columns_present_but_no_rows"
|
||||
- "test_never_raises_on_garbage"
|
||||
test_file_path: "python/functions/datascience/missingness_overview_test.py"
|
||||
file_path: "python/functions/datascience/missingness_overview.py"
|
||||
params:
|
||||
- name: null_mask
|
||||
desc: "Dict {col_name: [int 0/1, ...]} con la máscara de nulos por columna, alineada por fila (1 = el valor falta, 0 = el valor está presente). Normalmente todas las listas tienen la misma longitud = nº de filas. Lectura defensiva: si no es dict o está vacío se devuelve todo a 0; columnas cuyo valor no es lista/tupla se tratan como vacías; listas de longitud distinta se alinean a la longitud máxima (las posiciones inexistentes de las columnas más cortas cuentan como presentes, 0); valores None o no enteros cuentan como presentes."
|
||||
output: "Dict con exactamente 9 claves, todas siempre presentes (la función nunca lanza): n_rows (longitud de fila = longitud máxima entre columnas, 0 si vacío), n_cols (nº de columnas), n_cols_with_null (columnas con >=1 falta), n_missing_cells (suma total de 1s), missing_cell_pct (0-100 = n_missing_cells / (n_rows*n_cols) * 100), complete_rows (filas sin ninguna falta), incomplete_rows (filas con >=1 falta), complete_pct (0-100), incomplete_pct (0-100). Los porcentajes son 0.0 cuando el denominador es 0 (sin ZeroDivisionError)."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.missingness_overview import missingness_overview
|
||||
|
||||
# Máscara de nulos por columna: 1 = falta, 0 = presente, alineada por fila.
|
||||
mask = {
|
||||
"a": [1, 0, 0, 0, 1],
|
||||
"b": [1, 0, 1, 0, 0],
|
||||
"c": [0, 0, 0, 0, 1],
|
||||
}
|
||||
|
||||
missingness_overview(mask)
|
||||
# {
|
||||
# "n_rows": 5,
|
||||
# "n_cols": 3,
|
||||
# "n_cols_with_null": 3, # a, b y c tienen al menos una falta
|
||||
# "n_missing_cells": 5, # 2 (a) + 2 (b) + 1 (c)
|
||||
# "missing_cell_pct": 33.33, # 5 / (5*3) * 100
|
||||
# "complete_rows": 2, # filas 1 y 3 sin ninguna falta
|
||||
# "incomplete_rows": 3, # filas 0 (a&b), 2 (b), 4 (a&c)
|
||||
# "complete_pct": 40.0, # 2 / 5 * 100
|
||||
# "incomplete_pct": 60.0, # 3 / 5 * 100
|
||||
# }
|
||||
|
||||
missingness_overview({})
|
||||
# Todo a 0: {"n_rows": 0, "n_cols": 0, "n_cols_with_null": 0,
|
||||
# "n_missing_cells": 0, "missing_cell_pct": 0.0,
|
||||
# "complete_rows": 0, "incomplete_rows": 0,
|
||||
# "complete_pct": 0.0, "incomplete_pct": 0.0}
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala al perfilar un dataset cuando ya tienes una máscara de nulos 0/1 por
|
||||
columna (p. ej. derivada del paso de carga/perfilado del EDA) y quieres la foto
|
||||
global de ausencias en una llamada: cuánta proporción de celdas falta, cuántas
|
||||
columnas están afectadas y, sobre todo, cuántas filas quedan completas vs.
|
||||
incompletas. Es el bloque resumen del capítulo de calidad/missingness de un EDA,
|
||||
y la base para decidir estrategias de imputación o de borrado de filas. Como es
|
||||
pura y dict-no-throw, puedes alimentarla con la máscara tal cual sin validarla
|
||||
antes: entradas malformadas degradan a ceros en vez de romper el pipeline.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **`n_rows` es la longitud máxima entre columnas.** Con listas de longitud
|
||||
desigual, las posiciones que faltan en las columnas más cortas se cuentan como
|
||||
presentes (`0`); no se descartan filas. En el caso normal (todas las listas de
|
||||
igual longitud) `n_rows` es simplemente esa longitud.
|
||||
- **Solo un valor igual a `1` (`1`, `1.0`) cuenta como falta.** `None`, `0`, cadenas y
|
||||
cualquier otro valor se tratan como presentes. `True` (== 1) también cuenta
|
||||
como falta por la igualdad.
|
||||
- **Porcentajes en escala 0-100**, no fracciones. División por cero protegida:
|
||||
con `n_rows*n_cols == 0` los porcentajes salen `0.0`.
|
||||
@@ -0,0 +1,116 @@
|
||||
"""Pure EDA helper: dataset-level missingness overview from a 0/1 null mask.
|
||||
|
||||
Part of the `eda` capability group. Consumes a per-column null mask
|
||||
(``{col_name: [int 0/1, ...]}`` aligned by row, ``1`` = value is missing,
|
||||
``0`` = value is present) and derives dataset-wide missingness metrics: cell
|
||||
count and percentage of missing data, how many columns carry any null, and how
|
||||
many rows are complete vs. incomplete.
|
||||
|
||||
Dict-no-throw style of the `eda` group: it NEVER raises. A non-dict, an empty
|
||||
dict, malformed columns, ragged lists or non-int cell values all degrade
|
||||
gracefully to the zero/contract output. Stdlib only.
|
||||
|
||||
Ragged-length policy: columns are allowed to have different lengths. ``n_rows``
|
||||
is the **maximum** column length; positions that don't exist in a shorter
|
||||
column are treated as present (``0``). This keeps the ``n_rows * n_cols`` cell
|
||||
grid well defined without dropping rows.
|
||||
"""
|
||||
|
||||
|
||||
def _is_missing(value) -> int:
|
||||
"""Return ``1`` iff ``value`` denotes a missing cell, else ``0``.
|
||||
|
||||
Only an exact equality to ``1`` (covers ``int`` ``1`` and ``float`` ``1.0``)
|
||||
counts as missing. ``None``, ``0``, strings and any other value are treated
|
||||
as present. The comparison cannot raise for standard inputs.
|
||||
"""
|
||||
try:
|
||||
return 1 if value == 1 else 0
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def missingness_overview(null_mask) -> dict:
    """Build the dataset-level missingness summary for a 0/1 null mask.

    Args:
        null_mask: Dict ``{col_name: [0/1, ...]}`` aligned by row, where ``1``
            marks a missing value. A non-dict or empty dict yields the
            all-zero summary. Columns whose value is not a list/tuple count
            as empty. Ragged columns are aligned to the longest one, with the
            absent tail of shorter columns counted as present. Cells are
            classified by ``_is_missing``.

    Returns:
        Dict with the keys ``n_rows``, ``n_cols``, ``n_cols_with_null``,
        ``n_missing_cells``, ``missing_cell_pct``, ``complete_rows``,
        ``incomplete_rows``, ``complete_pct`` and ``incomplete_pct``.
        Percentages are on a 0-100 scale and are ``0.0`` when there are no
        rows.
    """
    summary = {
        "n_rows": 0,
        "n_cols": 0,
        "n_cols_with_null": 0,
        "n_missing_cells": 0,
        "missing_cell_pct": 0.0,
        "complete_rows": 0,
        "incomplete_rows": 0,
        "complete_pct": 0.0,
        "incomplete_pct": 0.0,
    }

    if not (isinstance(null_mask, dict) and null_mask):
        return summary

    # Keep one sequence per column; anything that is not a list/tuple is
    # replaced by an empty column so it still counts towards n_cols.
    columns = [
        seq if isinstance(seq, (list, tuple)) else []
        for seq in null_mask.values()
    ]
    n_cols = len(columns)
    n_rows = max((len(seq) for seq in columns), default=0)
    summary["n_cols"] = n_cols

    if n_rows == 0:
        # Columns exist but none carries a row: only n_cols is non-zero.
        return summary

    missing_total = 0
    affected_cols = 0
    row_flags = [False] * n_rows

    for seq in columns:
        # Positions past the end of a short column are simply never visited,
        # which is the same as counting them as present.
        hits = [idx for idx, cell in enumerate(seq) if _is_missing(cell)]
        if not hits:
            continue
        affected_cols += 1
        missing_total += len(hits)
        for idx in hits:
            row_flags[idx] = True

    incomplete_rows = row_flags.count(True)
    complete_rows = n_rows - incomplete_rows
    total_cells = n_rows * n_cols

    summary["n_rows"] = n_rows
    summary["n_cols_with_null"] = affected_cols
    summary["n_missing_cells"] = missing_total
    if total_cells:
        summary["missing_cell_pct"] = missing_total / total_cells * 100.0
    summary["complete_rows"] = complete_rows
    summary["incomplete_rows"] = incomplete_rows
    summary["complete_pct"] = complete_rows / n_rows * 100.0
    summary["incomplete_pct"] = incomplete_rows / n_rows * 100.0
    return summary
|
||||
@@ -0,0 +1,146 @@
|
||||
"""Tests para missingness_overview."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from missingness_overview import missingness_overview
|
||||
|
||||
|
||||
# Output contract: every call to missingness_overview is expected to return
# exactly these 9 keys. The tests compare set(result.keys()) against this set.
EXPECTED_KEYS = {
    "n_rows",
    "n_cols",
    "n_cols_with_null",
    "n_missing_cells",
    "missing_cell_pct",
    "complete_rows",
    "incomplete_rows",
    "complete_pct",
    "incomplete_pct",
}
|
||||
|
||||
|
||||
def test_cooccurrence_three_cols_exact():
    """Three columns over five rows: every metric is checked against hand-computed values."""
    # Hand-computed expectations:
    #   a is missing at rows 0 and 4 -> 2 cells
    #   b is missing at rows 0 and 2 -> 2 cells
    #   c is missing at row 4        -> 1 cell
    #   5 missing cells out of 5 * 3 = 15 -> 33.333...%
    #   rows 0 (a and b), 2 (b) and 4 (a and c) are incomplete;
    #   rows 1 and 3 are complete.
    result = missingness_overview(
        {
            "a": [1, 0, 0, 0, 1],
            "b": [1, 0, 1, 0, 0],
            "c": [0, 0, 0, 0, 1],
        }
    )

    assert result["n_rows"] == 5
    assert result["n_cols"] == 3
    assert result["n_cols_with_null"] == 3
    assert result["n_missing_cells"] == 5
    assert result["missing_cell_pct"] == pytest.approx(33.33333333, abs=1e-6)
    assert result["complete_rows"] == 2
    assert result["incomplete_rows"] == 3
    assert result["complete_pct"] == pytest.approx(40.0)
    assert result["incomplete_pct"] == pytest.approx(60.0)
|
||||
|
||||
|
||||
def test_empty_dict_all_zero():
    """An empty mask returns the full contract with every value at zero."""
    all_zero = {
        "n_rows": 0,
        "n_cols": 0,
        "n_cols_with_null": 0,
        "n_missing_cells": 0,
        "missing_cell_pct": 0.0,
        "complete_rows": 0,
        "incomplete_rows": 0,
        "complete_pct": 0.0,
        "incomplete_pct": 0.0,
    }
    assert missingness_overview({}) == all_zero
|
||||
|
||||
|
||||
def test_output_keys_contract():
    """Both the zero path and a regular call expose exactly the expected keys."""
    # The 9-key contract holds even for the garbage/zero path.
    zero_path = missingness_overview({})
    assert set(zero_path.keys()) == EXPECTED_KEYS

    regular = missingness_overview({"a": [1, 0]})
    assert set(regular.keys()) == EXPECTED_KEYS
|
||||
|
||||
|
||||
def test_not_a_dict_returns_zero():
    """Inputs that are not a dict all degrade to zeroed metrics."""
    not_dicts = (None, [1, 0, 1], 42, "nope", 3.14)
    for candidate in not_dicts:
        result = missingness_overview(candidate)
        assert result["n_rows"] == 0
        assert result["n_cols"] == 0
        assert result["n_missing_cells"] == 0
        assert result["missing_cell_pct"] == 0.0
|
||||
|
||||
|
||||
def test_no_nulls_all_complete():
    """A mask without any 1 reports every row as complete and no missing cells."""
    result = missingness_overview({"a": [0, 0, 0], "b": [0, 0, 0]})

    assert result["n_rows"] == 3
    assert result["n_cols"] == 2
    assert result["n_cols_with_null"] == 0
    assert result["n_missing_cells"] == 0
    assert result["missing_cell_pct"] == 0.0
    assert result["complete_rows"] == 3
    assert result["incomplete_rows"] == 0
    assert result["complete_pct"] == pytest.approx(100.0)
    assert result["incomplete_pct"] == pytest.approx(0.0)
|
||||
|
||||
|
||||
def test_none_values_treated_as_present():
    """Only the literal 1 counts as missing; None, strings and 0 count as present."""
    # None and other non-1 values count as present (0).
    result = missingness_overview({"a": [None, 1, None, "x", 0]})

    assert result["n_rows"] == 5
    assert result["n_cols"] == 1
    assert result["n_missing_cells"] == 1  # only the explicit 1 at row 1
    assert result["n_cols_with_null"] == 1
    assert result["complete_rows"] == 4
    assert result["incomplete_rows"] == 1
|
||||
|
||||
|
||||
def test_unequal_lengths_pad_with_max():
    """Ragged columns: n_rows is the longest length and the short tail counts as present."""
    # a = [1, 1] -> missing at rows 0 and 1
    # b = [0]    -> row 1 does not exist and is padded as present
    # n_rows = 2, n_cols = 2, 4 cells in total, 2 missing -> 50%
    ragged = {"a": [1, 1], "b": [0]}
    result = missingness_overview(ragged)

    assert result["n_rows"] == 2
    assert result["n_cols"] == 2
    assert result["n_cols_with_null"] == 1
    assert result["n_missing_cells"] == 2
    assert result["missing_cell_pct"] == pytest.approx(50.0)
    assert result["complete_rows"] == 0
    assert result["incomplete_rows"] == 2
    assert result["incomplete_pct"] == pytest.approx(100.0)
|
||||
|
||||
|
||||
def test_columns_present_but_no_rows():
    """Columns without any row keep n_cols while every other metric stays at zero."""
    # Columns exist but are all empty -> zero metrics, n_cols preserved.
    result = missingness_overview({"a": [], "b": []})

    assert result["n_rows"] == 0
    assert result["n_cols"] == 2
    assert result["n_missing_cells"] == 0
    assert result["missing_cell_pct"] == 0.0
    assert result["complete_pct"] == 0.0
|
||||
|
||||
|
||||
def test_never_raises_on_garbage():
    """Non-list column values are tolerated and only the valid column contributes."""
    # A string and an int in place of lists must not raise.
    junk = {"a": "not a list", "b": 123, "c": [1, 0, 1]}
    result = missingness_overview(junk)

    assert set(result.keys()) == EXPECTED_KEYS
    assert result["n_rows"] == 3
    assert result["n_cols"] == 3
    assert result["n_missing_cells"] == 2  # only col c contributes
    assert result["n_cols_with_null"] == 1
|
||||
@@ -0,0 +1,93 @@
|
||||
---
|
||||
id: missingness_rank_bar_figure_py_datascience
|
||||
name: missingness_rank_bar_figure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def missingness_rank_bar_figure(names, pcts, title=\"% de valores faltantes por columna\") -> \"matplotlib.figure.Figure\""
|
||||
description: "Construye una figura matplotlib de barras horizontales que ordena las columnas de un dataset por su porcentaje de valores faltantes (0-100), la mayor arriba, etiquetando cada barra con su NN.N% al final. Usa ax.barh, eje X fijo 0-100 y labels truncados a ~22 chars. Devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA (capítulo de datos faltantes). Backend Agg sin pyplot global; defensivo ante listas vacías, longitudes desiguales o valores no numéricos (nunca lanza)."
|
||||
tags: [eda, missing, missingness, ranking, bar, barh, matplotlib, figure, visualization, datascience, impure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [matplotlib]
|
||||
example: |
|
||||
from datascience.missingness_rank_bar_figure import missingness_rank_bar_figure
|
||||
names = ["edad", "ingresos", "ciudad", "email"]
|
||||
pcts = [12.5, 40.0, 3.2, 0.0]
|
||||
fig = missingness_rank_bar_figure(names, pcts, title="% de valores faltantes por columna")
|
||||
tested: true
|
||||
tests:
|
||||
- "test_returns_figure_with_axes"
|
||||
- "test_sorted_descending_largest_on_top"
|
||||
- "test_empty_lists_do_not_raise_and_returns_figure"
|
||||
- "test_xlim_is_zero_to_hundred"
|
||||
- "test_length_mismatch_and_non_numeric_are_handled"
|
||||
test_file_path: "python/functions/datascience/missingness_rank_bar_figure_test.py"
|
||||
file_path: "python/functions/datascience/missingness_rank_bar_figure.py"
|
||||
params:
|
||||
- name: names
|
||||
desc: "Lista de nombres de columna. Puede venir vacía (devuelve figura \"sin datos faltantes\"). Los items se convierten a str y se truncan a ~22 chars con elipsis para las etiquetas del eje Y; los originales no se mutan."
|
||||
- name: pcts
|
||||
desc: "Lista paralela a names con el % de nulos en [0,100]. Valores None, NaN o no numéricos se coercen a 0.0 y los negativos se recortan a 0. Si len(names) != len(pcts) se recorta al menor de ambos para no romper."
|
||||
- name: title
|
||||
desc: "Título de la figura. Se trunca a ~60 chars con elipsis si es muy largo. Default \"% de valores faltantes por columna\"."
|
||||
output: "Un matplotlib.figure.Figure (figsize 6.4 x alto adaptativo según nº de barras, dpi 150) con un Axes de barras horizontales (ax.barh) ordenadas por % descendente, la mayor arriba. Eje X fijado a [0,100] con label \"% faltante\", etiquetas del eje Y truncadas a ~22 chars, y cada barra anotada con su NN.N% al final. Si names o pcts vienen vacíos devuelve una Figure con texto centrado \"sin datos faltantes\"; cualquier error inesperado se captura y devuelve una Figure con el mensaje de error (nunca lanza). El caller rasteriza/cierra la figura; la función no la muestra ni la guarda."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience.missingness_rank_bar_figure import missingness_rank_bar_figure
|
||||
|
||||
# % de nulos por columna, p. ej. (df.isnull().mean() * 100).
|
||||
names = ["edad", "ingresos", "ciudad", "email"]
|
||||
pcts = [12.5, 40.0, 3.2, 0.0]
|
||||
|
||||
fig = missingness_rank_bar_figure(
|
||||
names,
|
||||
pcts,
|
||||
title="% de valores faltantes por columna",
|
||||
)
|
||||
|
||||
# ingresos (40.0%) queda arriba; email (0.0%) abajo.
|
||||
# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
|
||||
fig.savefig("/tmp/missingness_rank.png")
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala al abrir el capítulo de datos faltantes de un informe EDA para responder
|
||||
"¿qué columnas están más incompletas?" de un vistazo. Pásale los nombres de
|
||||
columna y el % de nulos de cada una (`(df.isnull().mean() * 100).round(1)`); la
|
||||
función se encarga de ordenar de mayor a menor y poner la peor arriba. Es la
|
||||
pareja "magnitud" del heatmap de co-ocurrencia: las barras dicen *cuánto* falta
|
||||
en cada columna, el heatmap dice *si esas ausencias están relacionadas* entre
|
||||
columnas.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
|
||||
y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
|
||||
para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
|
||||
es thread-safe; esta función evita ese riesgo construyendo el `Figure`
|
||||
directamente, así que es segura de llamar en bucle desde el renderer.
|
||||
- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo
|
||||
guarda. Quien la consume debe rasterizarla y luego liberarla
|
||||
(`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes.
|
||||
- **Espera porcentajes 0-100, no fracciones 0-1.** El eje X está fijado a
|
||||
`[0, 100]`. Si pasas fracciones (`0.4` en vez de `40.0`) las barras saldrán
|
||||
pegadas al origen. Multiplica por 100 antes de llamar.
|
||||
- **Alto adaptativo.** La altura de la figura crece con el número de barras
|
||||
(hasta un tope) para que reports con muchas columnas sigan legibles; aun así,
|
||||
conviene filtrar a las columnas con algún nulo antes de llamar para no listar
|
||||
decenas de barras a 0%.
|
||||
- **Defensiva, nunca lanza.** Listas vacías, longitudes desiguales, valores
|
||||
`None`/`NaN`/no numéricos o cualquier error inesperado se manejan sin propagar:
|
||||
en el peor caso devuelve una `Figure` con "sin datos faltantes" o con el texto
|
||||
del error. No envuelvas la llamada en try/except por miedo a un raise — no lo
|
||||
hay.
|
||||
@@ -0,0 +1,150 @@
|
||||
"""Impure EDA helper: ranked bar figure of missing-value share (`eda` group).
|
||||
|
||||
Builds a horizontal bar chart ranking the columns of a dataset by their
|
||||
percentage of missing values (0-100), largest at the top, each bar labelled with
|
||||
its ``NN.N%`` at the end. Returns a ready-to-rasterize
|
||||
``matplotlib.figure.Figure``; it never shows nor saves it.
|
||||
|
||||
Impure because it touches matplotlib's rendering machinery. It uses the headless
|
||||
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
|
||||
global state and is safe to call repeatedly from a report renderer.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
# Muted gray for secondary text (no-data / fallback messages).
|
||||
_MUTED_TEXT = "#5f6b7a"
|
||||
# Soft red for the error fallback message.
|
||||
_ERROR_TEXT = "#b00020"
|
||||
# Bar fill — a calm blue that reads well on white at report size.
|
||||
_BAR_COLOR = "#4C72B0"
|
||||
|
||||
|
||||
def _truncate(text, width: int = 22) -> str:
|
||||
"""Truncate ``text`` to ``width`` chars, appending an ellipsis if cut."""
|
||||
s = "" if text is None else str(text)
|
||||
if len(s) <= width:
|
||||
return s
|
||||
if width <= 1:
|
||||
return s[:width]
|
||||
return s[: width - 1] + "…"
|
||||
|
||||
|
||||
def _message_figure(message: str, color: str = _MUTED_TEXT) -> "Figure":
    """Build a fallback ``Figure`` whose only content is one centered message.

    Args:
        message: Text drawn in the middle of the (axis-less) Axes.
        color: Text color; defaults to the muted gray used for no-data notes.

    Returns:
        A 6.4 x 4.0 inch, 150 dpi ``Figure`` with a single Axes whose frame
        and ticks are hidden.
    """
    figure = Figure(figsize=(6.4, 4.0), dpi=150)
    axes = figure.add_subplot(111)
    axes.axis("off")

    # Axes-relative coordinates: (0.5, 0.5) is the center of the Axes.
    text_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 12,
        "color": color,
        "wrap": True,
        "transform": axes.transAxes,
    }
    axes.text(0.5, 0.5, message, **text_style)

    figure.tight_layout()
    return figure
|
||||
|
||||
|
||||
def missingness_rank_bar_figure(
    names,
    pcts,
    title: str = "% de valores faltantes por columna",
) -> "matplotlib.figure.Figure":
    """Build a horizontal ranked bar figure of missing-value share per column.

    Each column name is paired with its missing percentage, the pairs are
    sorted by percentage descending and drawn as horizontal bars with the
    largest at the top. The X axis is pinned to ``[0, 100]``, each bar is
    annotated with its ``NN.N%`` and the Y tick labels are truncated to 22
    characters.

    The function never raises. When ``names`` or ``pcts`` is not a list/tuple
    or is empty it returns a ``Figure`` carrying the centered message
    "sin datos faltantes"; any unexpected error is caught and turned into a
    fallback ``Figure`` carrying the error text.

    Args:
        names: List or tuple of column names. Items are stringified and
            truncated for display only; the input is not mutated.
        pcts: List or tuple parallel to ``names`` with percentages in
            ``[0, 100]``. ``None``, non-numeric, NaN and infinite values are
            coerced to ``0.0``; negatives are clamped to ``0``. Pairing uses
            ``zip``, so the longer input is cut to the shorter one.
        title: Figure title, truncated to 60 characters. A falsy title
            draws no title at all.

    Returns:
        A ``matplotlib.figure.Figure`` (width 6.4 in, height scaled with the
        number of bars between 2.4 and 14.0 in, 150 dpi). The figure is not
        shown or saved here.
    """
    import math  # Local import: only the finiteness check below needs it.

    try:
        if (
            not isinstance(names, (list, tuple))
            or not isinstance(pcts, (list, tuple))
            or not names
            or not pcts
        ):
            return _message_figure("sin datos faltantes")

        # --- Pair names with coerced percentages, tolerating length mismatch.
        # Both inputs are non-empty here, so zip yields at least one pair.
        pairs = []
        for name, pct in zip(names, pcts):
            try:
                val = float(pct)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: an int too large for a float. Treat it like
                # any other unusable value instead of failing the whole chart.
                val = 0.0
            if not math.isfinite(val):
                # NaN and +/-inf cannot be drawn as a bar length nor labelled
                # meaningfully ("inf%"), so they collapse to zero as well.
                val = 0.0
            pairs.append((name, max(0.0, val)))

        # Sort by percentage descending; barh draws bottom-up, so reversing
        # the ranked list puts the largest bar at the top.
        pairs.sort(key=lambda p: p[1], reverse=True)
        ordered = pairs[::-1]

        labels = [_truncate(name, 22) for name, _ in ordered]
        values = [val for _, val in ordered]
        positions = list(range(len(ordered)))

        # Height scales with the number of bars so dense reports stay readable.
        height = max(2.4, min(0.4 * len(ordered) + 1.2, 14.0))
        fig = Figure(figsize=(6.4, height), dpi=150)
        ax = fig.add_subplot(111)

        ax.barh(positions, values, color=_BAR_COLOR, edgecolor="white")
        ax.set_yticks(positions)
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlim(0, 100)
        ax.set_xlabel("% faltante", fontsize=9)

        # Annotate each bar with its percentage at the end of the bar. Near
        # the right edge the label is right-aligned and capped at x=99 so it
        # stays inside the pinned [0, 100] axis.
        for y, val in zip(positions, values):
            ax.text(
                min(val + 1.5, 99.0),
                y,
                f"{val:.1f}%",
                va="center",
                ha="left" if val < 90 else "right",
                fontsize=7,
                color="#202020",
            )

        if title:
            ax.set_title(_truncate(title, 60), fontsize=12, loc="left", pad=10)

        fig.tight_layout()
        return fig
    except Exception as exc:  # noqa: BLE001 — never raise from a figure builder.
        return _message_figure(f"error al dibujar barras: {exc}", color=_ERROR_TEXT)
|
||||
@@ -0,0 +1,64 @@
|
||||
"""Tests para missingness_rank_bar_figure (barras de % faltante, grupo eda).
|
||||
|
||||
Usa el backend Agg sin pyplot; no muestra ni guarda figuras. Cada test cierra
|
||||
explícitamente la Figure construida (matplotlib.pyplot.close) para no acumular
|
||||
estado entre tests.
|
||||
"""
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
from missingness_rank_bar_figure import missingness_rank_bar_figure
|
||||
|
||||
|
||||
def test_returns_figure_with_axes():
    """A regular call yields a ``Figure`` that holds at least one Axes."""
    columns = ["edad", "ingresos", "ciudad"]
    shares = [12.5, 40.0, 3.2]

    figure = missingness_rank_bar_figure(columns, shares, title="faltantes")

    assert isinstance(figure, Figure)
    assert len(figure.axes) >= 1
    plt.close(figure)
|
||||
|
||||
|
||||
def test_sorted_descending_largest_on_top():
    """The widest bar (largest percentage) sits at the highest y position."""
    figure = missingness_rank_bar_figure(["a", "b", "c"], [10.0, 50.0, 25.0])
    rectangles = figure.axes[0].patches

    # The largest input value (50, column "b") must be among the drawn bars.
    assert max(rect.get_width() for rect in rectangles) == 50.0

    # barh stacks bars bottom-up, so the topmost bar has the largest y.
    topmost = max(rectangles, key=lambda rect: rect.get_y())
    assert topmost.get_width() == 50.0
    plt.close(figure)
|
||||
|
||||
|
||||
def test_empty_lists_do_not_raise_and_returns_figure():
    """Empty inputs still produce a ``Figure`` with an Axes instead of raising."""
    figure = missingness_rank_bar_figure([], [], title="vacía")

    assert isinstance(figure, Figure)
    assert len(figure.axes) >= 1
    plt.close(figure)
|
||||
|
||||
|
||||
def test_xlim_is_zero_to_hundred():
    """The X axis is pinned to the full percentage range regardless of data."""
    figure = missingness_rank_bar_figure(["a"], [42.0])
    x_limits = figure.axes[0].get_xlim()

    assert x_limits == (0.0, 100.0)
    plt.close(figure)
|
||||
|
||||
|
||||
def test_length_mismatch_and_non_numeric_are_handled():
    """Mismatched lengths are cut to the shorter input and ``None`` becomes 0.

    Checking only "is a Figure with an Axes" is not enough: the error and
    no-data fallback figures satisfy that too. The bars themselves are
    asserted so the test fails if the function silently fell back.
    """
    # More names than pcts plus a None pct: zip trims to 2 pairs, None -> 0.0.
    names = ["a", "b", "c"]
    pcts = [None, 30.0]

    fig = missingness_rank_bar_figure(names, pcts)

    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    # Exactly two bars were drawn, with the coerced and the real value.
    widths = sorted(bar.get_width() for bar in fig.axes[0].patches)
    assert widths == [0.0, 30.0]
    plt.close(fig)
|
||||
@@ -0,0 +1,65 @@
|
||||
---
|
||||
name: missingness_row_patterns
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def missingness_row_patterns(null_mask, top_n=10) -> dict"
|
||||
description: "Agrupa las filas de un dataset por su patron de ausencias (estilo matriz de missingno): para cada fila, el patron es la tupla ORDENADA de columnas que faltan en esa fila (las que tienen 1 en el null_mask). Cuenta la frecuencia de cada patron distinto, incluido el patron vacio (fila completa). Devuelve el top_n por frecuencia con su pct sobre el total. Pura, lectura defensiva, NUNCA lanza; {} -> n_rows 0."
|
||||
tags: [eda, missingness, missingno, patterns, profiling, datascience, data-quality]
|
||||
params:
|
||||
- name: null_mask
|
||||
desc: "Dict {col: [0/1, ...]} alineado por fila, donde 1 = la celda falta en esa fila y 0 = presente. Todas las columnas deberian tener la misma longitud (una entrada por fila); si difieren, n_rows es la lista mas larga y las celdas fuera de rango cuentan como presentes. Las claves se ordenan por str(col) para canonizar el patron. {} (o no-dict) -> n_rows 0."
|
||||
- name: top_n
|
||||
desc: "Maximo de patrones devueltos en `patterns`, rankeados por n_rows desc (desempate: menos columnas primero, luego nombres de columna). El recuento total de patrones distintos siempre se reporta en `n_patterns`, no se trunca. Default 10. Valores negativos -> 0; no-int -> 10."
|
||||
output: "Dict {n_rows: int (filas totales), n_patterns: int (patrones distintos, incluye el patron vacio = fila completa), complete_rows: int (filas con patron vacio, nada falta), patterns: lista del top_n ordenada por n_rows desc con [{missing_cols: [col,...] (vacio = fila completa), n_rows: int, pct: float 0-100 sobre n_rows total, redondeado a 2 decimales}]}. Para {} devuelve n_rows 0 y patterns []. NUNCA lanza."
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["test_patron_dominante_completas_singleton", "test_mask_vacio", "test_top_n_trunca_pero_cuenta_todos"]
|
||||
test_file_path: "python/functions/datascience/missingness_row_patterns_test.py"
|
||||
file_path: "python/functions/datascience/missingness_row_patterns.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join("python", "functions"))
|
||||
from datascience.missingness_row_patterns import missingness_row_patterns
|
||||
|
||||
# null_mask alineado por fila: 1 = la celda falta en esa fila.
|
||||
null_mask = {
|
||||
"A": [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
|
||||
"B": [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
|
||||
"C": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
|
||||
}
|
||||
out = missingness_row_patterns(null_mask, top_n=10)
|
||||
print(out["n_rows"], out["n_patterns"], out["complete_rows"]) # 10 3 5
|
||||
for p in out["patterns"]:
|
||||
label = p["missing_cols"] or "(fila completa)"
|
||||
print(label, p["n_rows"], p["pct"])
|
||||
# (fila completa) 5 50.0
|
||||
# ['A', 'B'] 4 40.0
|
||||
# ['C'] 1 10.0
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
- Usala en el capitulo de calidad/ausencias de `AutomaticEDA` para mostrar la "matriz de patrones de missingno": en vez de pintar celda a celda, resume que combinaciones de columnas se quedan en blanco juntas y con que frecuencia.
|
||||
- Cuando ya tengas el null_mask por columna (1=falta) y quieras detectar co-ausencia estructural ("A y B siempre faltan juntas") antes de decidir una imputacion o un drop conjunto de columnas.
|
||||
- Cuando necesites una tabla compacta "patron -> nº filas -> pct" para un report o un grafico de barras de los patrones de ausencia mas comunes, separando ademas cuantas filas estan completas (`complete_rows`).
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Funcion pura, sin I/O y determinista. Lectura defensiva: `{}` o un no-dict devuelven `n_rows` 0 con `patterns` []. NUNCA lanza.
|
||||
- El patron vacio (fila completa, `missing_cols=[]`) SI cuenta como patron: aparece en `n_patterns` y puede aparecer en `patterns`. El consumidor lo etiqueta como "(fila completa)".
|
||||
- `pct` es sobre `n_rows` total (0-100), redondeado a 2 decimales. La suma de los `pct` de TODOS los patrones es 100; si `top_n` trunca, los `pct` mostrados sumaran menos.
|
||||
- Las columnas se ordenan por `str(col)` para canonizar cada patron, asi `{A,B}` y `{B,A}` colapsan al mismo patron `["A", "B"]`.
|
||||
- Una celda cuenta como ausente solo si vale 1 (`int(cell) == 1`); 0, None y valores no numericos se tratan como presentes.
|
||||
- Si las listas de columnas tienen longitudes distintas, `n_rows` es la mas larga y las posiciones fuera de rango de una columna corta cuentan como presentes (0).
|
||||
@@ -0,0 +1,107 @@
|
||||
"""missingness_row_patterns — distinct per-row missingness patterns (missingno matrix style).
|
||||
|
||||
Pure function: no I/O, deterministic, NEVER raises. Given a per-column null mask
|
||||
aligned by row ({col: [0/1, ...]}, 1 = missing), it groups rows by their missing
|
||||
"pattern" — the sorted tuple of column names that are missing in that row — and
|
||||
counts how often each distinct pattern occurs.
|
||||
|
||||
This mirrors the missingno matrix idea: instead of plotting per-cell nullity, it
|
||||
collapses each row to the SET of columns it lacks, surfacing co-missing structure
|
||||
(e.g. "A and B always go missing together"). The empty pattern (a fully complete
|
||||
row) is a first-class pattern and may appear in the result with missing_cols=[];
|
||||
the caller labels it "(fila completa)".
|
||||
"""
|
||||
|
||||
|
||||
def _is_missing(cell) -> bool:
|
||||
"""A cell counts as missing when it equals 1 (truthy 0/1 mask).
|
||||
|
||||
None / 0 / non-numeric are treated as present. Defensive: never raises.
|
||||
"""
|
||||
try:
|
||||
return int(cell) == 1
|
||||
except (TypeError, ValueError):
|
||||
return bool(cell)
|
||||
|
||||
|
||||
def missingness_row_patterns(null_mask, top_n=10) -> dict:
    """Count distinct per-row missingness patterns from a column null mask.

    A row's pattern is the tuple of column names (ordered by ``str(col)``)
    whose cell is flagged missing in that row. Each distinct pattern is
    counted, including the empty pattern (a row with nothing missing).

    Args:
        null_mask: Dict ``{col: [0/1, ...]}`` aligned by row, where 1 marks a
            missing cell. Column values that are not a list/tuple are read as
            an empty column. Columns may differ in length: ``n_rows`` is the
            longest one and positions past the end of a shorter column count
            as present. A non-dict or empty dict yields the empty result.
        top_n: Maximum number of patterns returned in ``patterns``. Negative
            values become 0; values ``int()`` cannot convert (including
            infinity) fall back to 10.

    Returns:
        Dict with:
            ``n_rows``: total number of rows.
            ``n_patterns``: number of distinct patterns, never truncated.
            ``complete_rows``: rows with the empty pattern.
            ``patterns``: up to ``top_n`` dicts
                ``{"missing_cols": [...], "n_rows": int, "pct": float}``
                ranked by row count descending, then fewer columns first,
                then column names; ``pct`` is 0-100 over ``n_rows``, rounded
                to 2 decimals.
    """
    empty = {"n_rows": 0, "n_patterns": 0, "complete_rows": 0, "patterns": []}
    if not isinstance(null_mask, dict) or not null_mask:
        return empty

    # Canonical column order, so iterating it yields an already-sorted pattern.
    items = sorted(null_mask.items(), key=lambda kv: str(kv[0]))
    columns = [
        (str(key), cells if isinstance(cells, (list, tuple)) else [])
        for key, cells in items
    ]

    n_rows = max((len(cells) for _, cells in columns), default=0)
    if n_rows == 0:
        return empty

    # Defensive parsing of top_n. OverflowError covers int(float("inf")),
    # which would otherwise escape and break the never-raises contract.
    try:
        limit = max(0, int(top_n))
    except (TypeError, ValueError, OverflowError):
        limit = 10

    counts: dict = {}
    for row in range(n_rows):
        pattern = tuple(
            name
            for name, cells in columns
            # Short columns have no cell for this row: treat it as present.
            if row < len(cells) and _is_missing(cells[row])
        )
        counts[pattern] = counts.get(pattern, 0) + 1

    # Rank: n_rows desc, then fewer columns first, then column names.
    ranked = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))

    return {
        "n_rows": n_rows,
        "n_patterns": len(counts),
        "complete_rows": counts.get((), 0),
        "patterns": [
            {
                "missing_cols": list(pattern),
                "n_rows": count,
                "pct": round(100.0 * count / n_rows, 2),
            }
            for pattern, count in ranked[:limit]
        ],
    }
|
||||
@@ -0,0 +1,87 @@
|
||||
"""Tests para missingness_row_patterns."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from missingness_row_patterns import missingness_row_patterns
|
||||
|
||||
_EXPECTED_KEYS = {"n_rows", "n_patterns", "complete_rows", "patterns"}
|
||||
|
||||
|
||||
def test_patron_dominante_completas_singleton():
    """Golden case: {A,B} co-missing in 4 rows, 5 complete rows, 1 singleton {C}."""
    # 10 rows: A and B are missing together in rows 0-3, rows 4-8 are
    # complete and row 9 lacks only C.
    null_mask = {
        "A": [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        "B": [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        "C": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    }
    result = missingness_row_patterns(null_mask)

    assert set(result.keys()) == _EXPECTED_KEYS
    assert result["n_rows"] == 10
    # Three distinct patterns: (A, B), () and (C,).
    assert result["n_patterns"] == 3
    # Five complete rows (rows 4-8).
    assert result["complete_rows"] == 5

    # Ranking: row count descending, ties broken by fewer columns first.
    ranked = result["patterns"]
    assert len(ranked) == 3
    expected = [
        ([], 5, 50.0),
        (["A", "B"], 4, 40.0),
        (["C"], 1, 10.0),
    ]
    for got, (cols, rows, pct) in zip(ranked, expected):
        assert got["missing_cols"] == cols
        assert got["n_rows"] == rows
        assert got["pct"] == pct

    # Output types.
    assert isinstance(result["n_rows"], int)
    assert isinstance(ranked[0]["pct"], float)
|
||||
|
||||
|
||||
def test_mask_vacio():
    """An empty mask yields zero rows and no patterns, without raising."""
    empty_result = {
        "n_rows": 0,
        "n_patterns": 0,
        "complete_rows": 0,
        "patterns": [],
    }
    assert missingness_row_patterns({}) == empty_result

    # A non-dict input (None) degrades to the empty result as well.
    assert missingness_row_patterns(None)["n_rows"] == 0

    # Columns that exist but hold no rows produce no patterns.
    no_rows = missingness_row_patterns({"A": [], "B": []})
    assert no_rows["patterns"] == []
|
||||
|
||||
|
||||
def test_top_n_trunca_pero_cuenta_todos():
    """``top_n`` caps ``patterns`` while ``n_patterns`` still counts every one."""
    # Row patterns, in order: () (A,) (A,) (B,) (A,B,C)
    null_mask = {
        "A": [0, 1, 1, 0, 1],
        "B": [0, 0, 0, 1, 1],
        "C": [0, 0, 0, 0, 1],
    }
    result = missingness_row_patterns(null_mask, top_n=2)

    assert result["n_rows"] == 5
    assert result["n_patterns"] == 4  # (), (A,), (B,), (A,B,C)
    assert result["complete_rows"] == 1

    # Only two patterns come back even though four exist.
    top = result["patterns"]
    assert len(top) == 2

    # (A,) leads with 2 rows; among the 1-row patterns the tie-break picks
    # the one with the fewest columns, i.e. the empty pattern.
    first, second = top
    assert first["missing_cols"] == ["A"]
    assert first["n_rows"] == 2
    assert second["missing_cols"] == []
    assert second["n_rows"] == 1
|
||||
@@ -0,0 +1,122 @@
|
||||
---
|
||||
id: relationship_scatter_figure_py_datascience
|
||||
name: relationship_scatter_figure
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def relationship_scatter_figure(xs: list, ys: list, x_label: str = \"\", y_label: str = \"\", classification: dict = None, max_points: int = 2000) -> \"matplotlib.figure.Figure\""
|
||||
description: "Construye una figura matplotlib scatter de un par de variables numéricas con su curva/recta de ajuste y una anotación del tipo de relación (lineal, polinómica grado 2/3, monótona no-lineal, etc.) más sus métricas (r, ρ, R²lin, R²poly). Consume el dict de classify_relationship_type; si es None lo calcula internamente reusando esa función. Devuelve un matplotlib.figure.Figure listo para rasterizar por el renderer del informe EDA (PDF/PPTX). Backend Agg sin pyplot global; downsample determinista de los puntos dibujados; defensivo ante vacío/None."
|
||||
tags: [eda, correlation, scatter, relationship, matplotlib, figure, visualization, datascience, impure]
|
||||
uses_functions: [classify_relationship_type_py_datascience]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [matplotlib, numpy]
|
||||
example: |
|
||||
from relationship_scatter_figure import relationship_scatter_figure
|
||||
xs = [float(i) for i in range(100)]
|
||||
ys = [0.5 * x * x - x + 3 for x in xs]
|
||||
classification = {
|
||||
"tipo": "polinómica (grado 2)", "pearson": 0.97, "spearman": 0.99,
|
||||
"r2_linear": 0.92, "r2_poly2": 0.999, "r2_poly3": 0.999,
|
||||
"best_degree": 2, "coeffs": [0.5, -1.0, 3.0],
|
||||
}
|
||||
fig = relationship_scatter_figure(xs, ys, x_label="dosis", y_label="efecto", classification=classification)
|
||||
tested: true
|
||||
tests:
|
||||
- "test_returns_figure"
|
||||
- "test_downsample_determinista"
|
||||
- "test_empty_no_lanza"
|
||||
- "test_classification_none"
|
||||
test_file_path: "python/functions/datascience/relationship_scatter_figure_test.py"
|
||||
file_path: "python/functions/datascience/relationship_scatter_figure.py"
|
||||
params:
|
||||
- name: xs
|
||||
desc: "Lista (o tupla) de valores x. Se emparejan por índice con ys. Valores None, bool, NaN o inf descartan ese par (lectura defensiva)."
|
||||
- name: ys
|
||||
desc: "Lista (o tupla) de valores y, paralela a xs. Mismas reglas defensivas que xs."
|
||||
- name: x_label
|
||||
desc: "Etiqueta del eje/título para la variable x. Default \"\" (en el título cae a \"x\")."
|
||||
- name: y_label
|
||||
desc: "Etiqueta del eje/título para la variable y. Default \"\" (en el título cae a \"y\")."
|
||||
- name: classification
|
||||
desc: "Opcional. Dict de classify_relationship_type con claves tipo, pearson, r2_linear, spearman, r2_poly2, r2_poly3, best_degree, coeffs. Si es None se calcula internamente importando y llamando a classify_relationship_type sobre los pares limpios (self-contained). Si el módulo hermano no está disponible, se dibuja el scatter sin curva de ajuste ni anotación. Default None."
|
||||
- name: max_points
|
||||
desc: "Tope del nº de puntos DIBUJADOS. Si los pares limpios superan el tope, la nube se submuestrea por paso fijo ceil(n/max_points) tomando pairs[::step] — DETERMINISTA, no aleatorio, reproducible. La clasificación/ajuste usa SIEMPRE todos los pares limpios; el downsample solo adelgaza el dibujo. Valor no-positivo o no-int desactiva el downsample. Default 2000."
|
||||
output: "Un matplotlib.figure.Figure (figsize 6.4x4.0, dpi 150) con un Axes scatter (puntos semitransparentes alpha 0.5, color #4C72B0), la curva/recta de ajuste (numpy.polyval sobre coeffs, color #C44E52) cuando hay un ajuste polinómico disponible, título \"{x_label} ↔ {y_label}\", labels de ejes y una caja de anotación en la esquina superior izquierda con el tipo de relación y las métricas disponibles (r, ρ, R²lin, R²poly; se omiten las None). Si tras la limpieza hay menos de 2 pares válidos, devuelve igualmente una Figure con un texto centrado \"Sin datos suficientes para el scatter\" (nunca lanza). El caller rasteriza/cierra la figura; la función no la muestra ni la guarda."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from relationship_scatter_figure import relationship_scatter_figure
|
||||
|
||||
# Par numérico con relación cuadrática y su clasificación (de
|
||||
# classify_relationship_type). Pasándola explícita evitas recomputarla.
|
||||
xs = [float(i) for i in range(100)]
|
||||
ys = [0.5 * x * x - x + 3 for x in xs]
|
||||
classification = {
|
||||
"tipo": "polinómica (grado 2)",
|
||||
"pearson": 0.97,
|
||||
"spearman": 0.99,
|
||||
"r2_linear": 0.92,
|
||||
"r2_poly2": 0.999,
|
||||
"r2_poly3": 0.999,
|
||||
"best_degree": 2,
|
||||
"coeffs": [0.5, -1.0, 3.0],
|
||||
}
|
||||
|
||||
fig = relationship_scatter_figure(
|
||||
xs, ys, x_label="dosis", y_label="efecto", classification=classification
|
||||
)
|
||||
|
||||
# El renderer del informe lo rasteriza; aquí solo persistimos para inspección.
|
||||
fig.savefig("/tmp/scatter_dosis_efecto.png")
|
||||
|
||||
# Con classification=None la función la calcula internamente (self-contained):
|
||||
fig2 = relationship_scatter_figure(xs, ys, x_label="dosis", y_label="efecto")
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Úsala dentro del informe EDA automático cuando quieras visualizar de un vistazo
|
||||
la relación entre dos variables numéricas: la nube de puntos, la curva que mejor
|
||||
la ajusta y una etiqueta legible del tipo de relación con sus métricas. Es la
|
||||
pareja "vista humana" de `classify_relationship_type`: esa función decide el
|
||||
tipo y los coeficientes; esta los pinta en una `Figure` que el renderer del
|
||||
informe rasteriza a PDF/PPTX. Pásale el dict de clasificación si ya lo tienes
|
||||
calculado (evitas recomputar el ajuste); si no, déjalo en `None` y la función lo
|
||||
resuelve sola sobre los pares limpios. Pensada para móvil: anotación pequeña
|
||||
(fontsize 8) y nube adelgazada por `max_points` para que el PDF no pese.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura por matplotlib.** Toca la maquinaria de render. Usa el backend `Agg`
|
||||
y la API orientada a objetos `Figure`/`add_subplot` — NUNCA `pyplot.*` aquí,
|
||||
para no tocar el estado global ni filtrar figuras entre llamadas. `pyplot` NO
|
||||
es thread-safe; esta función lo evita construyendo el `Figure` directamente,
|
||||
así que es segura de llamar en bucle desde el renderer.
|
||||
- **El caller cierra la figura.** Devuelve el `Figure` pero no lo muestra ni lo
|
||||
guarda. Quien la consume debe rasterizarla y luego liberarla
|
||||
(`matplotlib.pyplot.close(fig)`) para no acumular memoria en lotes grandes de
|
||||
pares de columnas.
|
||||
- **Downsample determinista, solo del dibujo.** Cuando los pares limpios superan
|
||||
`max_points`, la nube DIBUJADA se adelgaza por paso fijo `pairs[::step]`
|
||||
(reproducible, no aleatorio). La clasificación y el ajuste usan SIEMPRE todos
|
||||
los pares limpios; el downsample no altera las métricas ni la curva.
|
||||
- **`classification=None` ⇒ se calcula sola.** Importa y llama a
|
||||
`classify_relationship_type` sobre los pares limpios. Si ese módulo hermano no
|
||||
está disponible (entorno incompleto), NO lanza: dibuja el scatter sin curva de
|
||||
ajuste ni anotación. Pasar la clasificación explícita es más barato (no
|
||||
recomputa el ajuste).
|
||||
- **Sin curva para `monótona no-lineal`.** Cuando `coeffs` es `None` o
|
||||
`best_degree` es `None` (p.ej. tipo "monótona no-lineal"), no se pinta recta
|
||||
polinómica — solo la nube y la anotación. Tampoco se dibuja la curva si el
|
||||
rango de x es nulo (todos los x iguales). Nunca falla por esto.
|
||||
- **Defensiva, nunca lanza.** `xs=[]`, `ys=[]`, menos de 2 pares válidos, valores
|
||||
`None`/`bool`/`NaN`/`inf` o `coeffs` malformado se manejan sin error: en el
|
||||
peor caso devuelve una `Figure` con "Sin datos suficientes para el scatter".
|
||||
No envuelvas la llamada en try/except por miedo a un raise — no lo hay.
|
||||
@@ -0,0 +1,322 @@
|
||||
"""Impure EDA helper: scatter figure of a numeric pair with its fit (`eda` group).
|
||||
|
||||
Builds a matplotlib scatter of two numeric variables, overlays the fitted
|
||||
curve/line implied by the relationship classification (linear, polynomial of
|
||||
degree 2/3, etc.) and annotates the relationship type with its available
|
||||
metrics. Returns a ready-to-rasterize ``matplotlib.figure.Figure``; it never
|
||||
shows nor saves it.
|
||||
|
||||
Impure because it touches matplotlib's rendering machinery. It uses the headless
|
||||
Agg backend and the object-oriented ``Figure`` API (no ``pyplot``) so it leaks no
|
||||
global state and is safe to call repeatedly from a report renderer.
|
||||
|
||||
To keep the rendered PDF/PPTX light on phones, when the number of valid pairs
|
||||
exceeds ``max_points`` the *plotted* points are down-sampled DETERMINISTICALLY by
|
||||
a fixed step (``pairs[::step]``), never randomly, so the output is reproducible.
|
||||
The classification/fit always uses every clean pair; the down-sample only thins
|
||||
the drawn cloud.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import numpy as np # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
# Sober blue for the scatter cloud and red for the fitted curve (Tufte: the
|
||||
# data points are the primary ink, the fit is the secondary highlight).
|
||||
_POINT_COLOR = "#4C72B0"
|
||||
_FIT_COLOR = "#C44E52"
|
||||
# Muted gray for the no-data fallback message.
|
||||
_MUTED_TEXT = "#5f6b7a"
|
||||
|
||||
|
||||
def _finite(value):
    """Coerce ``value`` to a finite float, or return None when it is not usable.

    ``bool`` is a subclass of ``int``, but a real numeric measurement is never
    a bool, so True/False are treated as missing instead of being coerced to
    1.0/0.0. NaN and +/-infinity are never valid either.

    Args:
        value: Any object. Anything ``float()`` accepts (numbers, numeric
            strings) is coerced; everything else is reported as missing.

    Returns:
        The value as a finite ``float``, or ``None`` when it is None, a bool,
        not convertible, too large to fit in a float, NaN or infinite.
        Never raises.
    """
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large for a double (e.g. 10**400). It must
        # be swallowed like any other unusable value so callers never raise.
        return None
    return number if math.isfinite(number) else None
|
||||
|
||||
|
||||
def _clean_pairs(xs, ys):
    """Zip ``xs`` and ``ys`` by position, keeping only fully finite pairs.

    Both inputs must be a list or a tuple; any other container yields an empty
    result. Pairing stops at the shorter input. A pair is dropped as soon as
    either end is rejected by ``_finite``.

    Returns:
        A list of ``(x, y)`` float tuples in input order.
    """
    accepted_types = (list, tuple)
    if not (isinstance(xs, accepted_types) and isinstance(ys, accepted_types)):
        return []

    cleaned = []
    for raw_x, raw_y in zip(xs, ys):
        x_val = _finite(raw_x)
        y_val = _finite(raw_y)
        if x_val is not None and y_val is not None:
            cleaned.append((x_val, y_val))
    return cleaned
|
||||
|
||||
|
||||
def _ordered_trend(xs_clean, ys_clean, n_bins: int = 12):
    """Summarise how y moves along x when there is no polynomial fit to draw.

    Two regimes, chosen by the number of distinct x values:

    * At most ``max(2, n_bins)`` distinct values (discrete/ordinal x): one
      trend point per distinct x, whose y is the mean of the y values observed
      at that x.
    * Otherwise (continuous x): the points are sorted by x with a stable sort
      and split into ``n_bins`` consecutive chunks; each chunk contributes
      its (mean x, mean y).

    Args:
        xs_clean: Finite x values.
        ys_clean: Finite y values, parallel to ``xs_clean``.
        n_bins: Number of ordered chunks for the continuous regime.

    Returns:
        ``(x_trend, y_trend)`` as numpy arrays, or ``(None, None)`` when fewer
        than two points are given.
    """
    x_vals = np.asarray(xs_clean, dtype=float)
    y_vals = np.asarray(ys_clean, dtype=float)
    if x_vals.size < 2:
        return None, None

    levels = np.unique(x_vals)
    if levels.size <= max(2, n_bins):
        # Discrete x: average y at each distinct level.
        level_means = [float(np.mean(y_vals[x_vals == level])) for level in levels]
        return levels, np.array(level_means)

    # Continuous x: walk the points in x order and average chunk by chunk.
    ranking = np.argsort(x_vals, kind="stable")
    x_chunks = np.array_split(x_vals[ranking], n_bins)
    y_chunks = np.array_split(y_vals[ranking], n_bins)
    trend_x = np.array([float(np.mean(chunk)) for chunk in x_chunks if chunk.size])
    trend_y = np.array([float(np.mean(chunk)) for chunk in y_chunks if chunk.size])
    return trend_x, trend_y
|
||||
|
||||
|
||||
def _no_data_figure(message: str) -> "matplotlib.figure.Figure":
    """Return a blank 6.4x4.0 in, 150 dpi Figure showing ``message`` centered.

    Used as the defensive fallback when there is nothing to plot. The axes are
    hidden, so only the muted text is visible.
    """
    figure = Figure(figsize=(6.4, 4.0), dpi=150)
    axes = figure.add_subplot(111)
    axes.axis("off")

    # Anchor the text at the middle of the axes, in axes coordinates.
    text_style = dict(
        ha="center",
        va="center",
        fontsize=12,
        color=_MUTED_TEXT,
        transform=axes.transAxes,
    )
    axes.text(0.5, 0.5, message, **text_style)

    figure.tight_layout()
    return figure
|
||||
|
||||
|
||||
def _metrics_caption(classification: dict) -> str:
    """Render the usable metrics of a classification dict as a one-line caption.

    Metrics are emitted in a fixed order, each with two decimals, and a metric
    is skipped when its value is missing or not a finite number:

    * ``pearson``   -> ``r=``
    * ``spearman``  -> ``ρ=``
    * ``r2_linear`` -> ``R²lin=``
    * one polynomial R² -> ``R²poly=``. The R² matching ``best_degree`` (2 or
      3) is preferred; when it is unusable, or ``best_degree`` is something
      else, the cubic R² is tried first and the quadratic one second.

    Returns:
        The space-joined caption, or an empty string when no metric is usable.
    """
    simple_metrics = (
        ("r", "pearson"),
        ("ρ", "spearman"),
        ("R²lin", "r2_linear"),
    )
    pieces = []
    for label, key in simple_metrics:
        metric = _finite(classification.get(key))
        if metric is not None:
            pieces.append(f"{label}={metric:.2f}")

    cubic = _finite(classification.get("r2_poly3"))
    quadratic = _finite(classification.get("r2_poly2"))
    # Only a quadratic best fit puts r2_poly2 ahead of r2_poly3; every other
    # case (cubic best fit, linear, unknown) looks at the cubic R² first.
    if classification.get("best_degree") == 2:
        preference = (quadratic, cubic)
    else:
        preference = (cubic, quadratic)
    poly_r2 = next((value for value in preference if value is not None), None)
    if poly_r2 is not None:
        pieces.append(f"R²poly={poly_r2:.2f}")

    return " ".join(pieces)
|
||||
|
||||
|
||||
def relationship_scatter_figure(
    xs: list,
    ys: list,
    x_label: str = "",
    y_label: str = "",
    classification: dict = None,
    max_points: int = 2000,
) -> "matplotlib.figure.Figure":
    """Build a scatter figure of a numeric pair with its fit and a type label.

    Cleans the pairs defensively (drops any pair with a None/bool/NaN/inf end),
    plots a semi-transparent scatter cloud (down-sampled deterministically when
    it exceeds ``max_points``), overlays the polynomial fit implied by
    ``classification`` and annotates the relationship type plus its available
    metrics in a corner box. When no polynomial was drawn and the type is
    monotonic, the ordered trend of y over x is drawn instead.

    The fit and classification always use every clean pair; only the drawn cloud
    is thinned by the down-sample. When ``classification`` is None it is computed
    internally by reusing ``classify_relationship_type`` over the clean pairs, so
    the function is self-contained. The classifier is looked up first as a
    sibling of this module inside its package and then as a top-level module,
    so both ways of loading this file find it.

    The function is defensive: empty input, fewer than 2 clean pairs, a
    missing/None ``coeffs`` or a missing sibling classifier never raise. When
    there is nothing valid to draw it still returns a ``Figure`` carrying a
    centered "Sin datos suficientes para el scatter" message.

    Args:
        xs: List (or tuple) of x values. Paired by index with ``ys``. Values that
            are None, bool, NaN or infinite discard that pair. Read defensively.
        ys: List (or tuple) of y values, parallel to ``xs``. Same defensive rules.
        x_label: Axis/title label for the x variable. Default "" (falls back to
            "x" in the title).
        y_label: Axis/title label for the y variable. Default "" (falls back to
            "y" in the title).
        classification: Optional dict from ``classify_relationship_type`` with
            keys ``tipo, pearson, r2_linear, spearman, r2_poly2, r2_poly3,
            best_degree, coeffs``. When None, it is computed internally by
            importing and calling ``classify_relationship_type`` over the clean
            pairs. When that sibling module is unavailable, the scatter is still
            drawn (no fit curve, no annotation).
        max_points: Cap on the number of *plotted* points. When the number of
            clean pairs exceeds this cap, the drawn cloud is down-sampled by a
            fixed step ``ceil(n/max_points)`` taking ``pairs[::step]`` —
            DETERMINISTIC, not random, so the figure is reproducible. A
            non-positive or non-int value disables down-sampling. Default 2000.

    Returns:
        A ``matplotlib.figure.Figure`` (figsize 6.4x4.0, dpi 150) with a single
        scatter Axes, the fitted curve (when a polynomial fit is available) and a
        corner annotation with the relationship type and metrics. When there are
        fewer than 2 clean pairs it returns a Figure with a centered "Sin datos
        suficientes para el scatter" message. The caller rasterizes/closes it.
    """
    pairs = _clean_pairs(xs, ys)
    if len(pairs) < 2:
        return _no_data_figure("Sin datos suficientes para el scatter")

    # Full clean coordinates feed the classification/fit; the plotted cloud is
    # what gets thinned.
    xs_clean = [p[0] for p in pairs]
    ys_clean = [p[1] for p in pairs]

    # Resolve the classification. If not provided, reuse the sibling classifier
    # over ALL clean pairs (self-contained). Missing module => no fit/annotation.
    cls = classification
    if cls is None:
        try:
            try:
                # Loaded as part of a package: the classifier is a sibling
                # submodule, not a top-level module.
                from .classify_relationship_type import classify_relationship_type
            except ImportError:
                # Loaded as a top-level module (its directory is on sys.path):
                # a relative import has no parent package, so use the bare name.
                from classify_relationship_type import classify_relationship_type

            cls = classify_relationship_type(xs_clean, ys_clean)
        except Exception:
            # Deliberate best effort: the scatter is still worth drawing.
            cls = None
    if not isinstance(cls, dict):
        cls = {}

    # --- Deterministic down-sampling of the DRAWN points only.
    n_total = len(pairs)
    if (
        isinstance(max_points, int)
        and not isinstance(max_points, bool)
        and max_points > 0
        and n_total > max_points
    ):
        step = math.ceil(n_total / max_points)
        sampled = pairs[::step]
    else:
        sampled = pairs

    x_plot = [p[0] for p in sampled]
    y_plot = [p[1] for p in sampled]

    fig = Figure(figsize=(6.4, 4.0), dpi=150)
    ax = fig.add_subplot(111)

    ax.scatter(
        x_plot,
        y_plot,
        s=12,
        alpha=0.5,
        color=_POINT_COLOR,
        edgecolors="none",
        rasterized=True,
    )

    # --- Fitted curve/line over the full clean x range.
    coeffs = cls.get("coeffs")
    best_degree = cls.get("best_degree")
    tipo = cls.get("tipo")
    x_min, x_max = min(xs_clean), max(xs_clean)
    drew_fit = False
    if coeffs is not None and best_degree is not None and x_max > x_min:
        try:
            coeff_arr = np.asarray(coeffs, dtype=float)
            if coeff_arr.ndim == 1 and coeff_arr.size > 0 and np.all(np.isfinite(coeff_arr)):
                x_line = np.linspace(x_min, x_max, 200)
                y_line = np.polyval(coeff_arr, x_line)
                if np.all(np.isfinite(y_line)):
                    ax.plot(x_line, y_line, color=_FIT_COLOR, linewidth=2)
                    drew_fit = True
        except Exception:
            # Never fail the figure because of a malformed coeffs array.
            pass

    # A monotonic non-linear relationship has no fitted polynomial (coeffs is
    # None by design — a low-degree polynomial would mislead). Draw instead the
    # ordered trend of y over x so the reader still sees the shape: y averaged
    # within ordered x-bins (or per distinct x value when x is discrete with few
    # levels, e.g. an ordinal scale). Defensive: any failure leaves the cloud.
    if (not drew_fit and isinstance(tipo, str) and "monóton" in tipo.lower()
            and x_max > x_min):
        try:
            xt, yt = _ordered_trend(xs_clean, ys_clean)
            if xt is not None and len(xt) >= 2:
                ax.plot(xt, yt, color=_FIT_COLOR, linewidth=2, marker="o",
                        markersize=3)
        except Exception:
            pass

    # --- Labels and title.
    tx = x_label if x_label else "x"
    ty = y_label if y_label else "y"
    ax.set_title(f"{tx} ↔ {ty}", fontsize=12, loc="left", pad=8)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

    # --- Corner annotation: relationship type + available metrics.
    caption_lines = []
    if tipo:
        caption_lines.append(str(tipo))
    metrics_line = _metrics_caption(cls)
    if metrics_line:
        caption_lines.append(metrics_line)
    if caption_lines:
        ax.text(
            0.03,
            0.97,
            "\n".join(caption_lines),
            transform=ax.transAxes,
            ha="left",
            va="top",
            fontsize=8,
            bbox=dict(
                boxstyle="round,pad=0.35",
                facecolor="white",
                edgecolor="#cccccc",
                alpha=0.85,
            ),
        )

    fig.tight_layout()
    return fig
|
||||
@@ -0,0 +1,100 @@
|
||||
"""Tests para relationship_scatter_figure (scatter de un par numérico, grupo eda).
|
||||
|
||||
Usa el backend Agg sin pyplot global; no muestra ni guarda figuras. Cada test
|
||||
cierra explícitamente la Figure construida (matplotlib.pyplot.close) para no
|
||||
acumular estado entre tests.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt # noqa: E402
|
||||
from matplotlib.collections import PathCollection # noqa: E402
|
||||
from matplotlib.figure import Figure # noqa: E402
|
||||
|
||||
from relationship_scatter_figure import relationship_scatter_figure
|
||||
|
||||
|
||||
def _scatter_offsets(fig):
    """Return the point coordinates of the first scatter drawn in ``fig``.

    Walks the axes in order and, within each, its collections in order; the
    first ``PathCollection`` found is the scatter. Returns ``None`` when the
    figure holds no such collection.
    """
    scatters = (
        coll
        for ax in fig.axes
        for coll in ax.collections
        if isinstance(coll, PathCollection)
    )
    first = next(scatters, None)
    return None if first is None else first.get_offsets()
|
||||
|
||||
|
||||
def test_returns_figure():
    """A linear pair with an explicit classification yields a figure with axes."""
    x_values = [float(i) for i in range(20)]
    y_values = [2.0 * x + 1.0 for x in x_values]  # y = 2x + 1
    linear_fit = dict(
        tipo="lineal",
        pearson=1.0,
        r2_linear=1.0,
        spearman=1.0,
        r2_poly2=1.0,
        r2_poly3=1.0,
        best_degree=1,
        coeffs=[2.0, 1.0],
    )

    figure = relationship_scatter_figure(
        x_values, y_values, x_label="a", y_label="b", classification=linear_fit
    )

    assert hasattr(figure, "savefig")
    assert len(figure.axes) >= 1
    plt.close(figure)
|
||||
|
||||
|
||||
def test_downsample_determinista():
    """The drawn cloud is capped AND thinned by a fixed, reproducible step.

    With 5000 clean pairs and ``max_points=1000`` the documented step is
    ``ceil(5000 / 1000) == 5``, so the plotted points must be exactly
    ``pairs[::5]`` and identical across two independent builds.
    """
    n = 5000
    cap = 1000
    xs = [float(i) for i in range(n)]
    ys = [0.5 * x for x in xs]
    classification = {
        "tipo": "lineal",
        "pearson": 1.0,
        "r2_linear": 1.0,
        "spearman": 1.0,
        "r2_poly2": 1.0,
        "r2_poly3": 1.0,
        "best_degree": 1,
        "coeffs": [0.5, 0.0],
    }

    def _build():
        return relationship_scatter_figure(
            xs, ys, x_label="x", y_label="y",
            classification=classification, max_points=cap,
        )

    def _drawn(figure):
        offsets = _scatter_offsets(figure)
        assert offsets is not None
        return [(float(px), float(py)) for px, py in offsets]

    fig = _build()
    assert isinstance(fig, Figure)
    first = _drawn(fig)
    # The number of drawn points must not exceed the cap.
    assert len(first) <= cap

    # Fixed-step thinning: exactly every ``step``-th clean pair, in order.
    step = -(-n // cap)  # ceil(n / cap) without floats
    assert first == list(zip(xs[::step], ys[::step]))

    # Deterministic: a second build draws the very same points.
    again = _build()
    assert _drawn(again) == first

    plt.close(fig)
    plt.close(again)
|
||||
|
||||
|
||||
def test_empty_no_lanza():
    """Empty inputs still produce a Figure (the no-data fallback), never a raise."""
    result = relationship_scatter_figure(xs=[], ys=[], x_label="x", y_label="y")
    assert isinstance(result, Figure)
    plt.close(result)
|
||||
|
||||
|
||||
def test_classification_none():
    """With ``classification=None`` the figure is still built end to end.

    Only runs when the sibling module ``classify_relationship_type`` can be
    imported; otherwise the test is skipped. ``pytest.importorskip`` skips on
    an import failure only, so an unexpected error raised while loading the
    sibling is reported instead of being silently turned into a skip.
    """
    import pytest

    pytest.importorskip(
        "classify_relationship_type",
        reason="classify_relationship_type aún no disponible",
    )

    xs = [float(i) for i in range(30)]
    ys = [3.0 * x - 2.0 for x in xs]
    fig = relationship_scatter_figure(
        xs, ys, x_label="a", y_label="b", classification=None
    )
    assert isinstance(fig, Figure)
    assert len(fig.axes) >= 1
    plt.close(fig)
|
||||
@@ -0,0 +1,89 @@
|
||||
---
|
||||
name: render_automatic_eda_markdown
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def render_automatic_eda_markdown(chapters_or_profile, out_path: str, meta: dict = None) -> dict"
|
||||
description: "Renderiza un documento AutomaticEDA por CAPÍTULOS (modelo de bloques independiente del formato) en un único MARKDOWN autocontenido pensado para PEGAR A UN LLM. Acepta una lista de capítulos del modelo o directamente un TableProfile del grupo eda (construye los capítulos canónicos con build_document). Prioriza TEXTO + DATOS sobre lo visual: las tablas se vuelcan como tablas markdown con TODAS las filas (sin paginar — no hay páginas que cortar), una figura matplotlib se reduce a su caption más la tabla de datos subyacente (Desde/Hasta/Frecuencia de las barras del histograma) porque un LLM no ve la imagen, y los marcadores de glosario se eliminan conservando el **negrita**. Lleva cabecera (# título), bloque de metadatos en blockquote e índice numerado con anclas GitHub. Espejo de render_automatic_eda_pdf/render_automatic_eda_pptx pero SIN manifest (KISS, el markdown es un único artefacto de texto). dict-no-throw: nunca lanza, devuelve {path, n_chars, chapters, note}; en error fatal path es None y note explica la causa. Flag opcional meta['embed_figures'] exporta PNGs junto al .md (off por defecto)."
|
||||
tags: [eda, markdown, render, report, llm, automatic-eda, chapters, versioned, no-cut, text, datascience, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [os, re, matplotlib, "datascience.automatic_eda"]
|
||||
params:
|
||||
- name: chapters_or_profile
|
||||
desc: "una lista de capítulos del modelo AutomaticEDA (dataclasses Chapter o dicts {id,title,version,blocks}) O un TableProfile dict del grupo eda. Si es un TableProfile, los capítulos canónicos se construyen con build_document(profile, meta['ctx']). Bloques soportados: heading, markdown, kv_table, data_table, figure, image, caption, note, group, glossary_entry. Lectura defensiva: lo no reconocido se degrada a Note, nunca lanza."
|
||||
- name: out_path
|
||||
desc: "ruta del archivo .md de salida. Los directorios padre se crean si faltan. Directorio no escribible → {path:None, note:<causa>} sin lanzar."
|
||||
- name: meta
|
||||
desc: "dict opcional. Claves: title (título del documento), ctx (dict con dataset_name→Dataset, source_origin→Fuente, storage→Almacenamiento, n_rows/n_cols→Dimensiones; también lo consumen los builders de capítulo cuando se da un profile), generated_at (timestamp; si falta se genera ISO UTC), embed_figures (True para exportar PNGs <basename>_figN.png junto al .md; por defecto False y el markdown queda autocontenido)."
|
||||
output: "dict (nunca lanza): {path: str|None, n_chars: int, chapters: list[{id,version}], note: str}. En error fatal (p.ej. directorio no escribible) path es None y note explica la causa. Un documento sin capítulos aplicables produce un markdown mínimo válido con 'documento vacío' y chapters=[]."
|
||||
tested: true
|
||||
tests: ["test_golden_bloques_sinteticos_serializa_todo_a_markdown", "test_figura_descarta_axvspan_de_la_tabla_de_bins", "test_edge_documento_vacio_no_revienta", "test_profile_path_construye_capitulos_y_escribe"]
|
||||
test_file_path: "python/functions/datascience/render_automatic_eda_markdown_test.py"
|
||||
file_path: "python/functions/datascience/render_automatic_eda_markdown.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience import render_automatic_eda_markdown
|
||||
|
||||
# Desde un TableProfile del grupo eda (mismo modelo que los renderers PDF/PPTX).
|
||||
profile = {
|
||||
"table": "ventas", "source": "/data/ventas.csv",
|
||||
"n_rows": 1000, "n_cols": 2, "quality_score": 92.5,
|
||||
"columns": [
|
||||
{"name": "precio", "inferred_type": "numeric", "null_pct": 0.01,
|
||||
"numeric": {"mean": 42.5, "median": 40.0, "min": 1.0, "max": 100.0,
|
||||
"std": 12.3}},
|
||||
{"name": "categoria", "inferred_type": "categorical", "null_pct": 0.0,
|
||||
"categorical": {"top": [{"value": "neumaticos", "count": 500}]}},
|
||||
],
|
||||
}
|
||||
res = render_automatic_eda_markdown(
|
||||
profile, "reports/ventas_aeda.md",
|
||||
{"title": "EDA — ventas",
|
||||
"ctx": {"dataset_name": "Ventas", "source_origin": "ERP export",
|
||||
"n_rows": 1000, "n_cols": 2}})
|
||||
print(res["path"], res["n_chars"], res["chapters"])
|
||||
# -> reports/ventas_aeda.md 4123 [{'id':'portada','version':'1.0.0'}, ...]
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando quieras **pegar el EDA a un LLM** (ChatGPT, Claude, ...) o tenerlo en texto
|
||||
plano versionable: mismo documento por capítulos que el PDF/PPTX, pero serializado a
|
||||
Markdown sin binarios. Úsala como tercera salida junto a `render_automatic_eda_pdf`
|
||||
(móvil) y `render_automatic_eda_pptx` (compartir) desde el MISMO modelo de capítulos.
|
||||
A diferencia de esas dos, no hay páginas ni slides: todas las filas de cada tabla se
|
||||
vuelcan (nada se corta) y cada figura se reduce a su caption + la tabla de datos
|
||||
subyacente, que es lo que un LLM puede leer. Para añadir capítulos al documento, ver
|
||||
`docs/capabilities/automatic_eda.md`.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Impura**: escribe el `.md` en `out_path` (crea los directorios padre). Con
|
||||
`meta['embed_figures']=True` además exporta un PNG `<basename>_figN.png` por figura
|
||||
junto al `.md`; por defecto NO exporta nada y el markdown queda autocontenido.
|
||||
- **Nunca lanza** (dict-no-throw): un bloque que falle se degrada a una nota y se anota
|
||||
en `note`; el documento se escribe igual. Un profile/lista vacíos producen un markdown
|
||||
mínimo válido con `*(documento vacío …)*` y `chapters=[]`.
|
||||
- **Figuras = datos, no imagen**: un bloque `figure` se serializa como `*Figura: caption*`
|
||||
más, si la figura matplotlib trae barras (histograma / barras), una tabla
|
||||
`| Desde | Hasta | Frecuencia |` extraída de los `Rectangle` patches (máx 100 filas;
|
||||
el resto se trunca con `*… (N filas más)*`). Si no hay barras o algo falla, solo sale
|
||||
el caption. La figura se cierra (`plt.close`) tras leerla.
|
||||
- **Glosario vs negrita**: se eliminan SOLO los marcadores de glosario
|
||||
`[[term:key]]visible[[/term]]` (queda `visible`); el `**negrita**` markdown SE
|
||||
CONSERVA (es válido). No se usa `strip_inline_md` aquí porque ese también quita el bold.
|
||||
- **Anclas del índice**: el `## Índice` enlaza cada capítulo con un ancla estilo GitHub
|
||||
del encabezado `## N. Título` (minúsculas, espacios→`-`, sin signos). Si dos capítulos
|
||||
comparten título exacto sus anclas colisionan (caso raro; los capítulos canónicos tienen
|
||||
títulos únicos).
|
||||
- **Tablas**: las celdas escapan `|` (→ `\|`) y pliegan saltos de línea a `<br>` para no
|
||||
romper la columna. No hay reparto por ancho — un LLM no lo necesita.
|
||||
@@ -0,0 +1,55 @@
|
||||
"""render_automatic_eda_markdown — chapter-based EDA report as one Markdown file.
|
||||
|
||||
Public ``eda``-group entry point that serializes an AutomaticEDA document (a list
|
||||
of chapters, or an ``eda`` TableProfile from which the canonical chapters are
|
||||
built) into a single self-contained Markdown file optimised to be **pasted into
|
||||
an LLM**: plain text, Markdown tables (every row dumped — there are no pages to
|
||||
cut), figures reduced to caption + underlying data, no binaries. It mirrors
|
||||
``render_automatic_eda_pdf`` / ``render_automatic_eda_pptx`` but for text output;
|
||||
unlike those it writes no manifest (KISS — Markdown is a single text artefact).
|
||||
|
||||
dict-no-throw: never raises. Returns ``{path, n_chars, chapters, note}``; on a
|
||||
fatal error ``path`` is None and ``note`` explains why.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datascience.automatic_eda import build_document, render_md
|
||||
from datascience.automatic_eda.model import as_chapter, as_chapters
|
||||
|
||||
|
||||
def _coerce_chapters(chapters_or_profile, meta: dict) -> list:
    """Normalise the first argument of the renderer into a list of chapters.

    * A dict that has ``blocks`` and no ``columns`` is treated as one chapter
      and converted with ``as_chapter`` (an empty list when that gives None).
    * Any other dict is treated as a profile and handed to ``build_document``
      together with ``meta['ctx']`` (``meta`` may be None).
    * A list or tuple is converted with ``as_chapters``.
    * Anything else yields an empty list.
    """
    source = chapters_or_profile

    if isinstance(source, dict):
        looks_like_chapter = "blocks" in source and "columns" not in source
        if not looks_like_chapter:
            return build_document(source, (meta or {}).get("ctx"))
        chapter = as_chapter(source)
        return [] if chapter is None else [chapter]

    if isinstance(source, (list, tuple)):
        return as_chapters(list(source))

    return []
|
||||
|
||||
|
||||
def render_automatic_eda_markdown(chapters_or_profile, out_path: str,
                                  meta: dict = None) -> dict:
    """Render an AutomaticEDA document into a single self-contained Markdown file.

    Args:
        chapters_or_profile: a list of chapters (``Chapter`` dataclasses or
            dicts) or an ``eda`` TableProfile dict (chapters built via
            ``build_document(profile, meta['ctx'])``).
        out_path: filesystem path for the ``.md`` (parent dirs are created).
        meta: optional dict. Recognised keys: ``title``, ``ctx`` (dict with
            ``dataset_name``/``source_origin``/``storage``/``n_rows``/``n_cols``),
            ``generated_at``, ``embed_figures`` (export PNGs beside the .md,
            default False — off keeps the Markdown self-contained).

    Returns:
        dict (never raises): ``{path: str|None, n_chars: int,
        chapters: list[{id, version}], note: str}``. On a fatal error ``path`` is
        None and ``note`` explains the cause. That includes failures that happen
        before rendering starts: a ``meta`` that cannot be copied into a dict,
        or an exception raised while building the chapters.
    """
    try:
        meta = dict(meta or {})
        chapters = _coerce_chapters(chapters_or_profile, meta)
        return render_md(chapters, out_path, meta)
    except Exception as exc:
        # Boundary of the dict-no-throw contract: report the failure in the
        # documented result shape instead of letting it reach the caller.
        return {
            "path": None,
            "n_chars": 0,
            "chapters": [],
            "note": (
                "render_automatic_eda_markdown falló: "
                f"{type(exc).__name__}: {exc}"
            ),
        }
|
||||
@@ -0,0 +1,168 @@
|
||||
"""Tests for render_automatic_eda_markdown — DoD: golden + edge + profile path.
|
||||
|
||||
Self-contained synthetic blocks (no DuckDB). Verifies every block kind serializes
|
||||
to Markdown (heading, markdown with glossary+bold, kv/data tables, a figure whose
|
||||
histogram bars become a data table, caption, note, group, glossary entry), that a
|
||||
leading level-1 heading equal to the chapter title is omitted, that an empty
|
||||
document degrades to a valid minimal Markdown without raising, and that passing a
|
||||
minimal TableProfile builds chapters and writes the file.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from datascience.render_automatic_eda_markdown import render_automatic_eda_markdown
|
||||
from datascience.automatic_eda.model import (
|
||||
Caption, Chapter, DataTable, Figure, GlossaryEntry, Group, Heading, KVTable,
|
||||
Markdown, Note,
|
||||
)
|
||||
|
||||
|
||||
def _hist_fig():
    """Build a 5-bin pyplot histogram figure over a fixed 12-value sample."""
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    sample = [1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5]
    figure, axes = plt.subplots()
    axes.hist(sample, bins=5)
    return figure
|
||||
|
||||
|
||||
def _chapters() -> list:
    """Return one synthetic chapter holding one block of every kind under test."""
    summary_table = KVTable(rows=[("Filas", 1000), ("Columnas", 5)], title="Resumen")
    data_table = DataTable(
        header=["col", "valor"],
        rows=[["alpha", "111"], ["beta", "222"], ["gamma", "333"]],
        title="Datos",
        note="nota inferior",
    )
    nested_group = Group(title="Grupo X", blocks=[Markdown("dentro del grupo")])
    glossary = GlossaryEntry(
        key="ent",
        label="Entropia",
        definition="Medida de incertidumbre.",
    )

    demo_blocks = [
        Heading("Demo", 1),  # Same text as the chapter title: expected to be omitted.
        Heading("Seccion dos", 2),  # Expected to render as a "####" heading.
        Markdown("Texto con [[term:ent]]entropia[[/term]] y **bold** aqui."),
        summary_table,
        data_table,
        Figure(make=_hist_fig, caption="Histograma demo"),
        Caption("pie de figura"),
        Note("una nota aparte"),
        nested_group,
        glossary,
    ]
    return [Chapter(id="demo", title="Demo", version="1.0.0", blocks=demo_blocks)]
|
||||
|
||||
|
||||
def _read(path: str) -> str:
    """Return the full text of the file at ``path``, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as stream:
        text = stream.read()
    return text
|
||||
|
||||
|
||||
def test_golden_bloques_sinteticos_serializa_todo_a_markdown():
    """Golden path: every synthetic block kind ends up in the written Markdown."""
    render_meta = {
        "title": "EDA Demo",
        "ctx": {"dataset_name": "Demo", "n_rows": 12, "n_cols": 2},
    }
    must_contain = (
        "## Índice",                        # Document index.
        "| ",                               # A Markdown table row...
        "| --- ",                           # ...and its separator row.
        "alpha", "111", "beta", "222", "gamma", "333",  # Every DataTable value.
        "**bold**",                         # Bold markup is kept.
        "entropia",                         # Visible glossary text is kept.
        "| Desde | Hasta | Frecuencia |",   # Histogram bars became a data table.
        "### Entropia",                     # Glossary entry as a level-3 heading.
        "#### Seccion dos",                 # Level-2 block heading.
        "### Grupo X",                      # Group title.
    )
    must_not_contain = (
        "[[term",       # Glossary opening marker stripped.
        "[[/term]]",    # Glossary closing marker stripped.
        "### Demo",     # Leading heading equal to the chapter title omitted.
    )

    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "demo.md")
        result = render_automatic_eda_markdown(_chapters(), target, render_meta)

        assert result["path"] == target
        assert os.path.exists(target)
        assert result["n_chars"] > 0
        assert result["chapters"] == [{"id": "demo", "version": "1.0.0"}]

        content = _read(target)
        assert content.startswith("# ")
        for fragment in must_contain:
            assert fragment in content
        for fragment in must_not_contain:
            assert fragment not in content
|
||||
|
||||
|
||||
def _hist_fig_with_span():
    """Build a 5-bin histogram figure with an ``axvspan`` band from x=2 to x=4.

    Mimics a distribution figure that shades a mean±σ band over the bars. The
    band is much wider than one bin, so it must not be mistaken for a bin
    when the bars are extracted into the bins table.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    sample = [1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 5]
    figure, axes = plt.subplots()
    axes.hist(sample, bins=5)
    axes.axvspan(2.0, 4.0, alpha=0.2)  # The wide stray band over the bars.
    return figure
|
||||
|
||||
|
||||
def test_figura_descarta_axvspan_de_la_tabla_de_bins():
    """The shaded band over the histogram must not become a row of the bins table."""
    table_header = "| Desde | Hasta | Frecuencia |"
    figure_block = Figure(make=_hist_fig_with_span, caption="Hist con banda")
    chapter = Chapter(id="f", title="Fig", version="1.0.0", blocks=[figure_block])

    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "fig.md")
        render_automatic_eda_markdown([chapter], target, {"title": "T"})
        content = _read(target)

        assert table_header in content

        # The table body is every "|"-prefixed line after the header and its
        # separator, up to the first line that is not a table row.
        all_lines = content.splitlines()
        header_at = next(
            idx for idx, line in enumerate(all_lines)
            if line.startswith(table_header)
        )
        bin_rows = []
        for line in all_lines[header_at + 2:]:
            if not line.startswith("|"):
                break
            bin_rows.append(line)

        # Exactly the 5 histogram bins: no extra row for the band.
        assert len(bin_rows) == 5, bin_rows

        # And no row is as wide as the band (x=2 to x=4, width 2.0).
        for line in bin_rows:
            cells = [cell.strip() for cell in line.strip("|").split("|")]
            low, high = float(cells[0]), float(cells[1])
            assert (high - low) < 1.5, f"wide span leaked: {line}"
|
||||
|
||||
|
||||
def test_edge_documento_vacio_no_revienta():
    """An empty chapter list still writes a minimal, valid Markdown document."""
    with tempfile.TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "empty.md")
        result = render_automatic_eda_markdown([], target, {})

        assert result["path"] == target
        assert os.path.exists(target)
        assert result["chapters"] == []

        written = _read(target)
        assert "documento vacío" in written
        assert written.startswith("# ")
|
||||
|
||||
|
||||
def test_profile_path_construye_capitulos_y_escribe():
    """Passing a profile dict (instead of chapters) still writes a non-empty file."""
    numeric_stats = {"mean": 1.0, "median": 1.0, "min": 0.0, "max": 2.0,
                     "std": 0.5}
    column_x = {
        "name": "x",
        "inferred_type": "numeric",
        "null_pct": 0.0,
        "null_count": 0,
        "numeric": numeric_stats,
    }
    profile = {
        "table": "mini",
        "source": "/data/mini.csv",
        "n_rows": 10,
        "n_cols": 1,
        "quality_score": 88.0,
        "columns": [column_x],
    }
    options = {"title": "Mini", "ctx": {"dataset_name": "Mini"}}

    with tempfile.TemporaryDirectory() as tmp_dir:
        target = os.path.join(tmp_dir, "mini.md")
        result = render_automatic_eda_markdown(profile, target, options)

        # A real path (not None) means no exception and the file was written.
        assert result["path"] == target
        assert os.path.exists(target)
        assert result["n_chars"] > 0
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
name: suggest_intratable_fk_candidates
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def suggest_intratable_fk_candidates(profile: dict, max_candidates: int = 20) -> list"
|
||||
description: "Sobre el TableProfile de UNA tabla (el dict de profile_table), sugiere por heuristica de nombre + cardinalidad que columnas PARECEN una clave foranea hacia otra tabla, cuando no hay relaciones inter-tabla que medir (una sola tabla). Es una SUGERENCIA, no una afirmacion: el ref_table_guess es el stem del nombre (customer_id -> customer) y NO confirma containment. Pura: solo lee el dict, sin I/O; nunca lanza (devuelve [])."
|
||||
tags: [eda, datascience, relationships, foreign-key, fk, heuristic, schema, python]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
params:
|
||||
- name: profile
|
||||
desc: "TableProfile (dict que produce profile_table / summarize_table_*). Se leen de forma defensiva `columns` (lista de ColumnProfile con name/inferred_type/physical_type/distinct_count/unique_pct/flags), `n_rows` (int) y `key_candidates` (lista de nombres de columna ya candidatos a PK, que se excluyen). Si no es dict o no trae columns -> []."
|
||||
- name: max_candidates
|
||||
desc: "Tope de sugerencias devueltas (default 20). Las columnas candidatas se ordenan por distinct_count descendente (mas informativas primero) antes de cortar a este maximo."
|
||||
output: "list (posiblemente vacia) de dicts, uno por columna sugerida, con claves: `column` (nombre), `ref_table_guess` (tabla conjeturada por el stem del nombre, p.ej. customer_id -> 'customer'), `reason` (frase humana que deja claro que es heuristica sin confirmar containment), `distinct_count` (int|None), `unique_pct` (float|None, fraccion 0-1 tal como viene del profile), `inferred_type` (str), `physical_type` (str). Nunca lanza."
|
||||
tested: true
|
||||
tests: ["test_golden_customer_id_detectado_otras_no", "test_camelcase_albumid_detectado", "test_constante_status_id_no_aparece", "test_profile_vacio_y_none_devuelven_lista_vacia", "test_category_id_casi_unico_parece_pk_no_aparece", "test_ref_table_guess_multitoken_y_orden_por_distinct", "test_max_candidates_corta_la_lista", "test_id_generico_solo_nunca_es_fk"]
|
||||
test_file_path: "python/functions/datascience/suggest_intratable_fk_candidates_test.py"
|
||||
file_path: "python/functions/datascience/suggest_intratable_fk_candidates.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience import suggest_intratable_fk_candidates
|
||||
|
||||
# TableProfile de UNA tabla (tipo titanic): customer_id es FK N:1; id es la PK;
|
||||
# amount es una medida float; name es categorica sin sufijo de id.
|
||||
profile = {
|
||||
"n_rows": 891,
|
||||
"key_candidates": ["id"],
|
||||
"columns": [
|
||||
{"name": "id", "inferred_type": "numeric", "physical_type": "BIGINT",
|
||||
"distinct_count": 891, "unique_pct": 1.0, "flags": ["possible_id"]},
|
||||
{"name": "customer_id", "inferred_type": "numeric", "physical_type": "BIGINT",
|
||||
"distinct_count": 137, "unique_pct": 0.15, "flags": []},
|
||||
{"name": "amount", "inferred_type": "numeric", "physical_type": "DOUBLE",
|
||||
"distinct_count": 400, "unique_pct": 0.45, "flags": []},
|
||||
{"name": "name", "inferred_type": "categorical", "physical_type": "VARCHAR",
|
||||
"distinct_count": 700, "unique_pct": 0.78, "flags": []},
|
||||
],
|
||||
}
|
||||
|
||||
out = suggest_intratable_fk_candidates(profile)
|
||||
[c["column"] for c in out] # -> ["customer_id"]
|
||||
out[0]["ref_table_guess"] # -> "customer"
|
||||
out[0]["reason"]
|
||||
# -> "el nombre termina en '_id' y es N:1 (137 valores distintos < 891 filas):
|
||||
# parece (heuristica por nombre, sin confirmar containment) una referencia a
|
||||
# una tabla «customer»"
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Cuando el EDA tiene SOLO UNA tabla y, por tanto, no se puede inferir una FK
|
||||
inter-tabla por containment (no hay otra tabla cuyos valores contener). Es el plan B
|
||||
del capitulo RELACIONES de AutomaticEDA: en vez de medir solapamiento de valores
|
||||
entre tablas (lo correcto cuando hay varias, ver `infer_fk_containment_duckdb` /
|
||||
`build_join_graph`), conjetura por el NOMBRE de la columna (`<algo>_id`) y por su
|
||||
CARDINALIDAD N:1 que columnas parecen apuntar a una entidad externa. Usala para
|
||||
enriquecer el reporte con "estas columnas parecen referencias a otras tablas" sin
|
||||
prometer que esa tabla exista. NO la uses si tienes varias tablas: ahi mide
|
||||
containment de verdad.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- Es **heuristica**, no una verdad: produce **falsos positivos** (una columna
|
||||
`period_id` que en realidad es un codigo libre, no una FK) y **falsos negativos**
|
||||
(una FK que no se llama `*_id`, p.ej. `parent`, `owner`, `sku`). No la trates como
|
||||
una afirmacion de esquema.
|
||||
- `ref_table_guess` es una **conjetura por el nombre** (el stem sin el sufijo id):
|
||||
`customer_id` -> `customer`, `AlbumId` -> `album`, `manager_staff_id` ->
|
||||
`manager_staff`. Puede no coincidir con el nombre real de la tabla (plurales,
|
||||
prefijos, alias). Es una pista, no un join garantizado.
|
||||
- **NO confirma containment**: no comprueba que los valores de la columna existan en
|
||||
ninguna otra tabla (no puede — solo recibe el perfil de una tabla). Para confirmar
|
||||
una FK real con varias tablas usa `infer_fk_containment_duckdb`.
|
||||
- Excluye deliberadamente: el `id`/`Id`/`ID` generico a secas (suele ser la PK
|
||||
propia, no una referencia), las columnas constantes, las que parecen unicas
|
||||
(`unique_pct >= 0.99`, mas PK que FK) y los tipos no-clave (float/decimal son
|
||||
medidas; date/time/timestamp y boolean no son claves). En camelCase, `paid`,
|
||||
`valid`, `grid` (con `id` en minuscula y sin separador) NO se confunden con FK.
|
||||
- `unique_pct` se interpreta como **fraccion 0-1** (tal como la emite el profile), no
|
||||
como porcentaje 0-100.
|
||||
@@ -0,0 +1,202 @@
|
||||
"""suggest_intratable_fk_candidates — heuristica de FK intra-tabla del grupo `eda`.
|
||||
|
||||
Sobre el TableProfile de UNA tabla (el dict que produce ``profile_table``), sugiere
|
||||
por heuristica de NOMBRE + CARDINALIDAD que columnas PARECEN una clave foranea hacia
|
||||
otra tabla, util cuando no hay relaciones inter-tabla disponibles (una sola tabla y,
|
||||
por tanto, sin containment cruzado que medir). Es una SUGERENCIA, no una afirmacion:
|
||||
no confirma que exista la tabla referida ni que los valores esten contenidos en ella.
|
||||
|
||||
La consume el capitulo RELACIONES de AutomaticEDA cuando solo hay una tabla.
|
||||
|
||||
Funcion PURA: solo lee el dict (lectura defensiva con ``.get``), no hace I/O y nunca
|
||||
lanza por inputs raros (devuelve ``[]``).
|
||||
"""
|
||||
|
||||
# inferred_type que es compatible con una clave foranea (entero/categorico).
|
||||
_FK_INFERRED_OK = {"numeric", "categorical", "integer"}
|
||||
|
||||
# Prefijos de physical_type que admiten ser clave foranea (enteros, texto, uuid).
|
||||
_FK_PHYSICAL_PREFIXES = (
|
||||
"int", "bigint", "smallint", "tinyint", "hugeint", "uint",
|
||||
"varchar", "text", "char", "bpchar", "string", "uuid",
|
||||
)
|
||||
|
||||
# Prefijos de physical_type que EXCLUYEN ser clave foranea: medidas en coma flotante
|
||||
# (float/double/decimal/numeric/real), temporales (date/time/timestamp/interval) y
|
||||
# boolean. Se comprueban ANTES que las senales positivas (la exclusion gana: una
|
||||
# columna numeric con physical DOUBLE es una medida, no una FK).
|
||||
_FK_PHYSICAL_EXCLUDE = (
|
||||
"float", "double", "decimal", "numeric", "real",
|
||||
"date", "time", "timestamp", "interval",
|
||||
"bool",
|
||||
)
|
||||
|
||||
|
||||
def _fk_name_signal(name):
|
||||
"""Detecta el sufijo de clave foranea en el nombre y devuelve ``(stem, sufijo)``.
|
||||
|
||||
Reconoce ``<algo>_id`` (snake), ``<Algo>Id`` y ``<algo>ID`` (camel). NO reconoce
|
||||
el ``id``/``Id``/``ID`` generico a secas (suele ser la PK propia de la tabla, no
|
||||
una referencia). En camelCase la ``I`` mayuscula marca el limite de palabra, asi
|
||||
que ``paid``/``valid``/``grid`` (``id`` en minuscula y sin separador) NO matchean.
|
||||
|
||||
El ``stem`` se devuelve en minusculas y sirve de ``ref_table_guess`` (la tabla a
|
||||
la que probablemente apunta): ``customer_id`` -> ``"customer"``, ``AlbumId`` ->
|
||||
``"album"``, ``manager_staff_id`` -> ``"manager_staff"``. Devuelve ``None`` si no
|
||||
hay senal de nombre.
|
||||
"""
|
||||
if not isinstance(name, str):
|
||||
return None
|
||||
raw = name.strip()
|
||||
if not raw:
|
||||
return None
|
||||
# Snake: termina en "_id" (indiferente a mayusculas en la parte "id").
|
||||
if raw.lower().endswith("_id"):
|
||||
stem = raw[:-3].rstrip("_-. ")
|
||||
if not stem:
|
||||
return None
|
||||
return (stem.lower(), "_id")
|
||||
# Camel todo-mayuscula: "...ID" (p.ej. customerID).
|
||||
if raw.endswith("ID"):
|
||||
stem = raw[:-2].rstrip("_-. ")
|
||||
if not stem:
|
||||
return None
|
||||
return (stem.lower(), "ID")
|
||||
# Camel: "...Id" (p.ej. AlbumId).
|
||||
if raw.endswith("Id"):
|
||||
stem = raw[:-2].rstrip("_-. ")
|
||||
if not stem:
|
||||
return None
|
||||
return (stem.lower(), "Id")
|
||||
return None
|
||||
|
||||
|
||||
def _fk_type_compatible(col):
    """Return True when the column's type is plausible for a foreign key.

    A column is compatible when its ``physical_type`` does NOT start with any
    prefix in ``_FK_PHYSICAL_EXCLUDE`` and, in addition, either its
    ``inferred_type`` is in ``_FK_INFERRED_OK`` or its ``physical_type``
    starts with a prefix in ``_FK_PHYSICAL_PREFIXES``. Both fields are
    compared after stripping whitespace and lower-casing.

    Missing or non-string type fields are treated as empty strings rather than
    raising, so a malformed profile cannot break callers that promise never to
    raise; a non-dict ``col`` is simply not compatible.

    Args:
        col: Column profile dict; only ``physical_type`` and ``inferred_type``
            are read.

    Returns:
        bool: True if the type admits being a foreign key, False otherwise.
    """
    if not isinstance(col, dict):
        return False

    phys = col.get("physical_type")
    phys = phys.strip().lower() if isinstance(phys, str) else ""
    inferred = col.get("inferred_type")
    inferred = inferred.strip().lower() if isinstance(inferred, str) else ""

    # Exclusion by physical type beats every positive signal.
    if phys.startswith(_FK_PHYSICAL_EXCLUDE):
        return False

    # Positive signal: inferred type first, then physical type prefix.
    return inferred in _FK_INFERRED_OK or phys.startswith(_FK_PHYSICAL_PREFIXES)
|
||||
|
||||
|
||||
def suggest_intratable_fk_candidates(profile: dict, max_candidates: int = 20) -> list:
    """Suggest columns that look like foreign keys, using name + cardinality.

    This is a heuristic, not a statement about the schema. A column is
    suggested when ALL of the following hold:

    1. its name carries an id suffix with a non-empty stem (see
       ``_fk_name_signal``; a bare ``id`` never qualifies);
    2. its name is not listed in ``profile["key_candidates"]``;
    3. it is not flagged ``"constant"`` and ``distinct_count`` is an int >= 2;
    4. it is not unique: ``distinct_count < n_rows`` (only checked when
       ``n_rows`` is a positive int) and ``unique_pct < 0.99`` (only checked
       when ``unique_pct`` is a number; it is read as a 0-1 fraction);
    5. its type is key-compatible (see ``_fk_type_compatible``).

    Malformed pieces of the profile are skipped or ignored instead of raising:
    non-dict columns, unhashable or non-string entries in ``key_candidates``,
    ``flags`` that is not a container, non-string type fields, and a
    ``max_candidates`` that cannot be converted to int (the default 20 is
    used instead).

    Args:
        profile: Table profile dict. ``columns`` (list of column dicts),
            ``n_rows`` and ``key_candidates`` are read defensively.
        max_candidates: Maximum number of suggestions returned; negative
            values are treated as 0.

    Returns:
        list: Possibly empty list of dicts with keys ``column``,
        ``ref_table_guess`` (lower-cased name stem), ``reason``,
        ``distinct_count``, ``unique_pct`` (float or None), ``inferred_type``
        and ``physical_type``, sorted by ``distinct_count`` descending and then
        truncated to ``max_candidates``. Returns ``[]`` when ``profile`` is not
        a dict or ``columns`` is not a list.
    """
    if not isinstance(profile, dict):
        return []
    columns = profile.get("columns")
    if not isinstance(columns, list):
        return []

    def _is_plain_int(value):
        # bool is a subclass of int; True/False are never valid counts here.
        return isinstance(value, int) and not isinstance(value, bool)

    # Resolve the cap up front so a bad value cannot raise after all the work.
    try:
        limit = max(0, int(max_candidates))
    except (TypeError, ValueError, OverflowError):
        limit = 20

    n_rows = profile.get("n_rows")
    has_n_rows = _is_plain_int(n_rows) and n_rows > 0

    key_candidates = profile.get("key_candidates")
    if not isinstance(key_candidates, (list, tuple, set, frozenset)):
        key_candidates = []
    # Column names that reach the membership test are always str, so keeping
    # only str entries loses nothing and avoids TypeError on unhashable items.
    key_set = {key for key in key_candidates if isinstance(key, str)}

    out = []
    for col in columns:
        if not isinstance(col, dict):
            continue
        name = col.get("name")

        # 1) Name signal: id suffix with a non-empty stem.
        signal = _fk_name_signal(name)
        if signal is None:
            continue
        ref_guess, suffix = signal

        # 2) Not already a primary-key candidate of this table.
        if name in key_set:
            continue

        # 3) Not constant, and at least 2 distinct values.
        flags = col.get("flags")
        if isinstance(flags, str):
            # A lone string is one flag; wrapping it avoids substring matches
            # such as "constant" in "non_constant".
            flags = [flags]
        elif not isinstance(flags, (list, tuple, set, frozenset, dict)):
            flags = []
        if "constant" in flags:
            continue
        dc = col.get("distinct_count")
        if not (_is_plain_int(dc) and dc >= 2):
            continue

        # 4) Fewer distinct values than rows, and not near-unique.
        if has_n_rows and dc >= n_rows:
            continue
        unique_pct = col.get("unique_pct")
        has_unique = (
            isinstance(unique_pct, (int, float)) and not isinstance(unique_pct, bool)
        )
        if has_unique and unique_pct >= 0.99:
            continue

        # 5) Key-compatible type. Non-string type fields are normalised to ""
        #    before the check so a malformed profile cannot make it raise.
        inferred_type = col.get("inferred_type")
        if not isinstance(inferred_type, str):
            inferred_type = ""
        physical_type = col.get("physical_type")
        if not isinstance(physical_type, str):
            physical_type = ""
        typed_col = {**col, "inferred_type": inferred_type,
                     "physical_type": physical_type}
        if not _fk_type_compatible(typed_col):
            continue

        out.append(
            {
                "column": name,
                "ref_table_guess": ref_guess,
                "reason": _build_reason(
                    suffix, dc, n_rows if has_n_rows else None, ref_guess
                ),
                "distinct_count": dc,
                "unique_pct": float(unique_pct) if has_unique else None,
                "inferred_type": inferred_type,
                "physical_type": physical_type,
            }
        )

    # Highest cardinality first (stable for ties), then apply the cap.
    out.sort(key=lambda d: d.get("distinct_count") or 0, reverse=True)
    return out[:limit]
|
||||
|
||||
|
||||
def _build_reason(suffix, dc, n_rows, ref_guess):
|
||||
"""Frase humana que deja claro que la sugerencia es heuristica, no confirmada."""
|
||||
if n_rows is not None:
|
||||
card = f"es N:1 ({dc} valores distintos < {n_rows} filas)"
|
||||
else:
|
||||
card = f"tiene {dc} valores distintos que se repiten (cardinalidad N:1)"
|
||||
return (
|
||||
f"el nombre termina en '{suffix}' y {card}: parece (heuristica por nombre, "
|
||||
f"sin confirmar containment) una referencia a una tabla «{ref_guess}»"
|
||||
)
|
||||
@@ -0,0 +1,157 @@
|
||||
"""Tests para suggest_intratable_fk_candidates (funcion pura, sin I/O)."""
|
||||
|
||||
from suggest_intratable_fk_candidates import suggest_intratable_fk_candidates
|
||||
|
||||
|
||||
def _col(name, inferred_type="numeric", physical_type="BIGINT", distinct_count=10,
|
||||
unique_pct=0.1, flags=None):
|
||||
"""Construye un ColumnProfile minimo a mano (el dict que emite profile_table)."""
|
||||
return {
|
||||
"name": name,
|
||||
"inferred_type": inferred_type,
|
||||
"physical_type": physical_type,
|
||||
"semantic_type": "",
|
||||
"distinct_count": distinct_count,
|
||||
"unique_pct": unique_pct,
|
||||
"null_count": 0,
|
||||
"null_pct": 0.0,
|
||||
"flags": list(flags) if flags else [],
|
||||
}
|
||||
|
||||
|
||||
def test_golden_customer_id_detectado_otras_no():
    """Golden case: of four columns only ``customer_id`` is suggested."""
    # id is the primary key, customer_id an N:1 reference, amount a DOUBLE
    # measure, and name a categorical column without an id suffix.
    primary_key = _col("id", inferred_type="numeric", physical_type="BIGINT",
                       distinct_count=891, unique_pct=1.0, flags=["possible_id"])
    reference = _col("customer_id", inferred_type="numeric", physical_type="BIGINT",
                     distinct_count=137, unique_pct=0.15, flags=[])
    measure = _col("amount", inferred_type="numeric", physical_type="DOUBLE",
                   distinct_count=400, unique_pct=0.45)
    label = _col("name", inferred_type="categorical", physical_type="VARCHAR",
                 distinct_count=700, unique_pct=0.78)
    profile = {
        "n_rows": 891,
        "key_candidates": ["id"],
        "columns": [primary_key, reference, measure, label],
    }

    suggestions = suggest_intratable_fk_candidates(profile)

    assert isinstance(suggestions, list)
    assert [item["column"] for item in suggestions] == ["customer_id"]

    only = suggestions[0]
    assert only["ref_table_guess"] == "customer"
    assert only["distinct_count"] == 137
    assert only["unique_pct"] == 0.15
    assert only["inferred_type"] == "numeric"
    assert only["physical_type"] == "BIGINT"

    # The reason quotes both the guessed table and the matched suffix.
    reason = only["reason"]
    assert "customer" in reason
    assert "_id" in reason
|
||||
|
||||
|
||||
def test_camelcase_albumid_detectado():
    """A camelCase ``AlbumId`` (VARCHAR) is suggested with guess ``album``."""
    album = _col("AlbumId", inferred_type="categorical", physical_type="VARCHAR",
                 distinct_count=347, unique_pct=0.10)
    profile = dict(n_rows=3503, key_candidates=["TrackId"], columns=[album])

    suggestions = suggest_intratable_fk_candidates(profile)

    # TrackId is listed as a PK candidate; AlbumId is not, so AlbumId is returned.
    names = [item["column"] for item in suggestions]
    assert names == ["AlbumId"]
    assert suggestions[0]["ref_table_guess"] == "album"
|
||||
|
||||
|
||||
def test_constante_status_id_no_aparece():
    """A constant ``status_id`` (flag ``constant``, 1 distinct value) is not suggested."""
    constant_column = _col("status_id", inferred_type="numeric",
                           physical_type="INTEGER", distinct_count=1,
                           unique_pct=0.001, flags=["constant"])
    profile = dict(n_rows=1000, key_candidates=[], columns=[constant_column])

    suggestions = suggest_intratable_fk_candidates(profile)

    assert suggestions == []
|
||||
|
||||
|
||||
def test_profile_vacio_y_none_devuelven_lista_vacia():
    """Empty, None or malformed profiles return ``[]`` instead of raising."""
    degenerate_profiles = (
        {},                            # empty profile
        None,                          # not a dict at all
        {"n_rows": 10},                # no "columns" key
        {"columns": "no-soy-lista"},   # "columns" is not a list
    )
    for bad_profile in degenerate_profiles:
        assert suggest_intratable_fk_candidates(bad_profile) == []
|
||||
|
||||
|
||||
def test_category_id_casi_unico_parece_pk_no_aparece():
    """A near-unique column (unique_pct 0.999) looks like a PK and is not suggested."""
    near_unique = _col("category_id", inferred_type="numeric",
                       physical_type="BIGINT", distinct_count=890,
                       unique_pct=0.999)
    profile = dict(n_rows=891, key_candidates=[], columns=[near_unique])

    suggestions = suggest_intratable_fk_candidates(profile)

    assert suggestions == []
|
||||
|
||||
|
||||
def test_ref_table_guess_multitoken_y_orden_por_distinct():
    """Multi-token stems keep their underscores; results sort by distinct_count desc."""
    # staff_id is the primary key in this table, so it must not be suggested.
    staff = _col("staff_id", inferred_type="numeric", physical_type="BIGINT",
                 distinct_count=10000, unique_pct=1.0, flags=["possible_id"])
    store = _col("store_id", inferred_type="numeric", physical_type="INTEGER",
                 distinct_count=2, unique_pct=0.0002)
    manager = _col("manager_staff_id", inferred_type="numeric",
                   physical_type="INTEGER", distinct_count=40, unique_pct=0.004)
    profile = dict(n_rows=10000, key_candidates=["staff_id"],
                   columns=[staff, store, manager])

    suggestions = suggest_intratable_fk_candidates(profile)

    # staff_id excluded (PK); the remaining two come back by distinct_count desc.
    ordered_names = [item["column"] for item in suggestions]
    assert ordered_names == ["manager_staff_id", "store_id"]

    guess_by_column = {item["column"]: item["ref_table_guess"]
                       for item in suggestions}
    assert guess_by_column["manager_staff_id"] == "manager_staff"
    assert guess_by_column["store_id"] == "store"
|
||||
|
||||
|
||||
def test_max_candidates_corta_la_lista():
    """``max_candidates`` caps how many suggestions are returned."""
    specs = (("a_id", 300, 0.03), ("b_id", 200, 0.02), ("c_id", 100, 0.01))
    columns = [_col(col_name, distinct_count=distinct, unique_pct=pct)
               for col_name, distinct, pct in specs]
    profile = dict(n_rows=10000, key_candidates=[], columns=columns)

    suggestions = suggest_intratable_fk_candidates(profile, max_candidates=2)

    assert [item["column"] for item in suggestions] == ["a_id", "b_id"]
|
||||
|
||||
|
||||
def test_id_generico_solo_nunca_es_fk():
    """A bare ``id`` / ``Id`` / ``ID`` (no stem) is never suggested."""
    bare_ids = [
        _col("id", distinct_count=500, unique_pct=1.0),
        _col("Id", distinct_count=120, unique_pct=0.24),
        _col("ID", distinct_count=80, unique_pct=0.16),
    ]
    profile = dict(n_rows=500, key_candidates=[], columns=bare_ids)

    suggestions = suggest_intratable_fk_candidates(profile)

    assert suggestions == []
|
||||
@@ -0,0 +1,79 @@
|
||||
---
|
||||
name: summarize_outlier_dims
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "def summarize_outlier_dims(raw_numeric: dict, outlier_rows: list, top_k: int = 3) -> list"
|
||||
description: "Explica QUE columnas hacen rara cada fila anomala detectada por isolation_forest_outliers. Para cada {row_index, score} reconstruye la fila valida (mismo filtro de columnas numericas y mismo descarte de filas con None que el detector, asi row_index coincide) y devuelve las top_k columnas de mayor |z-score| poblacional (ddof=0). Capa de explicabilidad del paso de outliers multivariante en EDA. Pura y determinista; ante entradas vacias/invalidas o sin filas validas devuelve [] sin petar."
|
||||
tags: [eda, models, outliers, anomaly-detection, explainability, z-score, multivariate]
|
||||
params:
|
||||
- name: raw_numeric
|
||||
desc: "dict {nombre_columna: [valores]} alineado por fila (como ctx['raw_numeric'] del motor AutomaticEDA). Solo se usan columnas con todos los valores numericos (None permitido por fila; bool/str/NaN/Inf descartan la columna entera) — filtro IDENTICO al de isolation_forest_outliers para que row_index coincida."
|
||||
- name: outlier_rows
|
||||
desc: "Lista de {row_index, score} tal cual la devuelve isolation_forest_outliers. row_index cuenta SOLO las filas validas (sin None) en orden de aparicion, base 0. Entradas fuera de rango o malformadas se ignoran defensivamente."
|
||||
- name: top_k
|
||||
desc: "Numero de columnas (las de mayor |z-score|) a reportar por outlier. Default 3. Valores invalidos (no-int, bool, <1) caen a 3."
|
||||
output: "Lista en el mismo orden que outlier_rows (una entrada por cada outlier valido; las entradas fuera de rango/malformadas se omiten, por lo que puede ser mas corta) de dicts {row_index: int, score: float, dims: [{col: str, value: float, z: float}, ...]}. dims trae hasta top_k columnas ordenadas por |z| descendente, con z (z-score poblacional, ddof=0) redondeado a 3 decimales; si una columna tiene std==0 su z es 0. Ante raw_numeric vacio/no-dict, outlier_rows no-lista, 0 columnas numericas o 0 filas validas devuelve []."
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: []
|
||||
tested: true
|
||||
tests: ["test_row_index_skips_none_rows", "test_extreme_row_flagged_via_isolation", "test_out_of_range_row_index_is_ignored", "test_degrades_to_empty_on_invalid_inputs"]
|
||||
test_file_path: "python/functions/datascience/summarize_outlier_dims_test.py"
|
||||
file_path: "python/functions/datascience/summarize_outlier_dims.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from datascience import isolation_forest_outliers, summarize_outlier_dims
|
||||
|
||||
# Nube densa alrededor del origen + 1 fila con un valor extremo en "c".
|
||||
raw_numeric = {
|
||||
"a": [0.1, 0.2, -0.1, 0.0, 0.3, -0.2, 0.15, -0.05, 0.25, 0.2, -0.3, 0.1],
|
||||
"b": [1.0, 1.1, 0.9, 1.2, 0.8, 1.0, 1.1, 0.95, 1.05, 0.9, 1.15, 1.0],
|
||||
"c": [5.0, 5.2, 4.8, 5.1, 4.9, 5.0, 4.95, 5.05, 4.9, 500.0, 5.1, 5.0],
|
||||
}
|
||||
|
||||
result = isolation_forest_outliers(raw_numeric, contamination=0.1)
|
||||
summary = summarize_outlier_dims(raw_numeric, result["outlier_rows"], top_k=3)
|
||||
|
||||
for item in summary:
|
||||
top = item["dims"][0]
|
||||
print(item["row_index"], top["col"], top["value"], top["z"])
|
||||
# La fila del valor 500 sale con dim top "c" y |z| alto: es lo que la hace rara.
|
||||
```
|
||||
|
||||
## Cuando usarla
|
||||
|
||||
Justo **despues** de `isolation_forest_outliers`, cuando ya sabes QUE filas son
|
||||
anomalas y quieres explicar POR QUE: en que columnas se desvian mas respecto al
|
||||
resto. Util para rellenar la seccion de outliers de un report/notebook EDA con
|
||||
"la fila 9 es rara sobre todo por `c` (z=+3.3)" en lugar de solo un row_index
|
||||
opaco. Pasa el mismo `raw_numeric` que diste al detector y su `outlier_rows`
|
||||
intacto; el `row_index` apunta a la misma fila porque ambas funciones aplican el
|
||||
mismo filtro de columnas y el mismo descarte de filas con None.
|
||||
|
||||
## Gotchas
|
||||
|
||||
- **Mismo `raw_numeric` que el detector**: el `row_index` solo coincide si pasas
|
||||
el mismo dict de columnas (mismo orden, mismas listas) con el que llamaste a
|
||||
`isolation_forest_outliers`. Si cambias las columnas o el orden, los indices
|
||||
dejan de mapear.
|
||||
- **`row_index` es relativo a las filas validas**: las filas con `None` en
|
||||
cualquier columna usada se descartan y los indices se recalculan sobre las que
|
||||
quedan (base 0, orden de aparicion). No mapea 1:1 con las listas de entrada si
|
||||
hay None.
|
||||
- **z-score poblacional (ddof=0)**: se usa la desviacion tipica poblacional,
  consistente con el escalado del detector. Columnas con `std==0` (todos los
  valores iguales) dan `z=0`: quedan al final del ranking y solo aparecen en
  `dims` (con `z=0`) si `top_k` alcanza para incluirlas.
|
||||
- **Devuelve `[]` en vez de petar**: entrada no-dict/no-lista, 0 columnas
|
||||
numericas, 0 filas validas, o todas las entradas fuera de rango -> lista vacia.
|
||||
No lanza excepciones.
|
||||
- **No llama a `isolation_forest_outliers`**: solo consume su salida. Es una
|
||||
funcion independiente (no la importa), por eso `uses_functions` esta vacio.
|
||||
@@ -0,0 +1,144 @@
|
||||
"""Explica que dimensiones (columnas) hacen rara cada fila anomala.
|
||||
|
||||
Toma la salida multivariante de `isolation_forest_outliers` (lista de
|
||||
`{row_index, score}`) y, para cada outlier, devuelve las columnas con mayor
|
||||
|z-score| respecto a la distribucion de las filas validas. Es la capa de
|
||||
"explicabilidad" del paso de outliers multivariante en la fase EDA: el
|
||||
Isolation Forest dice QUE filas son raras, esta funcion dice POR QUE (en que
|
||||
columnas se desvian mas).
|
||||
|
||||
Pura y determinista: reconstruye EXACTAMENTE las mismas "filas validas" que usa
|
||||
`isolation_forest_outliers` (mismo filtro de columnas numericas y mismo descarte
|
||||
de filas con None), de modo que el `row_index` apunta a la misma fila en ambas
|
||||
funciones. No hace I/O ni depende de estado.
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _is_finite_number(v) -> bool:
|
||||
"""True si v es int/float finito. bool NO cuenta; NaN/Inf tampoco."""
|
||||
if isinstance(v, bool):
|
||||
return False
|
||||
if not isinstance(v, (int, float)):
|
||||
return False
|
||||
if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def summarize_outlier_dims(
    raw_numeric: dict,
    outlier_rows: list,
    top_k: int = 3,
) -> list:
    """Report, for each flagged row, the columns with the largest |z-score|.

    Args:
        raw_numeric: Mapping ``{column_name: [values]}`` aligned by row. A
            column is used only if it is a list/tuple and every non-None value
            passes ``_is_finite_number``; any other value drops the whole
            column.
        outlier_rows: List of ``{row_index, score}`` dicts. ``row_index``
            addresses the sequence of rows that have no None in any used
            column, in order of appearance, starting at 0.
        top_k: How many columns to report per row. Anything that is not an
            int >= 1 (bool included) falls back to 3.

    Returns:
        List of ``{"row_index", "score", "dims"}`` dicts in the order of
        ``outlier_rows``. ``dims`` holds up to ``top_k`` entries
        ``{"col", "value", "z"}`` sorted by |z| descending, with ``z`` the
        population z-score (ddof=0) rounded to 3 decimals and 0.0 for columns
        whose standard deviation is 0. Entries that are not dicts or whose
        ``row_index`` is not an in-range int are skipped; a ``score`` that
        cannot be converted to float becomes 0.0. Returns ``[]`` when the
        arguments have the wrong type, no column is usable or no complete row
        exists.
    """
    if not (isinstance(raw_numeric, dict) and isinstance(outlier_rows, list)):
        return []
    if isinstance(top_k, bool) or not isinstance(top_k, int) or top_k < 1:
        top_k = 3

    # Keep the columns whose non-None values are all finite numbers.
    usable = {
        col: list(vals)
        for col, vals in raw_numeric.items()
        if isinstance(vals, (list, tuple))
        and all(v is None or _is_finite_number(v) for v in vals)
    }
    if not usable:
        return []
    names = list(usable)

    # Walk the columns in lockstep (zip stops at the shortest one) and keep
    # only the rows without any None. Dropped rows do not consume an index,
    # which is what makes row_index line up with this sequence.
    complete_rows = [
        [float(v) for v in candidate]
        for candidate in zip(*(usable[col] for col in names))
        if all(v is not None for v in candidate)
    ]
    if not complete_rows:
        return []

    matrix = np.asarray(complete_rows, dtype=float)
    n_valid = matrix.shape[0]
    means = matrix.mean(axis=0)
    stds = matrix.std(axis=0, ddof=0)  # population standard deviation

    out = []
    for item in outlier_rows:
        if not isinstance(item, dict):
            continue
        idx = item.get("row_index")
        # bool is a subclass of int, so exclude it before the int check.
        if isinstance(idx, bool) or not isinstance(idx, int):
            continue
        if not 0 <= idx < n_valid:
            continue

        try:
            score = float(item.get("score"))
        except (TypeError, ValueError):
            score = 0.0

        values = matrix[idx]
        scored = [
            {
                "col": col,
                "value": float(values[j]),
                "z": 0.0 if stds[j] == 0.0
                else float((values[j] - means[j]) / stds[j]),
            }
            for j, col in enumerate(names)
        ]

        # Largest |z| first; the sort is stable, so ties keep column order.
        # Rounding happens after ranking so it cannot change the order.
        ranked = sorted(scored, key=lambda d: abs(d["z"]), reverse=True)[:top_k]
        for dim in ranked:
            dim["z"] = round(dim["z"], 3)

        out.append({"row_index": int(idx), "score": score, "dims": ranked})

    return out
|
||||
@@ -0,0 +1,93 @@
|
||||
"""Tests para summarize_outlier_dims."""
|
||||
|
||||
from isolation_forest_outliers import isolation_forest_outliers
|
||||
from summarize_outlier_dims import summarize_outlier_dims
|
||||
|
||||
|
||||
# Shared dataset: 3 columns, 13 rows. ORIGINAL row 6 has None in "a"
|
||||
# (so it is discarded), which means ORIGINAL row 10 -- holding an extreme
|
||||
# value in "c" -- lands at VALID index 9 (not 10). This checks None skipping.
|
||||
A = [0.1, 0.2, -0.1, 0.0, 0.3, -0.2, None, 0.15, -0.05, 0.25, 0.2, -0.3, 0.1]
|
||||
B = [1.0, 1.1, 0.9, 1.2, 0.8, 1.0, 1.3, 1.1, 0.95, 1.05, 0.9, 1.15, 1.0]
|
||||
C = [5.0, 5.2, 4.8, 5.1, 4.9, 5.0, 5.3, 4.95, 5.05, 4.9, 500.0, 5.1, 5.0]
|
||||
RAW = {"a": A, "b": B, "c": C}
|
||||
|
||||
# Original -> valid index map (original row 6 is skipped):
|
||||
# orig:  0 1 2 3 4 5 7 8 9 10 11 12
|
||||
# valid: 0 1 2 3 4 5 6 7 8  9 10 11
|
||||
# => the extreme value in "c" (original row 10) sits at valid index 9.
|
||||
EXTREME_VALID_INDEX = 9
|
||||
|
||||
|
||||
def test_row_index_skips_none_rows():
    """Valid index 9 must resolve to the row where c == 500.

    The index is passed in directly (no dependency on IsolationForest
    randomness), so the assertions only hold if the None in original row 6
    was skipped when the valid-row sequence was rebuilt.
    """
    requested = [{"row_index": EXTREME_VALID_INDEX, "score": -0.5}]
    summary = summarize_outlier_dims(RAW, requested, top_k=3)
    assert len(summary) == 1

    only = summary[0]
    assert only["row_index"] == EXTREME_VALID_INDEX
    assert only["score"] == -0.5

    # "c" is the dominant dimension: it carries the extreme value and a
    # large |z|.
    dominant = only["dims"][0]
    assert dominant["col"] == "c"
    assert dominant["value"] == 500.0
    assert abs(dominant["z"]) > 2.0

    # top_k is honoured: at most 3 dims are reported.
    assert len(only["dims"]) <= 3
|
||||
|
||||
|
||||
def test_extreme_row_flagged_via_isolation():
    """Integration: run the real detector, then explain its outliers."""
    detection = isolation_forest_outliers(RAW, contamination=0.1)
    assert "note" not in detection
    flagged = detection["outlier_rows"]
    assert flagged  # at least one outlier

    summary = summarize_outlier_dims(RAW, flagged, top_k=3)
    # Parallel to the detector output (every index is in range).
    assert len(summary) == len(flagged)

    by_index = {}
    for item in summary:
        by_index[item["row_index"]] = item

    # The extreme point must be among the detected outliers...
    assert EXTREME_VALID_INDEX in by_index
    # ...and its top dimension must be "c", where it deviates by many sigmas.
    flagged_extreme = by_index[EXTREME_VALID_INDEX]
    assert flagged_extreme["dims"][0]["col"] == "c"
    assert abs(flagged_extreme["dims"][0]["z"]) > 2.0
|
||||
|
||||
|
||||
def test_out_of_range_row_index_is_ignored():
    """Out-of-range indices are dropped instead of raising."""
    candidates = [
        {"row_index": 999, "score": -1.0},
        {"row_index": -1, "score": -1.0},
        {"row_index": EXTREME_VALID_INDEX, "score": -0.5},
    ]
    summary = summarize_outlier_dims(RAW, candidates, top_k=2)

    # Only the in-range index survives; the other two are discarded.
    assert len(summary) == 1
    survivor = summary[0]
    assert survivor["row_index"] == EXTREME_VALID_INDEX
    assert len(survivor["dims"]) <= 2
|
||||
|
||||
|
||||
def test_degrades_to_empty_on_invalid_inputs():
    """Unusable inputs produce an empty list rather than an exception."""
    # Empty raw_numeric together with empty outlier_rows.
    assert summarize_outlier_dims({}, [], 3) == []

    # raw_numeric is not a dict.
    assert summarize_outlier_dims("not a dict", [{"row_index": 0}], 3) == []

    # outlier_rows is not a list.
    assert summarize_outlier_dims(RAW, "not a list", 3) == []

    # No numeric columns (every value is a string) -> [].
    only_strings = {"s": ["x", "y", "z"]}
    one_outlier = [{"row_index": 0, "score": -1.0}]
    assert summarize_outlier_dims(only_strings, one_outlier, 3) == []

    # Malformed entries inside outlier_rows are ignored (no crash).
    malformed = ["nope", 42, {"no_row_index": 1}]
    assert summarize_outlier_dims(RAW, malformed, 3) == []
|
||||
@@ -3,7 +3,7 @@ name: summarize_table_duckdb
|
||||
kind: function
|
||||
lang: py
|
||||
domain: datascience
|
||||
version: "1.0.0"
|
||||
version: "1.1.0"
|
||||
purity: impure
|
||||
signature: "def summarize_table_duckdb(db_path: str, table: str, high_card_ratio: float = 0.9) -> dict"
|
||||
description: "Perfila una tabla DuckDB en una sola pasada SQL (SUMMARIZE, push-down sin traer filas a RAM) y devuelve el esqueleto de un TableProfile con el perfil base por columna. Corazon del grupo eda: base barata sobre la que otras funciones anaden lo estadistico fino (skew/kurtosis/histograma sobre muestra)."
|
||||
@@ -64,6 +64,7 @@ else:
|
||||
- **`distinct_count` exacto para tablas <=200k filas, aproximado+capado por encima**: `SUMMARIZE` usa HyperLogLog (`approx_unique`), que SOBREESTIMA y en tablas pequenas puede reportar mas distintos que filas (inflando `unique_pct` por encima de 1.0 y disparando flags `possible_id` falsos). Por eso, para `n_rows <= 200000` la funcion calcula `COUNT(DISTINCT)` EXACTO en una sola query combinada (barata) y usa ese valor. Para tablas mas grandes mantiene `approx_unique` pero lo CAPA a `n_rows` (`distinct_count = min(approx_unique, n_rows)`). En ambos casos `unique_pct = min(distinct_count / n_rows, 1.0)`, asi que `distinct_count` nunca supera las filas ni `unique_pct` pasa de 1.0. Los flags `possible_id` / `high_cardinality` derivan de ese `distinct_count` ya corregido (exacto y fiable por debajo de 200k filas; aproximado y conservador por encima).
|
||||
- **`SUMMARIZE` NO da skew, kurtosis ni histograma**, ni percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates ni quality_score. Esas claves quedan en `None`/`[]` a proposito: las rellena otra funcion del grupo `eda` sobre una muestra. El sub-dict `numeric` solo trae min, max, mean, std, p25, p50, p75.
|
||||
- **`SUMMARIZE.count` es el total de filas, no el no-nulo**: la funcion deriva el `count` no-nulo del ColumnProfile como `n_rows - null_count` (con `null_count` redondeado de `null_percentage`).
|
||||
- **`duplicate_rows`/`duplicate_pct` se pueblan push-down** (desde v1.1.0) con `count(*)` sobre `SELECT DISTINCT *` (sin traer filas a RAM): `duplicate_rows = n_rows - filas_distintas`, `duplicate_pct` en fraccion 0-1. Habilitan la dimension de unicidad de registro del score de dataset (`profile_table` paso 6). Si la tabla tiene tipos no comparables con `DISTINCT` (BLOB/LIST/MAP) la query degrada y ambas vuelven a `None` (renormaliza el score a solo `cell_quality`).
|
||||
- **min/max/avg/std/q25/q50/q75 vienen como strings** desde DuckDB; se convierten a float (None si la columna no es numerica).
|
||||
- **Requiere DuckDB 1.5.2** (columnas de `SUMMARIZE` validadas con esa version: column_name, column_type, min, max, approx_unique, avg, std, q25, q50, q75, count, null_percentage).
|
||||
- **El identificador de tabla se interpola** (no parametrizable en `SUMMARIZE`): por eso se valida contra `^[A-Za-z_][A-Za-z0-9_]*$` antes de citarlo. Un nombre invalido (p.ej. con `;` o espacios) devuelve `{status:'error'}` sin tocar la base.
|
||||
|
||||
@@ -196,6 +196,21 @@ def summarize_table_duckdb(
|
||||
sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
|
||||
)
|
||||
|
||||
# Unicidad de registro: filas duplicadas via COUNT de filas distintas
|
||||
# push-down (DISTINCT *), sin traer filas a RAM. Habilita la dimension
|
||||
# de uniqueness del score de dataset (1 - duplicate_pct). Degrada a None
|
||||
# si la tabla tiene tipos no comparables con DISTINCT (BLOB/LIST/MAP).
|
||||
duplicate_rows = None
|
||||
duplicate_pct = None
|
||||
if n_rows > 0:
|
||||
dup_res = duckdb_query_readonly(
|
||||
db_path, f"SELECT count(*) AS c FROM (SELECT DISTINCT * FROM {quoted})"
|
||||
)
|
||||
if dup_res["status"] == "ok" and dup_res["rows"]:
|
||||
distinct_rows = int(dup_res["rows"][0]["c"])
|
||||
duplicate_rows = max(0, n_rows - distinct_rows)
|
||||
duplicate_pct = duplicate_rows / n_rows # fraccion 0-1
|
||||
|
||||
profile = {
|
||||
"table": table,
|
||||
"source": "duckdb",
|
||||
@@ -203,8 +218,8 @@ def summarize_table_duckdb(
|
||||
"n_rows": n_rows,
|
||||
"n_cols": len(columns),
|
||||
"size_bytes": None,
|
||||
"duplicate_rows": None,
|
||||
"duplicate_pct": None,
|
||||
"duplicate_rows": duplicate_rows,
|
||||
"duplicate_pct": duplicate_pct,
|
||||
"constant_cols": constant_cols,
|
||||
"all_null_cols": all_null_cols,
|
||||
"null_cell_pct": null_cell_pct,
|
||||
|
||||
@@ -54,6 +54,30 @@ def test_shape_y_metadatos_tabla(db):
|
||||
assert profile["correlations"] is None
|
||||
|
||||
|
||||
def test_duplicate_pct_sin_duplicados(db):
    """Table whose rows are all distinct: duplicate_pct is 0, not None.

    Relies on the ``db`` fixture (defined outside this view) exposing a
    ``ventas`` table without repeated rows.
    """
    result = summarize_table_duckdb(db, "ventas")
    table_profile = result["profile"]
    assert table_profile["duplicate_rows"] == 0
    assert table_profile["duplicate_pct"] == 0.0
|
||||
|
||||
|
||||
def test_duplicate_pct_con_duplicados(tmp_path):
    """Repeated rows: duplicate_rows/duplicate_pct get populated push-down."""
    db_file = str(tmp_path / "dups.duckdb")
    con = duckdb.connect(db_file)
    con.execute("CREATE TABLE t (a INTEGER, b VARCHAR)")

    # 5 rows, 2 of them identical to earlier ones -> 2 duplicates out of
    # 5 = 0.4.
    insert_sql = (
        "INSERT INTO t VALUES "
        "(1,'x'), (2,'y'), (1,'x'), (3,'z'), (2,'y')"
    )
    con.execute(insert_sql)
    con.close()

    result = summarize_table_duckdb(db_file, "t")
    table_profile = result["profile"]
    assert table_profile["n_rows"] == 5
    assert table_profile["duplicate_rows"] == 2
    assert table_profile["duplicate_pct"] == 0.4
|
||||
|
||||
|
||||
def test_column_profile_shape(db):
|
||||
profile = summarize_table_duckdb(db, "ventas")["profile"]
|
||||
by_name = {c["name"]: c for c in profile["columns"]}
|
||||
|
||||
@@ -4,7 +4,7 @@ kind: pipeline
|
||||
lang: py
|
||||
domain: pipelines
|
||||
purity: impure
|
||||
version: "1.0.0"
|
||||
version: "1.1.0"
|
||||
signature: "def profile_table(db_path: str, table: str, backend: str = \"duckdb\", sample: int = 5000, run_models: bool = False, run_llm: bool = False, run_series: bool = False, emit_pdf: bool = False, emit_automatic: bool = False, report_dir: str = \"reports\", write_report: bool = True) -> dict"
|
||||
description: "Orquestador one-shot del grupo de capacidad eda: perfila UNA tabla (DuckDB o PostgreSQL) end-to-end componiendo las funciones del grupo (perfil base SQL + muestreo read-only + inferencia semantica + promocion de tipo + estadistica numerica/categorica + score de calidad + correlaciones con correccion FDR + re-expresion de Tukey + avisos exploratorios) y, opcional, modelos baratos (run_models), interpretacion LLM (run_llm) y analisis de serie temporal por columna (run_series: estacionariedad ADF+KPSS, ACF/PACF, STL, retornos). Emite el TableProfile completo mas (opcional) report markdown + JSON sidecar + PDF movil (emit_pdf). Es la composicion canonica para hazme un EDA de esta tabla."
|
||||
tags: [eda, duckdb, postgres, profiling, data-quality, pipeline, dataops, timeseries]
|
||||
@@ -114,3 +114,12 @@ para auditar la calidad de una tabla ya productiva. Reemplaza orquestar a mano
|
||||
Formatos exoticos pueden descartarse silenciosamente del calculo numerico.
|
||||
- `db_path` debe existir: DuckDB read-only NO crea la base. El muestreo usa el
|
||||
sandbox por defecto de `duckdb_query_readonly` (sin acceso a FS/red).
|
||||
- **Score de calidad (report 2046, desde v1.1.0).** Paso 5: cada columna recibe
|
||||
`quality_score` de `column_quality_score` con la formula 60/40
|
||||
(completeness/validity); al promocionar texto a numero/fecha se expone
|
||||
`col["validity_rate"]` (parse rate de la muestra) para alimentar la dimension
|
||||
validity. Paso 6: el score de dataset NO es la media simple — es
|
||||
`100 * (0.85*cell_quality + 0.15*row_uniqueness)`, donde
|
||||
`cell_quality = media(score_col/100)` y `row_uniqueness = 1 - duplicate_pct`.
|
||||
Si `duplicate_pct` es `None` (backend sin calcularlo) el score se renormaliza
|
||||
a solo `cell_quality`. Los outliers NO bajan el score (van a `observations`).
|
||||
|
||||
@@ -477,9 +477,18 @@ def profile_table(
|
||||
if vals and (len(ok) / len(vals)) >= _PROMOTE_MIN_PARSE:
|
||||
col["inferred_type"] = "numeric"
|
||||
inferred = "numeric"
|
||||
# Tasa de parseo real de la muestra: alimenta la
|
||||
# dimension validity de column_quality_score (fraccion
|
||||
# de valores conformes al tipo numerico promovido).
|
||||
col["validity_rate"] = len(ok) / len(vals)
|
||||
elif semantic in _DATETIME_SEMANTIC:
|
||||
col["inferred_type"] = "datetime"
|
||||
inferred = "datetime"
|
||||
# Tasa de parseo de la muestra a fecha (mismo papel que el
|
||||
# parse rate numerico) para la dimension validity.
|
||||
parsed_dt = [_to_ordinal_days(v) for v in vals]
|
||||
ok_dt = [d for d in parsed_dt if d is not None]
|
||||
col["validity_rate"] = (len(ok_dt) / len(vals)) if vals else None
|
||||
|
||||
# 4) Enriquecer segun el inferred_type final.
|
||||
if inferred == "numeric":
|
||||
@@ -506,11 +515,36 @@ def profile_table(
|
||||
# 5) Score de calidad por columna.
|
||||
col["quality_score"] = column_quality_score(col).get("score")
|
||||
|
||||
# 6) Score agregado de la tabla (media de columnas).
|
||||
# 6) Score agregado de la tabla (report 2046): NO media simple.
|
||||
# cell_quality = media de los scores de columna, en [0,1].
|
||||
# row_uniqueness = 1 - duplicate_pct (unicidad de registro).
|
||||
# score = 100 * (0.85*cell_quality + 0.15*row_uniqueness).
|
||||
# Renormaliza a solo cell_quality si duplicate_pct no se pudo calcular.
|
||||
scores = [
|
||||
c["quality_score"] for c in cols if c.get("quality_score") is not None
|
||||
]
|
||||
prof["quality_score"] = round(sum(scores) / len(scores), 1) if scores else None
|
||||
if scores:
|
||||
cell_quality = (sum(scores) / len(scores)) / 100.0
|
||||
dup_pct = prof.get("duplicate_pct")
|
||||
if dup_pct is not None:
|
||||
try:
|
||||
d = float(dup_pct)
|
||||
except (TypeError, ValueError):
|
||||
d = None
|
||||
else:
|
||||
d = None
|
||||
if d is not None:
|
||||
# Tolerar escala 0-100 por si algun backend la entrega asi.
|
||||
if d > 1.0:
|
||||
d = d / 100.0
|
||||
row_uniqueness = max(0.0, min(1.0, 1.0 - d))
|
||||
prof["quality_score"] = round(
|
||||
100.0 * (0.85 * cell_quality + 0.15 * row_uniqueness), 1
|
||||
)
|
||||
else:
|
||||
prof["quality_score"] = round(100.0 * cell_quality, 1)
|
||||
else:
|
||||
prof["quality_score"] = None
|
||||
|
||||
# 7) Candidatos a clave.
|
||||
key_candidates = []
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user