Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 048781df3f |
@@ -89,35 +89,6 @@ _DEF_MAX_CARD = 20
|
||||
_DEF_MAX_MEASURES = 4
|
||||
_DEF_TOP_N = 12
|
||||
|
||||
# Glossary terms this chapter explains. Both appear in the always-rendered intro,
|
||||
# so they are registered and marked clickable whenever a collector is in ctx —
|
||||
# the canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
|
||||
# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
|
||||
# block. Mapping key -> (label, definition).
|
||||
_TERM_DEFS = {
|
||||
"groupby": (
|
||||
"Agrupación (split-apply-combine)",
|
||||
"Operación de agrupación (group by): parte la tabla en grupos según los "
|
||||
"valores de una columna categórica, aplica un cálculo (conteo, media, "
|
||||
"mediana…) dentro de cada grupo y combina los resultados en una tabla "
|
||||
"resumen. Es el patrón split-apply-combine."),
|
||||
"pivot_table": (
|
||||
"Tabla dinámica (pivot)",
|
||||
"Tabla dinámica que cruza dos variables categóricas — una en las filas y "
|
||||
"otra en las columnas — y rellena cada celda con un agregado (media, "
|
||||
"suma…) de una medida numérica. Resume de un vistazo cómo interactúan las "
|
||||
"dos categóricas sobre esa medida."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
it), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the other chapters' defensive style).
|
||||
@@ -554,18 +525,13 @@ def _sections_live(profile: dict, ctx: dict, candidates: dict) -> list:
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entry point.
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _intro_blocks(gloss=None, mark_term: bool = False) -> list:
|
||||
if gloss is not None:
|
||||
for key, (label, definition) in _TERM_DEFS.items():
|
||||
gloss.add(key, label, definition)
|
||||
t_groupby = _term(mark_term, "groupby", "**por grupos** (split-apply-combine)")
|
||||
t_pivot = _term(mark_term, "pivot_table", "**tablas dinámicas** (pivot)")
|
||||
def _intro_blocks() -> list:
|
||||
text = (
|
||||
f"Este capítulo analiza la tabla {t_groupby}: "
|
||||
"Este capítulo analiza la tabla **por grupos** (split-apply-combine): "
|
||||
"elige las columnas categóricas más informativas — por su cardinalidad "
|
||||
"y relevancia, no todas contra todas, para no inflar comparaciones "
|
||||
"espurias — y resume las variables numéricas dentro de cada grupo "
|
||||
f"(conteo, media, mediana, desviación). Las {t_pivot} "
|
||||
"(conteo, media, mediana, desviación). Las **tablas dinámicas** (pivot) "
|
||||
"cruzan dos categóricas sobre una medida, y los **gráficos de barras** "
|
||||
"(siempre desde cero) comparan los grupos de un vistazo."
|
||||
)
|
||||
@@ -590,21 +556,13 @@ def build_agregacion(profile: dict, ctx: dict):
|
||||
if not isinstance(profile, dict):
|
||||
return None
|
||||
|
||||
# Shared glossary collector: groupby + pivot_table live in the always-present
|
||||
# intro, so they are registered + marked there. Degrades silently (mark_term
|
||||
# False) when no collector is in ctx (standalone render).
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
|
||||
# Pre-computed results take precedence (offline / tests / forward-compat).
|
||||
pre = ctx.get("aggregations")
|
||||
if _is_dict(pre) and (pre.get("groupby") or pre.get("pivots")):
|
||||
sections = _sections_from_precomputed(pre)
|
||||
if not sections:
|
||||
return None
|
||||
blocks = (_intro_blocks(gloss, mark_term) + sections
|
||||
+ _insights_section(ctx))
|
||||
blocks = _intro_blocks() + sections + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -625,11 +583,10 @@ def build_agregacion(profile: dict, ctx: dict):
|
||||
"crudos. Pasa ctx['db_path'] + ctx['table'] (para el cálculo "
|
||||
"push-down en DuckDB) o ctx['aggregations'] ya precalculado. "
|
||||
f"Columnas categóricas candidatas: {keys or '—'}.")
|
||||
blocks = (_intro_blocks(gloss, mark_term) + [note]
|
||||
+ _insights_section(ctx))
|
||||
blocks = _intro_blocks() + [note] + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
blocks = _intro_blocks(gloss, mark_term) + sections + _insights_section(ctx)
|
||||
blocks = _intro_blocks() + sections + _insights_section(ctx)
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -254,25 +254,3 @@ def test_anti_corte_muchos_grupos_y_texto_largo():
|
||||
# First, middle and last words of the long paragraph all present.
|
||||
for i in (0, 60, 119):
|
||||
assert f"palabra{i}" in txt
|
||||
|
||||
|
||||
def test_glosario_engancha_groupby_y_pivot():
    """Improvement 4b: grouping (split-apply-combine) and the pivot table are
    registered in the shared collector and marked clickable in the body.

    Without a collector in ctx the chapter degrades and marks nothing.
    """
    from datascience.automatic_eda.model import GlossaryCollector

    collector = GlossaryCollector()
    ctx_with_glossary = dict(_ctx_precomputed())
    ctx_with_glossary["glossary"] = collector
    chapter = build_agregacion(_profile(), ctx_with_glossary)
    assert chapter is not None

    registered = {term["key"] for term in collector.terms()}
    assert {"groupby", "pivot_table"} <= registered

    marked_body = " ".join(
        block.text for block in chapter.blocks if block.kind == "markdown"
    )
    assert "[[term:groupby]]" in marked_body
    assert "[[term:pivot_table]]" in marked_body

    # No collector: degrades cleanly (no marker anywhere in the body).
    plain_chapter = build_agregacion(_profile(), _ctx_precomputed())
    plain_body = " ".join(
        block.text for block in plain_chapter.blocks if block.kind == "markdown"
    )
    assert "[[term:" not in plain_body
|
||||
|
||||
@@ -47,53 +47,6 @@ _MAX_MATRIX_LABELS = 16
|
||||
# How many pairs to show in each of the top-positive / top-negative tables.
|
||||
_TOP_N = 10
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance in the
|
||||
# body — the canonical two-step pattern (see ``cat_distr`` for the reference
|
||||
# implementation): ``glossary.add(key, label, definition)`` + the inline span
|
||||
# ``[[term:KEY]]texto visible[[/term]]`` in a Markdown block. Mapping key ->
|
||||
# (label, definition). ``fdr`` is only registered when the FDR summary is present.
|
||||
_TERM_DEFS = {
|
||||
"pearson": (
|
||||
"Pearson (coeficiente r)",
|
||||
"Coeficiente de correlación lineal de Pearson (r) entre dos variables "
|
||||
"numéricas. Va de −1 (relación lineal inversa perfecta) a +1 (directa "
|
||||
"perfecta); 0 indica ausencia de relación lineal. Sólo capta relaciones "
|
||||
"lineales, por eso lleva signo."),
|
||||
"spearman": (
|
||||
"Spearman (correlación de rangos)",
|
||||
"Correlación de rangos de Spearman: el coeficiente de Pearson calculado "
|
||||
"sobre los puestos (rangos) de los valores en vez de sus magnitudes. Mide "
|
||||
"relaciones monótonas (no necesariamente lineales), va de −1 a +1 y es "
|
||||
"robusta frente a valores atípicos."),
|
||||
"cramers_v": (
|
||||
"Cramér's V",
|
||||
"Medida de asociación entre dos variables categóricas, derivada del "
|
||||
"estadístico chi-cuadrado y normalizada al rango 0–1 (0 = independientes, "
|
||||
"1 = asociación total). No tiene signo: sólo mide la intensidad."),
|
||||
"correlation_ratio": (
|
||||
"Razón de correlación (η)",
|
||||
"Razón de correlación (eta) entre una variable numérica y una "
|
||||
"categórica: la fracción de la varianza de la numérica explicada por los "
|
||||
"grupos de la categórica. Va de 0 (los grupos no explican nada) a 1 (la "
|
||||
"explican toda); no tiene signo."),
|
||||
"fdr": (
|
||||
"Comparaciones múltiples (FDR)",
|
||||
"Al evaluar muchos pares a la vez, algunos parecen significativos por "
|
||||
"puro azar. La corrección por tasa de falsos descubrimientos (FDR, "
|
||||
"Benjamini-Hochberg) ajusta los p-valores para controlar la proporción "
|
||||
"esperada de falsos positivos entre los pares declarados significativos."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
the marker), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _is_num(v) -> bool:
|
||||
"""True for a real, finite int/float (not bool, not NaN/inf)."""
|
||||
@@ -292,7 +245,7 @@ def _methods_block(corr: dict):
|
||||
return model.KVTable(rows=rows, title="Métodos de asociación")
|
||||
|
||||
|
||||
def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
|
||||
def _fdr_text(corr: dict) -> str | None:
|
||||
"""One-line summary of the multiple-testing (FDR) correction, or None."""
|
||||
mt = corr.get("multiple_testing")
|
||||
if not isinstance(mt, dict) or not mt:
|
||||
@@ -301,8 +254,7 @@ def _fdr_text(corr: dict, mark_term: bool = False) -> str | None:
|
||||
alpha = mt.get("alpha")
|
||||
n_tests = mt.get("n_tests")
|
||||
n_rej = mt.get("n_rejected")
|
||||
multi = _term(mark_term, "fdr", "comparaciones múltiples")
|
||||
parts = [f"Corrección por {multi} ({method}"]
|
||||
parts = [f"Corrección por comparaciones múltiples ({method}"]
|
||||
if _is_num(alpha):
|
||||
parts[0] += f", α={float(alpha):g}"
|
||||
parts[0] += ")."
|
||||
@@ -337,31 +289,13 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
|
||||
blocks: list = []
|
||||
|
||||
# Register the always-present method terms in the shared glossary and mark
|
||||
# their first appearance clickable (the FDR term is registered lazily below,
|
||||
# only when the FDR summary is actually emitted). Degrades silently when no
|
||||
# collector is in ctx (standalone render) — mark_term stays False.
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
if gloss is not None:
|
||||
for key in ("pearson", "spearman", "cramers_v", "correlation_ratio"):
|
||||
label, definition = _TERM_DEFS[key]
|
||||
gloss.add(key, label, definition)
|
||||
|
||||
# Intro: what this chapter shows and how to read the sign. Build the marked
|
||||
# method names as locals first (avoids backslash-in-f-string for "Cramér's V").
|
||||
t_pearson = _term(mark_term, "pearson", "Pearson")
|
||||
t_spearman = _term(mark_term, "spearman", "Spearman")
|
||||
t_cramers = _term(mark_term, "cramers_v", "Cramér's V")
|
||||
t_corr_ratio = _term(mark_term, "correlation_ratio", "razón de correlación")
|
||||
# Intro: what this chapter shows and how to read the sign.
|
||||
blocks.append(model.Markdown(text=(
|
||||
"Asociación entre columnas. Cada par se evalúa con la métrica adecuada a "
|
||||
f"sus tipos ({t_pearson}/{t_spearman} entre numéricas — con **signo**; "
|
||||
f"{t_cramers} entre categóricas; {t_corr_ratio} num-categórica; "
|
||||
"información mutua como medida común no lineal). Sólo las correlaciones "
|
||||
"**num-num** tienen dirección: por eso los pares **negativos** son siempre "
|
||||
"num-num.")))
|
||||
"sus tipos (Pearson/Spearman entre numéricas — con **signo**; Cramér's V "
|
||||
"entre categóricas; razón de correlación num-categórica; información mutua "
|
||||
"como medida común no lineal). Sólo las correlaciones **num-num** tienen "
|
||||
"dirección: por eso los pares **negativos** son siempre num-num.")))
|
||||
|
||||
# 1) Association matrix (heatmap).
|
||||
labels, trimmed = _ordered_labels(pairs)
|
||||
@@ -403,13 +337,9 @@ def build_correlacion(profile: dict, ctx: dict):
|
||||
"no estacionarias y pueden ser espurias (Granger–Newbold). Compáralas "
|
||||
"sobre los retornos/diferencias antes de interpretarlas.")))
|
||||
|
||||
# 4) FDR summary + methods legend. Register the FDR term only when its
|
||||
# summary is emitted, so the glossary never lists an unreferenced entry.
|
||||
fdr_text = _fdr_text(corr, mark_term=mark_term)
|
||||
# 4) FDR summary + methods legend.
|
||||
fdr_text = _fdr_text(corr)
|
||||
if fdr_text:
|
||||
if gloss is not None:
|
||||
label, definition = _TERM_DEFS["fdr"]
|
||||
gloss.add("fdr", label, definition)
|
||||
blocks.append(model.Markdown(text=fdr_text))
|
||||
methods = _methods_block(corr)
|
||||
if methods is not None:
|
||||
|
||||
@@ -173,25 +173,3 @@ def test_anticorte_matriz_ancha_y_etiquetas_largas_no_se_cortan():
|
||||
assert rx["path"] == pptx and os.path.exists(pptx) and rx["n_slides"] >= 1
|
||||
# A short, unbreakable fragment of the long label survives the wrap.
|
||||
assert "azufre" in _pdf_text(pdf)
|
||||
|
||||
|
||||
def test_glosario_engancha_metodos_y_fdr():
    """Improvement 4b: the correlation methods (Pearson, Spearman, Cramér's V,
    correlation ratio) and the multiple-comparison (FDR) correction are
    registered in the shared collector and marked clickable in the body.

    Without a collector in ctx the chapter degrades and marks nothing.
    """
    from datascience.automatic_eda.model import GlossaryCollector

    expected = ("pearson", "spearman", "cramers_v", "correlation_ratio", "fdr")

    collector = GlossaryCollector()
    chapter = build_correlacion(_profile(), {"glossary": collector})
    assert chapter is not None

    registered = {term["key"] for term in collector.terms()}
    assert set(expected) <= registered

    marked_body = " ".join(
        block.text for block in chapter.blocks if block.kind == "markdown"
    )
    for k in expected:
        assert f"[[term:{k}]]" in marked_body, k

    # No collector: degrades cleanly (no marker anywhere in the body).
    plain_chapter = build_correlacion(_profile(), {})
    plain_body = " ".join(
        block.text for block in plain_chapter.blocks if block.kind == "markdown"
    )
    assert "[[term:" not in plain_body
|
||||
|
||||
@@ -55,62 +55,6 @@ _CLUSTER_COLORS = [
|
||||
"#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac",
|
||||
]
|
||||
|
||||
# Glossary terms this chapter explains. Each is registered in the shared
|
||||
# collector (ctx['glossary']) and marked clickable on its first appearance — the
|
||||
# canonical two-step pattern (see ``cat_distr``): ``glossary.add(key, label,
|
||||
# definition)`` + the inline span ``[[term:KEY]]texto[[/term]]`` in a Markdown
|
||||
# block. A term is registered only when its section is actually rendered, so the
|
||||
# glossary never lists an entry no in-text appearance points to.
|
||||
_TERM_DEFS = {
|
||||
"zscore": (
|
||||
"Estandarización z-score",
|
||||
"Transformación que lleva cada columna numérica a media 0 y desviación "
|
||||
"típica 1: a cada valor le resta la media de su columna y lo divide por "
|
||||
"la desviación típica. Así variables con escalas muy distintas (euros "
|
||||
"frente a un ratio 0–1) pesan por igual en las distancias y la varianza."),
|
||||
"pca": (
|
||||
"PCA (componentes principales)",
|
||||
"El análisis de componentes principales resume muchas variables "
|
||||
"numéricas correlacionadas en pocos ejes nuevos (componentes), "
|
||||
"ortogonales entre sí y ordenados por la cantidad de varianza que "
|
||||
"capturan. Permite ver la estructura de los datos en 2D y saber cuántas "
|
||||
"dimensiones bastan para explicarlos."),
|
||||
"kmeans": (
|
||||
"KMeans (segmentación)",
|
||||
"Algoritmo de agrupamiento no supervisado que reparte las filas en k "
|
||||
"segmentos: asigna cada fila al centro (centroide) más cercano y recoloca "
|
||||
"los centroides de forma iterativa hasta minimizar la distancia interna "
|
||||
"de cada grupo. Aquí k se elige automáticamente."),
|
||||
"silhouette": (
|
||||
"Coeficiente de silueta (silhouette)",
|
||||
"Métrica de calidad de un agrupamiento, en el rango −1 a 1: para cada "
|
||||
"fila compara cómo de cerca está de su propio segmento frente al segmento "
|
||||
"vecino más próximo. Cuanto más alto el promedio, más compactos y "
|
||||
"separados están los segmentos."),
|
||||
"isolation_forest": (
|
||||
"Isolation Forest (anomalías)",
|
||||
"Algoritmo de detección de anomalías multivariante: construye árboles que "
|
||||
"parten el espacio con cortes aleatorios y mide cuántos cortes hacen "
|
||||
"falta para aislar cada fila. Las filas raras se aíslan con muy pocos "
|
||||
"cortes y se marcan como outliers según un umbral de contaminación."),
|
||||
}
|
||||
|
||||
|
||||
def _term(mark: bool, key: str, text: str) -> str:
|
||||
"""Wrap ``text`` as a clickable glossary span when ``mark`` is True.
|
||||
|
||||
The visible text is identical with or without the marker (the renderers strip
|
||||
it), so wrapping never changes line layout — it only adds the link.
|
||||
"""
|
||||
return f"[[term:{key}]]{text}[[/term]]" if mark else text
|
||||
|
||||
|
||||
def _register(gloss, key: str) -> None:
|
||||
"""Register term ``key`` in the collector (idempotent); no-op if gloss None."""
|
||||
if gloss is not None:
|
||||
label, definition = _TERM_DEFS[key]
|
||||
gloss.add(key, label, definition)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Formatting helpers (mirror the overview chapter's defensive style).
|
||||
@@ -308,37 +252,34 @@ def _make_cluster_scatter(projection: dict):
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Section builders. Each returns a list of blocks (possibly empty).
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _normalization_intro(gloss=None, mark_term: bool = False) -> list:
|
||||
_register(gloss, "zscore")
|
||||
zscore = _term(mark_term, "zscore", "**estandarizan con z-score**")
|
||||
def _normalization_intro() -> list:
|
||||
text = (
|
||||
"Estos modelos son **no supervisados**: buscan estructura latente sin "
|
||||
"una variable objetivo. Antes de aplicarlos, todas las columnas "
|
||||
f"numéricas se {zscore} (cada valor menos la media, dividido por la "
|
||||
"desviación típica). Sin esta normalización, una variable con escala "
|
||||
"grande (p.ej. ingresos en euros) dominaría las distancias y la varianza "
|
||||
"frente a otra de escala pequeña (p.ej. un ratio entre 0 y 1), sesgando "
|
||||
"tanto el PCA como el KMeans. Tras la estandarización todas las variables "
|
||||
"pesan por igual."
|
||||
"numéricas se **estandarizan con z-score** (cada valor menos la media, "
|
||||
"dividido por la desviación típica). Sin esta normalización, una "
|
||||
"variable con escala grande (p.ej. ingresos en euros) dominaría las "
|
||||
"distancias y la varianza frente a otra de escala pequeña (p.ej. un "
|
||||
"ratio entre 0 y 1), sesgando tanto el PCA como el KMeans. Tras la "
|
||||
"estandarización todas las variables pesan por igual."
|
||||
)
|
||||
return [model.Heading(text="Modelos no supervisados", level=1),
|
||||
model.Markdown(text=text)]
|
||||
|
||||
|
||||
def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
def _pca_section(pca: dict) -> list:
|
||||
if not _is_dict(pca) or not pca.get("explained_variance_ratio"):
|
||||
return []
|
||||
_register(gloss, "pca")
|
||||
blocks = [model.Heading(text="PCA — varianza explicada", level=2)]
|
||||
|
||||
n_used = pca.get("n_rows_used")
|
||||
n_feat = pca.get("n_features")
|
||||
intro = (
|
||||
f"El {_term(mark_term, 'pca', 'PCA')} resume {_fmt_num(n_feat)} variables "
|
||||
"numéricas en componentes ortogonales ordenados por la varianza que "
|
||||
f"capturan ({_fmt_num(n_used)} filas usadas tras eliminar nulos). El "
|
||||
"gráfico de sedimentación (scree) muestra cuánta varianza aporta cada "
|
||||
"componente y su acumulado: un codo marca cuántos componentes bastan."
|
||||
f"El PCA resume {_fmt_num(n_feat)} variables numéricas en componentes "
|
||||
f"ortogonales ordenados por la varianza que capturan "
|
||||
f"({_fmt_num(n_used)} filas usadas tras eliminar nulos). El gráfico de "
|
||||
"sedimentación (scree) muestra cuánta varianza aporta cada componente y "
|
||||
"su acumulado: un codo marca cuántos componentes bastan."
|
||||
)
|
||||
blocks.append(model.Markdown(text=intro))
|
||||
|
||||
@@ -384,14 +325,11 @@ def _pca_section(pca: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
return blocks
|
||||
|
||||
|
||||
def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
gloss=None, mark_term: bool = False) -> list:
|
||||
def _kmeans_section(kmeans: dict, projection: dict, titles) -> list:
|
||||
has_km = _is_dict(kmeans) and kmeans.get("best_k")
|
||||
has_proj = _is_dict(projection) and projection.get("points")
|
||||
if not has_km and not has_proj:
|
||||
return []
|
||||
_register(gloss, "kmeans")
|
||||
_register(gloss, "silhouette")
|
||||
|
||||
blocks = [model.Heading(text="Segmentación (KMeans)", level=2)]
|
||||
|
||||
@@ -399,11 +337,9 @@ def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
sil = (projection or {}).get("silhouette")
|
||||
if sil is None:
|
||||
sil = (kmeans or {}).get("silhouette")
|
||||
t_kmeans = _term(mark_term, "kmeans", "KMeans")
|
||||
t_sil = _term(mark_term, "silhouette", "*silhouette*")
|
||||
intro = (
|
||||
f"{t_kmeans} agrupa las filas en **{_fmt_num(best_k)} segmentos** "
|
||||
f"elegidos automáticamente maximizando el coeficiente de {t_sil} "
|
||||
f"KMeans agrupa las filas en **{_fmt_num(best_k)} segmentos** elegidos "
|
||||
"automáticamente maximizando el coeficiente de *silhouette* "
|
||||
f"(**{_fmt_num(sil)}**, rango −1 a 1: cuanto más alto, segmentos más "
|
||||
"compactos y separados). Los segmentos se proyectan sobre el plano de "
|
||||
"los dos primeros componentes principales para visualizarlos."
|
||||
@@ -458,18 +394,16 @@ def _kmeans_section(kmeans: dict, projection: dict, titles,
|
||||
return blocks
|
||||
|
||||
|
||||
def _outliers_section(outliers: dict, gloss=None, mark_term: bool = False) -> list:
|
||||
def _outliers_section(outliers: dict) -> list:
|
||||
if not _is_dict(outliers) or outliers.get("n_outliers") is None:
|
||||
return []
|
||||
if outliers.get("note") and not outliers.get("n_rows_used"):
|
||||
# insufficient data — nothing meaningful to show.
|
||||
return []
|
||||
_register(gloss, "isolation_forest")
|
||||
blocks = [model.Heading(text="Detección de anomalías (Isolation Forest)",
|
||||
level=2)]
|
||||
isof = _term(mark_term, "isolation_forest", "**Isolation Forest**")
|
||||
explain = (
|
||||
f"{isof} detecta filas anómalas de forma *multivariante*: "
|
||||
"**Isolation Forest** detecta filas anómalas de forma *multivariante*: "
|
||||
"construye árboles que parten el espacio con cortes aleatorios y mide "
|
||||
"cuántos cortes hacen falta para aislar cada fila. Las filas raras "
|
||||
"(combinaciones de valores poco frecuentes considerando **todas las "
|
||||
@@ -550,21 +484,15 @@ def build_modelos(profile: dict, ctx: dict):
|
||||
(kmeans and kmeans.get("best_k")) or (projection and projection.get("points"))
|
||||
) else None
|
||||
|
||||
# Shared glossary collector: terms are registered + marked clickable inside
|
||||
# each section, only when that section actually renders (no orphan entries).
|
||||
glossary = ctx.get("glossary")
|
||||
gloss = glossary if isinstance(glossary, model.GlossaryCollector) else None
|
||||
mark_term = gloss is not None
|
||||
|
||||
sections = []
|
||||
sections += _pca_section(pca, gloss, mark_term) if pca else []
|
||||
sections += _kmeans_section(kmeans, projection, titles, gloss, mark_term)
|
||||
sections += _outliers_section(outliers, gloss, mark_term) if outliers else []
|
||||
sections += _pca_section(pca) if pca else []
|
||||
sections += _kmeans_section(kmeans, projection, titles)
|
||||
sections += _outliers_section(outliers) if outliers else []
|
||||
sections += _normality_section(normality) if normality else []
|
||||
|
||||
if not sections:
|
||||
return None # models block present but nothing renderable.
|
||||
|
||||
blocks = _normalization_intro(gloss, mark_term) + sections
|
||||
blocks = _normalization_intro() + sections
|
||||
return model.Chapter(id=CHAPTER_ID, title=CHAPTER_TITLE,
|
||||
version=CHAPTER_VERSION, blocks=blocks)
|
||||
|
||||
@@ -257,26 +257,3 @@ def test_anticortes_tabla_normalidad_larga_no_corta():
|
||||
# Every column name survives (wrapped/split, never truncated).
|
||||
for i in (0, 19, 39):
|
||||
assert f"col_{i}" in txt
|
||||
|
||||
|
||||
def test_glosario_engancha_terminos_modelos():
    """Improvement 4b: PCA, KMeans, silhouette, Isolation Forest and z-score
    standardization are registered in the shared collector and marked
    clickable in the body.

    Without a collector in ctx the chapter degrades and marks nothing.
    """
    from datascience.automatic_eda.model import GlossaryCollector

    expected = ("zscore", "pca", "kmeans", "silhouette", "isolation_forest")

    collector = GlossaryCollector()
    ctx_with_glossary = dict(_ctx_full())
    ctx_with_glossary["glossary"] = collector
    chapter = build_modelos(_profile(), ctx_with_glossary)
    assert chapter is not None

    registered = {term["key"] for term in collector.terms()}
    assert set(expected) <= registered

    marked_body = " ".join(
        block.text for block in chapter.blocks if block.kind == "markdown"
    )
    for k in expected:
        assert f"[[term:{k}]]" in marked_body, k

    # No collector: degrades cleanly (no marker anywhere in the body).
    plain_chapter = build_modelos(_profile(), _ctx_full())
    plain_body = " ".join(
        block.text for block in plain_chapter.blocks if block.kind == "markdown"
    )
    assert "[[term:" not in plain_body
|
||||
|
||||
@@ -2,8 +2,17 @@
|
||||
|
||||
Builds the document cover from a TableProfile plus an optional ``ctx`` of
|
||||
presentation metadata. Reads everything defensively (``.get``) and degrades
|
||||
honestly: a field that is neither in the profile nor in ``ctx`` is shown as a
|
||||
placeholder rather than invented, leaving a hook for the LLM layer to fill it.
|
||||
honestly.
|
||||
|
||||
The dataset size (N rows x M columns) is always shown big, as a heading right
|
||||
under the dataset name (kept together in a ``Group``), not buried in the
|
||||
metadata table. The Description and Granularity are resolved through a cascade
|
||||
so they are never empty: an explicit ``ctx`` value wins; otherwise the LLM block
|
||||
(``profile['llm']`` from ``eda_llm_insights``) provides ``summary`` /
|
||||
``row_meaning``; otherwise a short summary is derived from the profile itself
|
||||
(shape, column-type mix, quality score) and a "Cada fila es…" sentence from the
|
||||
key-candidate columns or the table shape. Nothing is invented: the derived
|
||||
fallbacks state that they come from the profile.
|
||||
|
||||
Contract for chapter authors (see ``docs/capabilities/automatic_eda.md``):
|
||||
build_<id>(profile: dict, ctx: dict) -> Chapter | None
|
||||
@@ -17,10 +26,15 @@ from datetime import datetime, timezone
|
||||
|
||||
from .. import model
|
||||
|
||||
CHAPTER_VERSION = "1.1.0"
|
||||
CHAPTER_VERSION = "1.2.0"
|
||||
CHAPTER_ID = "portada"
|
||||
CHAPTER_TITLE = "Portada"
|
||||
|
||||
# Key under which eda_llm_insights stores its interpretive block in the profile.
|
||||
# The cover reads ``summary`` (what the table is) and ``row_meaning`` (what one
|
||||
# row represents) from it when the LLM layer ran (``run_llm``).
|
||||
_LLM_KEY = "llm"
|
||||
|
||||
# Default human description of what the table quality score measures. Chapters
|
||||
# can override it via ctx["quality_criteria"].
|
||||
_DEFAULT_QUALITY_CRITERIA = (
|
||||
@@ -142,6 +156,88 @@ def _fmt_date_eu(value) -> str:
|
||||
return s
|
||||
|
||||
|
||||
def _llm_block(profile: dict, ctx: dict) -> dict:
    """Fetch the interpretive LLM block, preferring the profile over ``ctx``.

    The value stored under ``_LLM_KEY`` is looked up first in ``profile`` and,
    if that is not a dict, in ``ctx``. The first dict found is returned as-is;
    when neither lookup yields a dict the result is an empty dict.
    """
    for source in (profile, ctx):
        candidate = source.get(_LLM_KEY)
        if isinstance(candidate, dict):
            return candidate
    return {}
|
||||
|
||||
|
||||
def _count_column_types(profile: dict, ctx: dict):
|
||||
"""Best-effort (n_numeric, n_categorical) for the dataset.
|
||||
|
||||
Prefers the aggregated ``ctx['document_summary']`` (computed by the engine
|
||||
over the whole body); falls back to counting the profile columns directly so
|
||||
the cover still has the numbers when no summary was passed.
|
||||
"""
|
||||
summary = ctx.get("document_summary")
|
||||
if isinstance(summary, dict):
|
||||
n_num = summary.get("n_numeric")
|
||||
n_cat = summary.get("n_categorical")
|
||||
if n_num is not None or n_cat is not None:
|
||||
return n_num, n_cat
|
||||
cols = profile.get("columns") or []
|
||||
n_num = sum(1 for c in cols if isinstance(c, dict)
|
||||
and c.get("inferred_type") == "numeric")
|
||||
n_cat = sum(1 for c in cols if isinstance(c, dict)
|
||||
and isinstance(c.get("categorical"), dict)
|
||||
and c.get("categorical", {}).get("top")
|
||||
and c.get("inferred_type") != "numeric")
|
||||
return n_num, n_cat
|
||||
|
||||
|
||||
def _derive_description(profile: dict, ctx: dict) -> str:
    """Compose a short dataset description from the profile alone.

    The text states the row and column counts, appends the numeric and
    categorical column counts when they are truthy, adds the quality score
    when ``profile['quality_score']`` is not None, and always ends with a
    sentence saying the summary was derived from the profile.
    """
    numeric_count, categorical_count = _count_column_types(profile, ctx)
    rows_txt = _fmt_int(profile.get("n_rows"))
    cols_txt = _fmt_int(profile.get("n_cols"))
    sentence = f"Conjunto de datos con {rows_txt} filas y {cols_txt} columnas"

    # Only mention a column family when its count is truthy.
    families = ((numeric_count, "numéricas"), (categorical_count, "categóricas"))
    mix = [f"{_fmt_int(count)} {label}" for count, label in families if count]
    if mix:
        sentence += " (" + ", ".join(mix) + ")"

    sentences = [sentence + "."]
    quality = profile.get("quality_score")
    if quality is not None:
        sentences.append(f"Calidad media estimada: {quality}/100.")
    sentences.append(
        "Resumen derivado del perfil; active la interpretación LLM (`run_llm`) "
        "para una descripción de negocio más rica.")
    return " ".join(sentences)
|
||||
|
||||
|
||||
def _derive_granularity(profile: dict, dataset_name: str) -> str:
|
||||
"""A ``Cada fila es…`` granularity sentence from the profile.
|
||||
|
||||
Prefers the key-candidate columns (a row is identified by them); when no key
|
||||
is detected, falls back to the table shape so the line is always meaningful
|
||||
and starts with ``Cada fila es`` as the user requested."""
|
||||
keys = profile.get("key_candidates") or []
|
||||
if keys:
|
||||
shown = ", ".join(str(k) for k in keys[:3])
|
||||
more = "" if len(keys) <= 3 else f" (y {len(keys) - 3} más)"
|
||||
return (f"Cada fila es un registro identificado por {shown}{more}, "
|
||||
"candidata(s) a clave por ser únicas y sin nulos.")
|
||||
n_rows = profile.get("n_rows")
|
||||
tail = f" El dataset tiene {_fmt_int(n_rows)} filas en total." if n_rows else ""
|
||||
return (f"Cada fila es un registro de «{dataset_name}». No se detectó una "
|
||||
"columna identificadora única, así que la granularidad se infiere "
|
||||
"de la forma de la tabla." + tail)
|
||||
|
||||
|
||||
def build_portada(profile: dict, ctx: dict):
|
||||
"""Build the cover Chapter, or None if there is truly nothing to show."""
|
||||
profile = profile or {}
|
||||
@@ -166,30 +262,38 @@ def build_portada(profile: dict, ctx: dict):
|
||||
quality_criteria = ctx.get("quality_criteria") or _DEFAULT_QUALITY_CRITERIA
|
||||
quality_value = "—" if score is None else f"{score} / 100"
|
||||
|
||||
# Granularity: ctx wins; else derive from key candidates; else be honest.
|
||||
llm = _llm_block(profile, ctx)
|
||||
|
||||
# Granularity: explicit ctx wins; then the LLM "row_meaning"; then the key
|
||||
# candidates; finally a shape-based fallback. Always a real "Cada fila es…".
|
||||
granularity = ctx.get("granularity")
|
||||
if not granularity:
|
||||
keys = profile.get("key_candidates") or []
|
||||
if keys:
|
||||
granularity = ("Cada fila parece identificada por "
|
||||
+ ", ".join(str(k) for k in keys[:3]) + ".")
|
||||
else:
|
||||
granularity = ("Cada fila es… (granularidad no determinada — "
|
||||
"pendiente de la capa de cálculo/LLM).")
|
||||
granularity = (llm.get("row_meaning") or "").strip() or None
|
||||
if not granularity:
|
||||
granularity = _derive_granularity(profile, str(dataset_name))
|
||||
|
||||
# Description: explicit ctx wins; then the LLM "summary"; finally a short
|
||||
# profile-derived summary. Never the old empty placeholder.
|
||||
description = ctx.get("description")
|
||||
if not description:
|
||||
description = ("Descripción no provista — pendiente de la capa LLM "
|
||||
"(`run_llm`) o de `ctx['description']`.")
|
||||
description = (llm.get("summary") or "").strip() or None
|
||||
if not description:
|
||||
description = _derive_description(profile, ctx)
|
||||
|
||||
blocks = [
|
||||
# Title + dataset size shown together and BIG (Heading) at the top, kept on
|
||||
# the same page (Group). The size is no longer buried in the metadata table.
|
||||
cover = [
|
||||
model.Heading(text=str(dataset_name), level=1),
|
||||
model.Markdown(text="**Automatic-EDA** · informe exploratorio automático"),
|
||||
model.Heading(text=shape, level=2),
|
||||
]
|
||||
|
||||
blocks = [
|
||||
model.Group(blocks=cover),
|
||||
model.KVTable(rows=[
|
||||
("Fuente", source_origin),
|
||||
("Almacenamiento", storage),
|
||||
("Generado", when),
|
||||
("Tamaño", shape),
|
||||
("Calidad", quality_value),
|
||||
("Criterios de calidad", quality_criteria),
|
||||
]),
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
"""Tests for the PORTADA (cover) chapter — DoD: golden + edges + render.
|
||||
|
||||
Self-contained: builds synthetic TableProfiles (no DuckDB) so the suite is fast
|
||||
and deterministic. Verifies the Fase 4b improvements:
|
||||
|
||||
1. The dataset size (N rows x M columns) is always shown BIG — as a level-2
|
||||
heading kept together with the dataset name in a ``Group`` — and is no longer
|
||||
a row of the metadata table.
|
||||
2. Description and Granularity are resolved through a real cascade and are never
|
||||
the old empty placeholders: an explicit ``ctx`` value wins; otherwise the LLM
|
||||
block (``profile['llm']``) provides ``summary`` / ``row_meaning``; otherwise a
|
||||
short summary is derived from the profile and a "Cada fila es…" sentence from
|
||||
the key-candidate columns or the table shape.
|
||||
3. The chapter degrades without raising on empty/None input.
|
||||
4. It renders inside the full document to both PDF and PPTX showing that content.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
|
||||
from datascience.automatic_eda.model import Group, Heading, KVTable, Markdown
|
||||
from datascience.automatic_eda.chapters.portada import (
|
||||
CHAPTER_ID, CHAPTER_VERSION, build_portada,
|
||||
)
|
||||
from datascience.render_automatic_eda_pdf import render_automatic_eda_pdf
|
||||
from datascience.render_automatic_eda_pptx import render_automatic_eda_pptx
|
||||
|
||||
|
||||
def _profile(with_llm: bool = True, with_keys: bool = True) -> dict:
|
||||
prof = {
|
||||
"table": "titanic",
|
||||
"source": "/data/titanic.csv",
|
||||
"profiled_at": "2026-06-30T10:00:00+00:00",
|
||||
"n_rows": 891,
|
||||
"n_cols": 12,
|
||||
"quality_score": 78.0,
|
||||
"columns": [
|
||||
{"name": "PassengerId", "inferred_type": "numeric",
|
||||
"null_pct": 0.0, "numeric": {"mean": 446.0, "min": 1.0,
|
||||
"max": 891.0, "std": 257.0}},
|
||||
{"name": "Survived", "inferred_type": "numeric",
|
||||
"null_pct": 0.0, "numeric": {"mean": 0.38, "min": 0.0,
|
||||
"max": 1.0, "std": 0.49}},
|
||||
{"name": "Sex", "inferred_type": "categorical", "null_pct": 0.0,
|
||||
"categorical": {"top": [{"value": "male", "count": 577, "pct": 0.65},
|
||||
{"value": "female", "count": 314,
|
||||
"pct": 0.35}],
|
||||
"mode": "male", "n_distinct": 2, "entropy": 0.93}},
|
||||
],
|
||||
}
|
||||
if with_keys:
|
||||
prof["key_candidates"] = ["PassengerId"]
|
||||
if with_llm:
|
||||
prof["llm"] = {
|
||||
"summary": "Pasajeros del Titanic con su supervivencia y datos de viaje.",
|
||||
"row_meaning": "Cada fila es un pasajero del Titanic.",
|
||||
"dictionary": [], "pii": [], "cleaning": [], "analyses": [],
|
||||
}
|
||||
return prof
|
||||
|
||||
|
||||
def _pdf_text(path: str) -> str:
    """Return the text extracted from every page of the PDF at ``path``.

    Page texts are concatenated (a page with no extractable text contributes an
    empty string) and every whitespace run is collapsed to a single space, so
    assertions do not depend on the renderer's line breaks.
    """
    page_texts = [page.extract_text() or "" for page in PdfReader(path).pages]
    joined = "".join(page_texts)
    return re.sub(r"\s+", " ", joined)
|
||||
|
||||
|
||||
def _pptx_text(path: str) -> str:
    """Return all text found in the PPTX at ``path`` as one normalized string.

    Walks every shape of every slide, collecting the text of shapes that have a
    text frame and the text of every cell of shapes that hold a table (row by
    row, left to right). The pieces are joined with spaces and every whitespace
    run is collapsed to a single space.
    """
    parts = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                parts.append(shape.text_frame.text)
            if shape.has_table:
                # Iterate rows/cells directly instead of indexing by (r, c).
                parts.extend(cell.text
                             for row in shape.table.rows
                             for cell in row.cells)
    return re.sub(r"\s+", " ", " ".join(parts))
|
||||
|
||||
|
||||
def _markdown_after(blocks, heading_text):
|
||||
"""Return the Markdown block that follows a Heading whose text matches."""
|
||||
for i, b in enumerate(blocks):
|
||||
if isinstance(b, Heading) and heading_text.lower() in b.text.lower():
|
||||
for nb in blocks[i + 1:]:
|
||||
if isinstance(nb, Markdown):
|
||||
return nb
|
||||
return None
|
||||
|
||||
|
||||
def test_golden_tamano_grande_y_textos_llm():
    """Golden path: big size heading, no size row in the table, LLM texts used."""
    chapter = build_portada(_profile(), {})
    assert chapter is not None
    assert chapter.id == CHAPTER_ID
    assert chapter.version == CHAPTER_VERSION

    # 1) The first Group holds the title (level 1) and the size (level 2).
    cover_group = next(blk for blk in chapter.blocks if isinstance(blk, Group))
    cover_blocks = cover_group.blocks
    title = cover_blocks[0]
    assert isinstance(title, Heading)
    assert title.level == 1
    assert title.text == "titanic"
    size_heading = next(blk for blk in cover_blocks
                        if isinstance(blk, Heading) and blk.level == 2)
    for fragment in ("891", "12", "filas", "columnas"):
        assert fragment in size_heading.text

    # 2) The metadata table no longer carries the size row.
    table = next(blk for blk in chapter.blocks if isinstance(blk, KVTable))
    row_labels = [row[0] for row in table.rows]
    assert "Tamaño" not in row_labels
    assert "Fuente" in row_labels
    assert "Calidad" in row_labels

    # 3) Description and Granularity are taken from the LLM block.
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description is not None
    assert "Titanic" in description.text
    assert granularity is not None
    assert granularity.text.startswith("Cada fila es")
    assert "pasajero" in granularity.text.lower()
|
||||
|
||||
|
||||
def test_fallback_sin_llm_usa_keys_y_perfil():
    """Without an LLM block, texts come from the profile and the key candidates."""
    chapter = build_portada(_profile(with_llm=False, with_keys=True), {})
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")

    # The description is the profile-derived summary, not the old placeholder.
    assert "pendiente" not in description.text.lower()
    assert "891" in description.text
    assert "columnas" in description.text
    assert any(word in description.text for word in ("numéricas", "categóricas"))

    # The granularity names the key candidate and has no ellipsis placeholder.
    assert granularity.text.startswith("Cada fila es")
    assert "PassengerId" in granularity.text
    assert "…" not in granularity.text
|
||||
|
||||
|
||||
def test_fallback_sin_llm_sin_keys_usa_forma():
    """Without LLM block or key candidates, granularity falls back to the shape."""
    bare_profile = _profile(with_llm=False, with_keys=False)
    chapter = build_portada(bare_profile, {})
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert granularity.text.startswith("Cada fila es")
    lowered = granularity.text.lower()
    assert "titanic" in lowered
    assert "pendiente" not in lowered
|
||||
|
||||
|
||||
def test_ctx_explicito_gana_sobre_llm():
    """Explicit ``ctx`` texts take precedence over the LLM block."""
    manual_description = "Descripción manual."
    manual_granularity = "Cada fila es una unidad manual."
    overrides = {"description": manual_description,
                 "granularity": manual_granularity}
    chapter = build_portada(_profile(), overrides)
    description = _markdown_after(chapter.blocks, "Descripción")
    granularity = _markdown_after(chapter.blocks, "Granularidad")
    assert description.text == manual_description
    assert granularity.text == manual_granularity
|
||||
|
||||
|
||||
def test_edge_perfil_vacio_no_lanza():
    """Empty or ``None`` inputs never raise; the cover still has size and texts."""
    degenerate_inputs = [({}, {}), (None, None)]
    for empty_profile, empty_ctx in degenerate_inputs:
        chapter = build_portada(empty_profile, empty_ctx)
        assert chapter is not None

        cover_group = next(blk for blk in chapter.blocks if isinstance(blk, Group))
        size_heading = next(blk for blk in cover_group.blocks
                            if isinstance(blk, Heading) and blk.level == 2)
        assert "filas" in size_heading.text
        assert "columnas" in size_heading.text

        description = _markdown_after(chapter.blocks, "Descripción")
        granularity = _markdown_after(chapter.blocks, "Granularidad")
        assert description.text
        assert "pendiente" not in description.text.lower()
        assert granularity.text.startswith("Cada fila es")
|
||||
|
||||
|
||||
def test_golden_render_pdf_muestra_portada():
    """The full PDF render contains the cover: name, size, summary, granularity."""
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = os.path.join(workdir, "eda.pdf")
        result = render_automatic_eda_pdf(_profile(), pdf_path, {"title": "EDA"})
        assert result["path"] == pdf_path
        assert os.path.exists(pdf_path)
        assert any(entry["id"] == CHAPTER_ID for entry in result["chapters"])

        text = _pdf_text(pdf_path)
        assert "titanic" in text.lower()
        for fragment in ("891", "filas", "columnas"):
            assert fragment in text
        assert "Titanic" in text        # LLM summary shown as the description.
        assert "Cada fila es" in text   # Granularity sentence.
|
||||
|
||||
|
||||
def test_golden_render_pptx_muestra_portada():
    """The full PPTX render contains the cover: name, size and granularity."""
    with tempfile.TemporaryDirectory() as workdir:
        pptx_path = os.path.join(workdir, "eda.pptx")
        result = render_automatic_eda_pptx(_profile(), pptx_path, {"title": "EDA"})
        assert result["path"] == pptx_path
        assert os.path.exists(pptx_path)
        assert any(entry["id"] == CHAPTER_ID for entry in result["chapters"])

        text = _pptx_text(pptx_path)
        assert "titanic" in text.lower()
        assert "891" in text
        assert "columnas" in text
        assert "Cada fila es" in text
|
||||
Reference in New Issue
Block a user