c4cff5ed5b
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers); render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo) - H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas + join graph) via render_eda_pdf_relational; clave report_pdf_path - aditivos y retrocompatibles (flags default False). 38 tests verdes Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
235 lines
7.5 KiB
Python
235 lines
7.5 KiB
Python
"""Tests para render_eda_markdown."""
|
||
|
||
import sys
|
||
import os
|
||
|
||
sys.path.insert(0, os.path.dirname(__file__))
|
||
|
||
from render_eda_markdown import render_eda_markdown
|
||
|
||
|
||
def _sample_profile(correlations=None, llm=None):
|
||
return {
|
||
"table": "sales",
|
||
"source": "data/sales.csv",
|
||
"profiled_at": "2026-06-20T10:00:00Z",
|
||
"n_rows": 1000,
|
||
"n_cols": 2,
|
||
"size_bytes": 40960,
|
||
"duplicate_rows": 3,
|
||
"duplicate_pct": 0.003,
|
||
"constant_cols": [],
|
||
"all_null_cols": [],
|
||
"null_cell_pct": 0.015,
|
||
"type_breakdown": {"numeric": 1, "categorical": 1},
|
||
"quality_score": 0.92,
|
||
"key_candidates": ["order_id"],
|
||
"correlations": correlations,
|
||
"llm": llm,
|
||
"models": None,
|
||
"columns": [
|
||
{
|
||
"name": "price",
|
||
"physical_type": "DOUBLE",
|
||
"inferred_type": "float",
|
||
"semantic_type": "currency",
|
||
"count": 1000,
|
||
"n_rows": 1000,
|
||
"null_count": 0,
|
||
"null_pct": 0.0,
|
||
"distinct_count": 857,
|
||
"unique_pct": 0.857,
|
||
"flags": [],
|
||
"quality_score": 0.95,
|
||
"numeric": {
|
||
"min": 1.0,
|
||
"max": 99.0,
|
||
"mean": 42.5,
|
||
"median": 40.0,
|
||
"std": 12.3,
|
||
"p25": 30.0,
|
||
"p75": 55.0,
|
||
"p95": 80.0,
|
||
"p99": 95.0,
|
||
"skew": 0.4,
|
||
"kurtosis": 2.1,
|
||
# outlier_pct ya viene en escala 0-100 desde describe_numeric
|
||
# (100 * n_outliers / n), NO en fracción 0-1.
|
||
"outlier_pct": 3.5,
|
||
"distribution_type": "right-skewed",
|
||
"histogram": [
|
||
{"lo": 0, "hi": 25, "count": 100},
|
||
{"lo": 25, "hi": 50, "count": 500},
|
||
{"lo": 50, "hi": 75, "count": 300},
|
||
{"lo": 75, "hi": 100, "count": 50},
|
||
],
|
||
},
|
||
"categorical": None,
|
||
"datetime": None,
|
||
},
|
||
{
|
||
"name": "region",
|
||
"physical_type": "VARCHAR",
|
||
"inferred_type": "string",
|
||
"semantic_type": "category",
|
||
"count": 1000,
|
||
"n_rows": 1000,
|
||
"null_count": 10,
|
||
"null_pct": 0.01,
|
||
"distinct_count": 3,
|
||
"unique_pct": 0.003,
|
||
"flags": ["low_cardinality"],
|
||
"quality_score": 0.80,
|
||
"numeric": None,
|
||
"categorical": {
|
||
"top": [
|
||
{"value": "north", "count": 500, "pct": 0.5},
|
||
{"value": "south", "count": 300, "pct": 0.3},
|
||
{"value": "east", "count": 200, "pct": 0.2},
|
||
],
|
||
"mode": "north",
|
||
"mode_pct": 0.5,
|
||
"n_distinct": 3,
|
||
"entropy": 1.48,
|
||
},
|
||
"datetime": None,
|
||
},
|
||
],
|
||
}
|
||
|
||
|
||
def test_contains_title_and_sections():
    """The rendered markdown carries the title and each main section heading."""
    rendered = render_eda_markdown(_sample_profile())
    expected_headings = (
        "# EDA — sales",
        "## Overview",
        "## Columnas",
        "## Numéricas",
        "## Categóricas",
    )
    for heading in expected_headings:
        assert heading in rendered
|
||
|
||
|
||
def test_contains_column_names():
    """Both sample column names show up somewhere in the rendered markdown."""
    rendered = render_eda_markdown(_sample_profile())
    for column_name in ("price", "region"):
        assert column_name in rendered
|
||
|
||
|
||
def test_contains_sparkline():
    """The histogram is rendered as a sparkline made of block characters."""
    rendered = render_eda_markdown(_sample_profile())
    # The sparkline is introduced by a "histogram: `" label ...
    assert "histogram: `" in rendered
    # ... and at least one of the eight block glyphs must be present.
    assert set("▁▂▃▄▅▆▇█").intersection(rendered)
|
||
|
||
|
||
def test_pct_fields_scaled_by_100():
    """`*_pct` fields are 0-1 fractions; the renderer must display them x100."""
    rendered = render_eda_markdown(_sample_profile())
    # unique_pct=0.857 must show as 85.7x%, and the categorical top
    # pct=0.5 as 50.0x%.
    for scaled_value in ("85.7", "50.0"):
        assert scaled_value in rendered
    # The unscaled fraction must never leak through as "0.86%".
    assert "0.86%" not in rendered
|
||
|
||
|
||
def test_outlier_pct_not_double_scaled():
    """`outlier_pct` is already on a 0-100 scale and must not be multiplied again.

    outlier_pct=3.5 has to render as "3.5%", never as "350%" (the
    double x100 bug).
    """
    rendered = render_eda_markdown(_sample_profile())
    shown_as_is = "3.5%" in rendered
    shown_double_scaled = "350" in rendered
    assert shown_as_is
    assert not shown_double_scaled
|
||
|
||
|
||
def test_pct_handles_none_as_blank():
    """A column whose pct fields are None renders without "None%" and without crashing."""
    column = {
        "name": "c",
        "inferred_type": "float",
        "null_pct": None,
        "unique_pct": None,
        "quality_score": 0.5,
    }
    rendered = render_eda_markdown({"table": "t", "columns": [column]})
    # Reaching this line already proves the renderer did not raise.
    assert "None%" not in rendered
|
||
|
||
|
||
def test_tolerates_none_correlations_and_llm():
    """With correlations and llm set to None their sections are omitted, the rest stays."""
    profile = _sample_profile(correlations=None, llm=None)
    rendered = render_eda_markdown(profile)
    for optional_heading in ("## Correlaciones", "## Análisis LLM"):
        assert optional_heading not in rendered
    # The main body is still produced.
    assert "# EDA — sales" in rendered
|
||
|
||
|
||
def test_tolerates_empty_profile():
    """An empty profile dict still renders, titled with the "(unnamed)" placeholder."""
    empty_profile = {}
    rendered = render_eda_markdown(empty_profile)
    assert "# EDA — (unnamed)" in rendered
|
||
|
||
|
||
def test_tolerates_none_profile():
    """Passing None instead of a profile still renders the "(unnamed)" title."""
    missing_profile = None
    rendered = render_eda_markdown(missing_profile)
    assert "# EDA — (unnamed)" in rendered
|
||
|
||
|
||
def _sample_models():
|
||
"""Bloque `models` como el que produce run_eda_models (PCA/KMeans/...)."""
|
||
return {
|
||
"n_numeric_cols": 3,
|
||
"pca": {
|
||
"n_components": 2,
|
||
"n_rows_used": 1000,
|
||
"n_features": 3,
|
||
"explained_variance_ratio": [0.62, 0.21],
|
||
"cumulative": [0.62, 0.83],
|
||
"top_loadings": [
|
||
{"component": 0, "feature": "price", "loading": 0.71},
|
||
{"component": 1, "feature": "qty", "loading": -0.55},
|
||
],
|
||
},
|
||
"kmeans": {
|
||
"best_k": 3,
|
||
"silhouette": 0.48,
|
||
"cluster_sizes": [500, 300, 200],
|
||
"scores_by_k": [
|
||
{"k": 2, "silhouette": 0.41, "inertia": 1200.0},
|
||
{"k": 3, "silhouette": 0.48, "inertia": 900.0},
|
||
],
|
||
},
|
||
"outliers": {
|
||
"n_outliers": 35,
|
||
"outlier_pct": 3.5,
|
||
"threshold": -0.51,
|
||
},
|
||
"normality": {
|
||
"price": {"jarque_bera": {"p": 0.0001}, "is_normal": False},
|
||
},
|
||
"note": "",
|
||
}
|
||
|
||
|
||
def test_models_section_rendered():
    """H4: a populated `models` block gets its own markdown section.

    Checks the section headings, some PCA/KMeans values, and that the
    model-level outlier rate is shown as-is (it is already on a 0-100
    scale) rather than multiplied by 100 again.
    """
    models = _sample_models()
    # The sample profile's `price` column already has outlier_pct=3.5, which
    # by itself puts "3.5%" into the output. Give the model a different rate
    # so the assertions below can only be satisfied by the Modelos section.
    models["outliers"]["n_outliers"] = 72
    models["outliers"]["outlier_pct"] = 7.2

    profile = _sample_profile()
    profile["models"] = models
    md = render_eda_markdown(profile)

    for heading in (
        "## Modelos",
        "### PCA",
        "### KMeans",
        "### Outliers multivariante (Isolation Forest)",
        "### Normalidad",
    ):
        assert heading in md

    # Real PCA data rendered (explained variance x100) and the KMeans pick.
    assert "62.0" in md  # explained_variance_ratio 0.62 -> 62.00%
    assert "mejor k = 3" in md

    # Model outlier_pct is already 0-100: 7.2 -> "7.2%", never "720...".
    assert "7.2%" in md
    assert "720" not in md
|
||
|
||
|
||
def test_models_absent_when_none():
    """Edge case: a profile whose models is None yields no Modelos section and no crash."""
    # The sample profile is built with models=None.
    rendered = render_eda_markdown(_sample_profile())
    models_heading_present = "## Modelos" in rendered
    assert not models_heading_present
|