"""Tests for render_eda_markdown."""
import sys
import os

sys.path.insert(0, os.path.dirname(__file__))

from render_eda_markdown import render_eda_markdown


def _sample_profile(correlations=None, llm=None):
    """Build a two-column profile fixture (numeric `price`, categorical `region`).

    A fresh dict is built on every call, so tests may mutate the result.
    `correlations` and `llm` are stored verbatim under the keys of the same
    name; `models` is always None in this fixture.
    """
    return {
        "table": "sales",
        "source": "data/sales.csv",
        "profiled_at": "2026-06-20T10:00:00Z",
        "n_rows": 1000,
        "n_cols": 2,
        "size_bytes": 40960,
        "duplicate_rows": 3,
        "duplicate_pct": 0.003,
        "constant_cols": [],
        "all_null_cols": [],
        "null_cell_pct": 0.015,
        "type_breakdown": {"numeric": 1, "categorical": 1},
        "quality_score": 0.92,
        "key_candidates": ["order_id"],
        "correlations": correlations,
        "llm": llm,
        "models": None,
        "columns": [
            {
                "name": "price",
                "physical_type": "DOUBLE",
                "inferred_type": "float",
                "semantic_type": "currency",
                "count": 1000,
                "n_rows": 1000,
                "null_count": 0,
                "null_pct": 0.0,
                "distinct_count": 857,
                "unique_pct": 0.857,
                "flags": [],
                "quality_score": 0.95,
                "numeric": {
                    "min": 1.0,
                    "max": 99.0,
                    "mean": 42.5,
                    "median": 40.0,
                    "std": 12.3,
                    "p25": 30.0,
                    "p75": 55.0,
                    "p95": 80.0,
                    "p99": 95.0,
                    "skew": 0.4,
                    "kurtosis": 2.1,
                    # outlier_pct already arrives on a 0-100 scale from
                    # describe_numeric (100 * n_outliers / n), NOT as a
                    # 0-1 fraction.
                    "outlier_pct": 3.5,
                    "distribution_type": "right-skewed",
                    "histogram": [
                        {"lo": 0, "hi": 25, "count": 100},
                        {"lo": 25, "hi": 50, "count": 500},
                        {"lo": 50, "hi": 75, "count": 300},
                        {"lo": 75, "hi": 100, "count": 50},
                    ],
                },
                "categorical": None,
                "datetime": None,
            },
            {
                "name": "region",
                "physical_type": "VARCHAR",
                "inferred_type": "string",
                "semantic_type": "category",
                "count": 1000,
                "n_rows": 1000,
                "null_count": 10,
                "null_pct": 0.01,
                "distinct_count": 3,
                "unique_pct": 0.003,
                "flags": ["low_cardinality"],
                "quality_score": 0.80,
                "numeric": None,
                "categorical": {
                    "top": [
                        {"value": "north", "count": 500, "pct": 0.5},
                        {"value": "south", "count": 300, "pct": 0.3},
                        {"value": "east", "count": 200, "pct": 0.2},
                    ],
                    "mode": "north",
                    "mode_pct": 0.5,
                    "n_distinct": 3,
                    "entropy": 1.48,
                },
                "datetime": None,
            },
        ],
    }


def test_contains_title_and_sections():
    """The title and the main section headings are present."""
    md = render_eda_markdown(_sample_profile())
    assert "# EDA — sales" in md
    assert "## Overview" in md
    assert "## Columnas" in md
    assert "## Numéricas" in md
    assert "## Categóricas" in md


def test_contains_column_names():
    """Both fixture column names appear in the output."""
    md = render_eda_markdown(_sample_profile())
    assert "price" in md
    assert "region" in md


def test_contains_sparkline():
    """The histogram is rendered as a sparkline."""
    md = render_eda_markdown(_sample_profile())
    # Histogram sparkline must render with block characters.
    assert "histogram: `" in md
    assert any(block in md for block in "▁▂▃▄▅▆▇█")


def test_pct_fields_scaled_by_100():
    """Fractional *_pct fields are shown multiplied by 100."""
    # *_pct fields are fractions 0-1; the render must show them ×100.
    md = render_eda_markdown(_sample_profile())
    # unique_pct=0.857 -> "85.70%" (must NOT show the raw "0.86%").
    assert "85.7" in md
    assert "0.86%" not in md
    # categorical top pct=0.5 -> "50.0%".
    assert "50.0" in md


def test_outlier_pct_not_double_scaled():
    """The column-level outlier_pct is not multiplied by 100 a second time."""
    # outlier_pct already arrives on a 0-100 scale (describe_numeric): the
    # render shows it as-is plus '%', WITHOUT multiplying by 100 again.
    # outlier_pct=3.5 -> "3.5%", never "350%" (the double-×100 bug).
    md = render_eda_markdown(_sample_profile())
    assert "3.5%" in md
    assert "350" not in md


def test_pct_handles_none_as_blank():
    """None percentages never leak into the output as the text "None%"."""
    profile = {
        "table": "t",
        "columns": [
            {
                "name": "c",
                "inferred_type": "float",
                "null_pct": None,
                "unique_pct": None,
                "quality_score": 0.5,
            }
        ],
    }
    # None pct renders as empty cell, never "None%" or a crash.
    md = render_eda_markdown(profile)
    assert "None%" not in md


def test_tolerates_none_correlations_and_llm():
    """None correlations/llm drop their sections but keep the main body."""
    md = render_eda_markdown(_sample_profile(correlations=None, llm=None))
    assert "## Correlaciones" not in md
    assert "## Análisis LLM" not in md
    # Still produced the main body.
    assert "# EDA — sales" in md


def test_tolerates_empty_profile():
    """An empty profile still renders a title with the placeholder name."""
    md = render_eda_markdown({})
    assert "# EDA — (unnamed)" in md


def test_tolerates_none_profile():
    """A None profile still renders a title with the placeholder name."""
    md = render_eda_markdown(None)
    assert "# EDA — (unnamed)" in md


def _sample_models():
    """`models` block like the one produced by run_eda_models (PCA/KMeans/...)."""
    return {
        "n_numeric_cols": 3,
        "pca": {
            "n_components": 2,
            "n_rows_used": 1000,
            "n_features": 3,
            "explained_variance_ratio": [0.62, 0.21],
            "cumulative": [0.62, 0.83],
            "top_loadings": [
                {"component": 0, "feature": "price", "loading": 0.71},
                {"component": 1, "feature": "qty", "loading": -0.55},
            ],
        },
        "kmeans": {
            "best_k": 3,
            "silhouette": 0.48,
            "cluster_sizes": [500, 300, 200],
            "scores_by_k": [
                {"k": 2, "silhouette": 0.41, "inertia": 1200.0},
                {"k": 3, "silhouette": 0.48, "inertia": 900.0},
            ],
        },
        "outliers": {
            "n_outliers": 35,
            "outlier_pct": 3.5,
            "threshold": -0.51,
        },
        "normality": {
            "price": {"jarque_bera": {"p": 0.0001}, "is_normal": False},
        },
        "note": "",
    }


def test_models_section_rendered():
    """The models block is rendered with its sub-sections and real values."""
    # H4: the models block used to be omitted from the markdown; it now has
    # a formatter.
    profile = _sample_profile()
    profile["models"] = _sample_models()
    # The column fixture also carries outlier_pct=3.5, which on its own puts
    # "3.5%" in the output (see test_outlier_pct_not_double_scaled). Move the
    # column-level value away so the "3.5%" check below can only be satisfied
    # by the models block.
    profile["columns"][0]["numeric"]["outlier_pct"] = 1.0
    md = render_eda_markdown(profile)
    assert "## Modelos" in md
    assert "### PCA" in md
    assert "### KMeans" in md
    assert "### Outliers multivariante (Isolation Forest)" in md
    assert "### Normalidad" in md
    # Real PCA data rendered (explained variance ×100) and KMeans.
    assert "62.0" in md  # explained_variance_ratio 0.62 -> 62.00%
    assert "mejor k = 3" in md
    # The model's outlier_pct already arrives on a 0-100 scale:
    # 3.5 -> "3.5%", not "350".
    assert "3.5%" in md


def test_models_absent_when_none():
    """A profile whose models entry is None renders no Modelos section."""
    # Edge: a profile without models (None) produces no Modelos section and
    # does not break.
    md = render_eda_markdown(_sample_profile())  # models=None in the sample
    assert "## Modelos" not in md