Files
fn_registry/python/functions/datascience/render_eda_markdown_test.py
T
Egutierrez c4cff5ed5b feat(eda): render de models en markdown + PDF DB-level para profile_database (H4,H9)
- H4: render_eda_markdown anade seccion Modelos (PCA/KMeans/normalidad/outliers);
  render_eda_pdf formatea models/series/caveats como tablas (no str(dict) crudo)
- H9: profile_database gana flag emit_pdf -> PDF movil DB-level (resumen tablas +
  join graph) via render_eda_pdf_relational; clave report_pdf_path
- aditivos y retrocompatibles (flags default False). 38 tests verdes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 04:05:38 +02:00

235 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests para render_eda_markdown."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from render_eda_markdown import render_eda_markdown
def _sample_profile(correlations=None, llm=None):
return {
"table": "sales",
"source": "data/sales.csv",
"profiled_at": "2026-06-20T10:00:00Z",
"n_rows": 1000,
"n_cols": 2,
"size_bytes": 40960,
"duplicate_rows": 3,
"duplicate_pct": 0.003,
"constant_cols": [],
"all_null_cols": [],
"null_cell_pct": 0.015,
"type_breakdown": {"numeric": 1, "categorical": 1},
"quality_score": 0.92,
"key_candidates": ["order_id"],
"correlations": correlations,
"llm": llm,
"models": None,
"columns": [
{
"name": "price",
"physical_type": "DOUBLE",
"inferred_type": "float",
"semantic_type": "currency",
"count": 1000,
"n_rows": 1000,
"null_count": 0,
"null_pct": 0.0,
"distinct_count": 857,
"unique_pct": 0.857,
"flags": [],
"quality_score": 0.95,
"numeric": {
"min": 1.0,
"max": 99.0,
"mean": 42.5,
"median": 40.0,
"std": 12.3,
"p25": 30.0,
"p75": 55.0,
"p95": 80.0,
"p99": 95.0,
"skew": 0.4,
"kurtosis": 2.1,
# outlier_pct ya viene en escala 0-100 desde describe_numeric
# (100 * n_outliers / n), NO en fracción 0-1.
"outlier_pct": 3.5,
"distribution_type": "right-skewed",
"histogram": [
{"lo": 0, "hi": 25, "count": 100},
{"lo": 25, "hi": 50, "count": 500},
{"lo": 50, "hi": 75, "count": 300},
{"lo": 75, "hi": 100, "count": 50},
],
},
"categorical": None,
"datetime": None,
},
{
"name": "region",
"physical_type": "VARCHAR",
"inferred_type": "string",
"semantic_type": "category",
"count": 1000,
"n_rows": 1000,
"null_count": 10,
"null_pct": 0.01,
"distinct_count": 3,
"unique_pct": 0.003,
"flags": ["low_cardinality"],
"quality_score": 0.80,
"numeric": None,
"categorical": {
"top": [
{"value": "north", "count": 500, "pct": 0.5},
{"value": "south", "count": 300, "pct": 0.3},
{"value": "east", "count": 200, "pct": 0.2},
],
"mode": "north",
"mode_pct": 0.5,
"n_distinct": 3,
"entropy": 1.48,
},
"datetime": None,
},
],
}
def test_contains_title_and_sections():
md = render_eda_markdown(_sample_profile())
assert "# EDA — sales" in md
assert "## Overview" in md
assert "## Columnas" in md
assert "## Numéricas" in md
assert "## Categóricas" in md
def test_contains_column_names():
md = render_eda_markdown(_sample_profile())
assert "price" in md
assert "region" in md
def test_contains_sparkline():
md = render_eda_markdown(_sample_profile())
# Histogram sparkline must render with block characters.
assert "histogram: `" in md
assert any(block in md for block in "▁▂▃▄▅▆▇█")
def test_pct_fields_scaled_by_100():
# *_pct fields are fractions 0-1; the render must show them ×100.
md = render_eda_markdown(_sample_profile())
# unique_pct=0.857 -> "85.70%" (must NOT show the raw "0.86%").
assert "85.7" in md
assert "0.86%" not in md
# categorical top pct=0.5 -> "50.0%".
assert "50.0" in md
def test_outlier_pct_not_double_scaled():
# outlier_pct ya viene en escala 0-100 (describe_numeric): el render lo muestra
# tal cual + '%', SIN multiplicar otra vez por 100. outlier_pct=3.5 -> "3.5%",
# nunca "350%" (el bug del doble ×100).
md = render_eda_markdown(_sample_profile())
assert "3.5%" in md
assert "350" not in md
def test_pct_handles_none_as_blank():
profile = {
"table": "t",
"columns": [
{
"name": "c",
"inferred_type": "float",
"null_pct": None,
"unique_pct": None,
"quality_score": 0.5,
}
],
}
# None pct renders as empty cell, never "None%" or a crash.
md = render_eda_markdown(profile)
assert "None%" not in md
def test_tolerates_none_correlations_and_llm():
md = render_eda_markdown(_sample_profile(correlations=None, llm=None))
assert "## Correlaciones" not in md
assert "## Análisis LLM" not in md
# Still produced the main body.
assert "# EDA — sales" in md
def test_tolerates_empty_profile():
md = render_eda_markdown({})
assert "# EDA — (unnamed)" in md
def test_tolerates_none_profile():
md = render_eda_markdown(None)
assert "# EDA — (unnamed)" in md
def _sample_models():
"""Bloque `models` como el que produce run_eda_models (PCA/KMeans/...)."""
return {
"n_numeric_cols": 3,
"pca": {
"n_components": 2,
"n_rows_used": 1000,
"n_features": 3,
"explained_variance_ratio": [0.62, 0.21],
"cumulative": [0.62, 0.83],
"top_loadings": [
{"component": 0, "feature": "price", "loading": 0.71},
{"component": 1, "feature": "qty", "loading": -0.55},
],
},
"kmeans": {
"best_k": 3,
"silhouette": 0.48,
"cluster_sizes": [500, 300, 200],
"scores_by_k": [
{"k": 2, "silhouette": 0.41, "inertia": 1200.0},
{"k": 3, "silhouette": 0.48, "inertia": 900.0},
],
},
"outliers": {
"n_outliers": 35,
"outlier_pct": 3.5,
"threshold": -0.51,
},
"normality": {
"price": {"jarque_bera": {"p": 0.0001}, "is_normal": False},
},
"note": "",
}
def test_models_section_rendered():
# H4: el bloque models antes se omitía en markdown; ahora tiene formatter.
profile = _sample_profile()
profile["models"] = _sample_models()
md = render_eda_markdown(profile)
assert "## Modelos" in md
assert "### PCA" in md
assert "### KMeans" in md
assert "### Outliers multivariante (Isolation Forest)" in md
assert "### Normalidad" in md
# Datos reales del PCA renderizados (varianza explicada ×100) y KMeans.
assert "62.0" in md # explained_variance_ratio 0.62 -> 62.00%
assert "mejor k = 3" in md
# outlier_pct del modelo ya viene en escala 0-100: 3.5 -> "3.5%", no "350".
assert "3.5%" in md
def test_models_absent_when_none():
# Edge: profile sin models (None) no produce sección Modelos ni rompe.
md = render_eda_markdown(_sample_profile()) # models=None en el sample
assert "## Modelos" not in md