Files
fn_registry/python/functions/datascience/render_eda_markdown_test.py
T
Egutierrez caf8c25d99 fix(eda): bugs de bajo riesgo del benchmark (H1,H5,H12,H13,H14) + tests faltantes
- H1: render_eda_markdown ya no aplica doble x100 a outlier_pct (336% -> real)
- H5: profile_database filtra base_tables_only (excluye VIEWs; sakila 21->16)
- H12: suggest_reexpression salta columnas no-continuas
- H13: to_returns/profile_table elige retornos (financiera) vs diferencias (fisica)
- H14: test de regresion ATTACH sqlite via information_schema
- +8 tests de las funciones eda nuevas (acf_pacf, adf_kpss, ...). 77 tests verdes
- L/M (H2,H3,H4,H6,H7,H8,H9,H10,H11) quedan en issues 0174-0177 para revision

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 03:51:11 +02:00

176 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests para render_eda_markdown."""
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from render_eda_markdown import render_eda_markdown
def _sample_profile(correlations=None, llm=None):
    """Build the table-profile fixture shared by the tests in this module.

    The fixture describes a table named ``sales`` with two columns: a
    numeric ``price`` column and a categorical ``region`` column.

    Args:
        correlations: Value stored verbatim under the ``"correlations"`` key.
        llm: Value stored verbatim under the ``"llm"`` key.

    Returns:
        A freshly built ``dict``; every call creates new nested containers,
        so callers may mutate the result freely.
    """
    price_histogram = [
        {"lo": 0, "hi": 25, "count": 100},
        {"lo": 25, "hi": 50, "count": 500},
        {"lo": 50, "hi": 75, "count": 300},
        {"lo": 75, "hi": 100, "count": 50},
    ]
    price_stats = {
        "min": 1.0,
        "max": 99.0,
        "mean": 42.5,
        "median": 40.0,
        "std": 12.3,
        "p25": 30.0,
        "p75": 55.0,
        "p95": 80.0,
        "p99": 95.0,
        "skew": 0.4,
        "kurtosis": 2.1,
        # outlier_pct already arrives on a 0-100 scale from describe_numeric
        # (100 * n_outliers / n), NOT as a 0-1 fraction.
        "outlier_pct": 3.5,
        "distribution_type": "right-skewed",
        "histogram": price_histogram,
    }
    price_column = {
        "name": "price",
        "physical_type": "DOUBLE",
        "inferred_type": "float",
        "semantic_type": "currency",
        "count": 1000,
        "n_rows": 1000,
        "null_count": 0,
        "null_pct": 0.0,
        "distinct_count": 857,
        "unique_pct": 0.857,
        "flags": [],
        "quality_score": 0.95,
        "numeric": price_stats,
        "categorical": None,
        "datetime": None,
    }

    region_top_values = [
        {"value": "north", "count": 500, "pct": 0.5},
        {"value": "south", "count": 300, "pct": 0.3},
        {"value": "east", "count": 200, "pct": 0.2},
    ]
    region_stats = {
        "top": region_top_values,
        "mode": "north",
        "mode_pct": 0.5,
        "n_distinct": 3,
        "entropy": 1.48,
    }
    region_column = {
        "name": "region",
        "physical_type": "VARCHAR",
        "inferred_type": "string",
        "semantic_type": "category",
        "count": 1000,
        "n_rows": 1000,
        "null_count": 10,
        "null_pct": 0.01,
        "distinct_count": 3,
        "unique_pct": 0.003,
        "flags": ["low_cardinality"],
        "quality_score": 0.80,
        "numeric": None,
        "categorical": region_stats,
        "datetime": None,
    }

    return {
        "table": "sales",
        "source": "data/sales.csv",
        "profiled_at": "2026-06-20T10:00:00Z",
        "n_rows": 1000,
        "n_cols": 2,
        "size_bytes": 40960,
        "duplicate_rows": 3,
        "duplicate_pct": 0.003,
        "constant_cols": [],
        "all_null_cols": [],
        "null_cell_pct": 0.015,
        "type_breakdown": {"numeric": 1, "categorical": 1},
        "quality_score": 0.92,
        "key_candidates": ["order_id"],
        "correlations": correlations,
        "llm": llm,
        "models": None,
        "columns": [price_column, region_column],
    }
def test_contains_title_and_sections():
    """The rendered report contains the title and every main section heading."""
    report = render_eda_markdown(_sample_profile())
    expected_headings = (
        "# EDA — sales",
        "## Overview",
        "## Columnas",
        "## Numéricas",
        "## Categóricas",
    )
    for heading in expected_headings:
        assert heading in report
def test_contains_column_names():
    """Both column names of the sample profile appear in the rendered report."""
    report = render_eda_markdown(_sample_profile())
    for column_name in ("price", "region"):
        assert column_name in report
def test_contains_sparkline():
    """The histogram is rendered as a sparkline made of block characters."""
    report = render_eda_markdown(_sample_profile())
    spark_blocks = set("▁▂▃▄▅▆▇█")
    assert "histogram: `" in report
    # At least one block character must occur somewhere in the output.
    assert not spark_blocks.isdisjoint(report)
def test_pct_fields_scaled_by_100():
    """Fractional ``*_pct`` fields (0-1) are expected to be displayed x100."""
    report = render_eda_markdown(_sample_profile())

    # unique_pct=0.857 should surface as 85.7..., never as the raw "0.86%".
    shows_scaled_unique = "85.7" in report
    shows_raw_unique = "0.86%" in report
    assert shows_scaled_unique
    assert not shows_raw_unique

    # The top categorical value has pct=0.5, expected as 50.0.
    assert "50.0" in report
def test_outlier_pct_not_double_scaled():
    """``outlier_pct`` is expected verbatim plus '%', not multiplied by 100.

    The fixture supplies outlier_pct=3.5 already on a 0-100 scale, so the
    report should read "3.5%" and never "350%" (the double-x100 regression).
    """
    report = render_eda_markdown(_sample_profile())
    double_scaled = "350" in report
    assert "3.5%" in report
    assert not double_scaled
def test_pct_handles_none_as_blank():
    """A ``None`` percentage must not be rendered as "None%" nor crash."""
    column_without_pcts = {
        "name": "c",
        "inferred_type": "float",
        "null_pct": None,
        "unique_pct": None,
        "quality_score": 0.5,
    }
    minimal_profile = {"table": "t", "columns": [column_without_pcts]}

    report = render_eda_markdown(minimal_profile)
    assert "None%" not in report
def test_tolerates_none_correlations_and_llm():
    """With no correlations and no LLM data, those sections are omitted."""
    report = render_eda_markdown(_sample_profile(correlations=None, llm=None))
    for optional_heading in ("## Correlaciones", "## Análisis LLM"):
        assert optional_heading not in report
    # The main body is still produced.
    assert "# EDA — sales" in report
def test_tolerates_empty_profile():
    """An empty profile dict still yields a report titled "(unnamed)"."""
    assert "# EDA — (unnamed)" in render_eda_markdown({})
def test_tolerates_none_profile():
    """Passing ``None`` as the profile still yields a report titled "(unnamed)"."""
    assert "# EDA — (unnamed)" in render_eda_markdown(None)