# fn_registry/python/functions/datascience/render_eda_markdown_test.py

"""Tests for render_eda_markdown."""

import sys
import os

# Put this file's own directory at the front of sys.path so the
# `render_eda_markdown` import that follows is looked up there first
# (presumably the module under test lives next to this test file).
sys.path.insert(0, os.path.dirname(__file__))

from render_eda_markdown import render_eda_markdown


def _sample_profile(correlations=None, llm=None):
    return {
        "table": "sales",
        "source": "data/sales.csv",
        "profiled_at": "2026-06-20T10:00:00Z",
        "n_rows": 1000,
        "n_cols": 2,
        "size_bytes": 40960,
        "duplicate_rows": 3,
        "duplicate_pct": 0.003,
        "constant_cols": [],
        "all_null_cols": [],
        "null_cell_pct": 0.015,
        "type_breakdown": {"numeric": 1, "categorical": 1},
        "quality_score": 0.92,
        "key_candidates": ["order_id"],
        "correlations": correlations,
        "llm": llm,
        "models": None,
        "columns": [
            {
                "name": "price",
                "physical_type": "DOUBLE",
                "inferred_type": "float",
                "semantic_type": "currency",
                "count": 1000,
                "n_rows": 1000,
                "null_count": 0,
                "null_pct": 0.0,
                "distinct_count": 857,
                "unique_pct": 0.857,
                "flags": [],
                "quality_score": 0.95,
                "numeric": {
                    "min": 1.0,
                    "max": 99.0,
                    "mean": 42.5,
                    "median": 40.0,
                    "std": 12.3,
                    "p25": 30.0,
                    "p75": 55.0,
                    "p95": 80.0,
                    "p99": 95.0,
                    "skew": 0.4,
                    "kurtosis": 2.1,
                    # outlier_pct ya viene en escala 0-100 desde describe_numeric
                    # (100 * n_outliers / n), NO en fracción 0-1.
                    "outlier_pct": 3.5,
                    "distribution_type": "right-skewed",
                    "histogram": [
                        {"lo": 0, "hi": 25, "count": 100},
                        {"lo": 25, "hi": 50, "count": 500},
                        {"lo": 50, "hi": 75, "count": 300},
                        {"lo": 75, "hi": 100, "count": 50},
                    ],
                },
                "categorical": None,
                "datetime": None,
            },
            {
                "name": "region",
                "physical_type": "VARCHAR",
                "inferred_type": "string",
                "semantic_type": "category",
                "count": 1000,
                "n_rows": 1000,
                "null_count": 10,
                "null_pct": 0.01,
                "distinct_count": 3,
                "unique_pct": 0.003,
                "flags": ["low_cardinality"],
                "quality_score": 0.80,
                "numeric": None,
                "categorical": {
                    "top": [
                        {"value": "north", "count": 500, "pct": 0.5},
                        {"value": "south", "count": 300, "pct": 0.3},
                        {"value": "east", "count": 200, "pct": 0.2},
                    ],
                    "mode": "north",
                    "mode_pct": 0.5,
                    "n_distinct": 3,
                    "entropy": 1.48,
                },
                "datetime": None,
            },
        ],
    }


def test_contains_title_and_sections():
    """The rendered report carries the title and each main section heading."""
    rendered = render_eda_markdown(_sample_profile())
    expected_headings = (
        "# EDA — sales",
        "## Overview",
        "## Columnas",
        "## Numéricas",
        "## Categóricas",
    )
    for heading in expected_headings:
        assert heading in rendered


def test_contains_column_names():
    """Both sample column names show up somewhere in the rendered markdown."""
    rendered = render_eda_markdown(_sample_profile())
    absent = [name for name in ("price", "region") if name not in rendered]
    assert not absent


def test_contains_sparkline():
    """The numeric histogram is rendered as a block-character sparkline."""
    rendered = render_eda_markdown(_sample_profile())
    # The sparkline label must be followed by an opening backtick.
    assert "histogram: `" in rendered
    # At least one of the eight block glyphs has to appear in the output.
    spark_glyphs = set("▁▂▃▄▅▆▇█")
    assert spark_glyphs.intersection(rendered)


def test_pct_fields_scaled_by_100():
    """Fractional ``*_pct`` fields (0-1) are displayed multiplied by 100."""
    rendered = render_eda_markdown(_sample_profile())

    # unique_pct=0.857 must come out as "85.70%", never as the raw "0.86%".
    scaled_unique, unscaled_unique = "85.7", "0.86%"
    assert scaled_unique in rendered
    assert unscaled_unique not in rendered

    # The top categorical entry has pct=0.5, which must come out as "50.0%".
    scaled_top_share = "50.0"
    assert scaled_top_share in rendered


def test_outlier_pct_not_double_scaled():
    """``outlier_pct`` is printed as-is, without a second multiplication by 100.

    The value already arrives on a 0-100 scale (from describe_numeric), so the
    renderer only appends '%'. outlier_pct=3.5 must read "3.5%", never "350%"
    (the double-scaling bug).
    """
    rendered = render_eda_markdown(_sample_profile())
    expected, double_scaled = "3.5%", "350"
    assert expected in rendered
    assert double_scaled not in rendered


def test_pct_handles_none_as_blank():
    """A ``None`` percentage never surfaces as the literal text ``None%``."""
    column_without_pcts = {
        "name": "c",
        "inferred_type": "float",
        "null_pct": None,
        "unique_pct": None,
        "quality_score": 0.5,
    }
    minimal_profile = {"table": "t", "columns": [column_without_pcts]}

    # Rendering must complete without raising, and the missing percentages
    # must not be stringified into "None%".
    rendered = render_eda_markdown(minimal_profile)
    assert "None%" not in rendered


def test_tolerates_none_correlations_and_llm():
    """With ``correlations`` and ``llm`` set to None, their sections are omitted."""
    profile = _sample_profile(correlations=None, llm=None)
    rendered = render_eda_markdown(profile)
    for optional_heading in ("## Correlaciones", "## Análisis LLM"):
        assert optional_heading not in rendered
    # The main body is still produced.
    assert "# EDA — sales" in rendered


def test_tolerates_empty_profile():
    """An empty dict still renders, under the ``(unnamed)`` placeholder title."""
    empty_profile = {}
    assert "# EDA — (unnamed)" in render_eda_markdown(empty_profile)


def test_tolerates_none_profile():
    """A ``None`` profile still renders, under the ``(unnamed)`` placeholder title."""
    missing_profile = None
    assert "# EDA — (unnamed)" in render_eda_markdown(missing_profile)


def _sample_models():
    """Bloque `models` como el que produce run_eda_models (PCA/KMeans/...)."""
    return {
        "n_numeric_cols": 3,
        "pca": {
            "n_components": 2,
            "n_rows_used": 1000,
            "n_features": 3,
            "explained_variance_ratio": [0.62, 0.21],
            "cumulative": [0.62, 0.83],
            "top_loadings": [
                {"component": 0, "feature": "price", "loading": 0.71},
                {"component": 1, "feature": "qty", "loading": -0.55},
            ],
        },
        "kmeans": {
            "best_k": 3,
            "silhouette": 0.48,
            "cluster_sizes": [500, 300, 200],
            "scores_by_k": [
                {"k": 2, "silhouette": 0.41, "inertia": 1200.0},
                {"k": 3, "silhouette": 0.48, "inertia": 900.0},
            ],
        },
        "outliers": {
            "n_outliers": 35,
            "outlier_pct": 3.5,
            "threshold": -0.51,
        },
        "normality": {
            "price": {"jarque_bera": {"p": 0.0001}, "is_normal": False},
        },
        "note": "",
    }


def test_models_section_rendered():
    """A populated ``models`` block is rendered as its own "Modelos" section.

    H4: the models block used to be omitted from the markdown output; it now
    has a formatter. The test checks the section headings, a PCA value, the
    KMeans summary, and the model-level outlier percentage.
    """
    profile = _sample_profile()
    profile["models"] = _sample_models()
    md = render_eda_markdown(profile)

    outlier_heading = "### Outliers multivariante (Isolation Forest)"
    for heading in (
        "## Modelos",
        "### PCA",
        "### KMeans",
        outlier_heading,
        "### Normalidad",
    ):
        assert heading in md

    # Real PCA data (explained variance x100) and the KMeans summary.
    assert "62.0" in md  # explained_variance_ratio 0.62 -> 62.00%
    assert "mejor k = 3" in md

    # The model's outlier_pct is already on a 0-100 scale: 3.5 -> "3.5%", not
    # "350". The sample profile's `price` column ALSO has outlier_pct=3.5, so a
    # whole-document search for "3.5%" would pass even if the models section
    # dropped or double-scaled its value. Restrict the check to the text
    # between the Isolation Forest heading and the next markdown heading.
    outlier_section = md.split(outlier_heading, 1)[1].split("\n#", 1)[0]
    assert "3.5%" in outlier_section


def test_models_absent_when_none():
    """Edge case: a profile whose ``models`` is None renders no "Modelos" section."""
    profile_without_models = _sample_profile()  # the sample sets models=None
    rendered = render_eda_markdown(profile_without_models)
    assert "## Modelos" not in rendered