feat(eda): capítulo TIMESERIES del AutomaticEDA (evolución + análisis de serie)
Capítulo nuevo build_timeseries(profile, ctx) -> Chapter|None del motor AutomaticEDA. Cuando la tabla tiene columna de fecha/datetime, grafica la evolución de cada columna numérica por periodo (valor agregado + conteo de filas) y los paneles de descomposición STL y autocorrelación (ACF), con el análisis de la serie: estacionariedad (ADF+KPSS), autocorrelación (Ljung-Box), fuerzas de tendencia/estacionalidad (Hyndman) y la transformación sugerida (retornos o diferencias) para evitar correlaciones espurias. Sin columna temporal devuelve None. Consolida series OHLC casi idénticas en un único gráfico conservando el análisis de cada columna. La serie cruda llega por ctx['timeseries_raw'] (mismo patrón que modelos con raw_numeric); las figuras son perezosas (Figure.make) y el paginador del núcleo garantiza no-corte en PDF y PPTX. CHAPTER_VERSION 1.0.0. Cubre los MUST del diseño (report 2043): MUST-9.1 (línea valor-vs-tiempo + conteo por periodo), MUST-9.2 (paneles STL + ACF), MUST-9.3 (perfil datetime + consolidación OHLC). Funciones nuevas del registry (grupo eda), delegadas a fn-constructor, no inline: - detect_time_column (pure): detecta la columna temporal y las numéricas - profile_datetime (pure): rango/frecuencia/regularidad/huecos de la fecha - resample_timeseries (pure): agrega la serie por periodo + conteo - extract_timeseries_raw (impure): lee la serie cruda ordenada de DuckDB/PG Verificación: 69 tests verdes (capítulo 9 + funciones 28 + núcleo/renderers); golden real sobre seattle-weather (estacional) y aapl (OHLC) con PDF+PPTX sin cortar nada (cols_cortadas=[]). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
"""Tests para resample_timeseries (grupo eda)."""
|
||||
|
||||
import datetime
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from resample_timeseries import resample_timeseries
|
||||
|
||||
|
||||
def test_daily_a_mensual_mean():
    """Daily observations resampled to monthly buckets with ``agg="mean"``.

    Checks the bucket labels, the per-bucket mean, the per-bucket row count
    and the metadata fields echoed back in the result dict.
    """
    samples = [
        ("2020-01-01", 10.0),
        ("2020-01-15", 20.0),
        ("2020-02-01", 30.0),
        ("2020-02-10", 40.0),
        ("2020-02-20", 50.0),
    ]
    stamps = [stamp for stamp, _ in samples]
    values = [value for _, value in samples]

    result = resample_timeseries(stamps, values, freq="monthly", agg="mean")

    expected = {
        "t": ["2020-01-01", "2020-02-01"],
        "v": [15.0, 40.0],  # (10+20)/2 ; (30+40+50)/3
        "count": [2, 3],
        "freq": "monthly",
        "agg": "mean",
        "n_in": 5,
        "n_buckets": 2,
    }
    for key, wanted in expected.items():
        assert result[key] == wanted
    # Identity check on purpose: the flag must be the bool False itself.
    assert result["downsampled"] is False
    assert result["note"] == ""
|
||||
|
||||
|
||||
def test_agg_sum_y_last():
    """Monthly resampling with ``agg="sum"`` and with ``agg="last"``."""
    ordered_stamps = [
        "2020-01-01",
        "2020-01-15",
        "2020-02-01",
        "2020-02-10",
        "2020-02-20",
    ]
    ordered_values = [10.0, 20.0, 30.0, 40.0, 50.0]

    summed = resample_timeseries(
        ordered_stamps, ordered_values, freq="monthly", agg="sum"
    )
    assert summed["v"] == [30.0, 120.0]
    assert summed["agg"] == "sum"

    # "last" must pick the chronologically most recent observation of each
    # bucket, even when the input arrives out of order.
    shuffled_stamps = [
        "2020-02-20",
        "2020-02-01",
        "2020-02-10",
        "2020-01-15",
        "2020-01-01",
    ]
    shuffled_values = [50.0, 30.0, 40.0, 20.0, 10.0]

    latest = resample_timeseries(
        shuffled_stamps, shuffled_values, freq="monthly", agg="last"
    )
    assert latest["t"] == ["2020-01-01", "2020-02-01"]
    # Jan -> 2020-01-15 = 20 ; Feb -> 2020-02-20 = 50
    assert latest["v"] == [20.0, 50.0]
    assert latest["agg"] == "last"
|
||||
|
||||
|
||||
def test_count_cuenta_observacion_con_valor_none():
    """Rows with a missing value still count, but are left out of the aggregate."""
    # One bucket containing a None value: count includes the row, v skips it.
    march_stamps = ["2020-03-05", "2020-03-06", "2020-03-20"]
    march_values = [None, 7.0, 9.0]

    partial = resample_timeseries(
        march_stamps, march_values, freq="monthly", agg="mean"
    )

    assert partial["t"] == ["2020-03-01"]
    assert partial["count"] == [3]  # 3 rows with a valid date
    assert partial["v"] == [8.0]  # mean of the valid values: (7+9)/2
    assert partial["n_in"] == 3

    # A bucket with no valid numeric value at all -> v is None, count remains.
    april_stamps = ["2020-04-01", "2020-04-02"]
    april_values = [None, "n/a"]

    hollow = resample_timeseries(april_stamps, april_values, freq="monthly")

    assert hollow["t"] == ["2020-04-01"]
    assert hollow["count"] == [2]
    assert hollow["v"] == [None]
|
||||
|
||||
|
||||
def test_downsampling_respeta_max_points_y_extremos():
    """500 daily buckets with ``max_points=400`` are downsampled, keeping both ends."""
    total_days = 500
    start = datetime.date(2021, 1, 1)
    days = [start + datetime.timedelta(days=offset) for offset in range(total_days)]
    stamps = [day.isoformat() for day in days]
    values = [float(offset) for offset in range(total_days)]

    result = resample_timeseries(
        stamps, values, freq="daily", agg="mean", max_points=400
    )

    assert result["n_buckets"] == 500
    assert result["downsampled"] is True
    assert len(result["t"]) <= 400
    # The three output series must stay aligned after downsampling.
    assert len(result["t"]) == len(result["v"]) == len(result["count"])

    # First and last bucket are preserved.
    last_stamp = (start + datetime.timedelta(days=499)).isoformat()
    assert result["t"][0] == "2021-01-01"
    assert result["t"][-1] == last_stamp
|
||||
|
||||
|
||||
def test_freq_auto_infiere_mensual():
    """``freq="auto"`` resolves to "monthly" or "daily" depending on date spacing."""
    # Dates roughly one month apart -> auto infers "monthly".
    months = range(1, 13)
    monthly_stamps = [f"2022-{month:02d}-01" for month in months]
    monthly_values = [float(month) for month in months]

    monthly = resample_timeseries(
        monthly_stamps, monthly_values, freq="auto", agg="mean"
    )

    assert monthly["freq"] == "monthly"
    assert monthly["n_buckets"] == 12
    assert monthly["count"] == [1] * 12

    # Consecutive daily dates -> auto infers "daily".
    first_day = datetime.date(2023, 1, 1)
    daily_stamps = [
        (first_day + datetime.timedelta(days=offset)).isoformat()
        for offset in range(20)
    ]
    daily_values = [float(offset) for offset in range(20)]

    daily = resample_timeseries(daily_stamps, daily_values, freq="auto")
    assert daily["freq"] == "daily"
|
||||
|
||||
|
||||
def test_edge_listas_vacias_o_desiguales():
    """Empty, length-mismatched or all-invalid inputs report "datos insuficientes"."""
    empty = resample_timeseries([], [])
    assert all(empty[key] == [] for key in ("t", "v", "count"))
    assert empty["note"] == "datos insuficientes"
    assert all(empty[key] == 0 for key in ("n_in", "n_buckets"))

    # Timestamps and values of different lengths.
    mismatched = resample_timeseries(["2020-01-01", "2020-01-02"], [1.0])
    assert mismatched["note"] == "datos insuficientes"
    assert mismatched["t"] == []

    # Every date is invalid -> also reported as insufficient.
    unparseable = resample_timeseries(["no-fecha", "tampoco"], [1.0, 2.0])
    assert unparseable["note"] == "datos insuficientes"
    assert unparseable["n_in"] == 0
|
||||
Reference in New Issue
Block a user