Files
fn_registry/python/functions/pipelines/run_sales_forecast.py
T
egutierrez 5a4f82cf76 chore: auto-commit (26 archivos)
- python/functions/bigquery/bq_auth.md
- python/functions/bigquery/bq_load_from_file.md
- python/functions/bigquery/bq_load_from_gcs.md
- python/functions/bigquery/client.py
- python/functions/bigquery/queries.py
- python/functions/datascience/__init__.py
- python/functions/datascience/decode_qr_image.py
- python/functions/datascience/load_bq_table_to_duckdb.md
- python/functions/datascience/load_bq_table_to_duckdb.py
- python/functions/pipelines/profile_bq_table.md
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-02 19:00:13 +02:00

299 lines
12 KiB
Python

"""run_sales_forecast — forecast diario de ventas Aurgi a BigQuery (one-shot).
Pipeline IMPURO que produce el forecast diario de ventas (dia x centro x
subcategoria CGQ) y lo escribe en `autingo-159109.sales_forecast.predictions`.
Compone funciones del registry sin reimplementar su logica:
- bq_auth(..., drop_quota_project=True): cliente BigQuery sin quota project ajeno
(evita el 403 USER_PROJECT_DENIED del ADC del usuario).
- bq_query: lee la historia agregada del mart `bi_ventas_mart.base_margenes_aa`
y ejecuta el DELETE de idempotencia (parametros tipados).
- forecast_seasonal_median: modelo PURO (mediana estacional + tendencia acotada)
que genera todas las predicciones de golpe.
- bq_load_from_file: carga las filas (JSONL) a la tabla de predicciones.
Cron previsto: 21:00. Por eso la historia utilizable llega hasta as_of - 1 dia
(el dia as_of aun esta parcial) y se predice as_of + 1 .. as_of + horizon.
Estilo dict-no-throw: nunca lanza; captura errores y devuelve
{status:'error', error, stage}. Idempotente por (run_date, model, author):
borra las predicciones previas de esa combinacion antes de cargar.
"""
import json
import os
import sys
import tempfile
from datetime import date, datetime, timedelta, timezone
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from bigquery import bq_auth, bq_query, bq_load_from_file
from datascience import forecast_seasonal_median
# BigQuery project used for the client and for the destination tables below.
PROJECT_ID = "autingo-159109"
# Fully-qualified mart table the sales history is read from.
SOURCE_TABLE = "autingo-159109.bi_ventas_mart.base_margenes_aa"
# Dataset and table that receive the forecast rows.
DEST_DATASET = "sales_forecast"
DEST_TABLE = "predictions"
# Daily sales summed per (fecha, idCentro, subcat_cqq) over the window
# [@as_of - 18 weeks, @as_of - 1 day]. Rows with a NULL venta_n, an absolute
# venta_n of 1e9 or more, a NULL subcategory or a NULL centre are excluded.
# Takes one DATE parameter, @as_of.
# NOTE(review): the source column is spelled `subcat_cqq` here while the
# destination column is `subcat_cgq` — confirm the mart's real column name.
HISTORY_SQL = f"""
SELECT fecha, idCentro, subcat_cqq,
ANY_VALUE(NombreCentro) AS center_name, ANY_VALUE(Ambito) AS ambito,
SUM(CAST(venta_n AS FLOAT64)) AS venta
FROM `{SOURCE_TABLE}`
WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL 18 WEEK) AND DATE_SUB(@as_of, INTERVAL 1 DAY)
AND venta_n IS NOT NULL AND ABS(CAST(venta_n AS FLOAT64)) < 1e9
AND subcat_cqq IS NOT NULL AND idCentro IS NOT NULL
GROUP BY fecha, idCentro, subcat_cqq
"""
# Removes the prediction rows of one (run_date, model, author) combination.
# Parameters: @d (run date), @m (model), @a (author).
DELETE_SQL = (
f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.{DEST_TABLE}` "
"WHERE run_date = @d AND model = @m AND author = @a"
)
# Refresh of the physical actuals table (sales_forecast.actuals_daily), which
# is consumed by the forecast_eval view and by the competition dashboards. A
# rolling window is used so retroactive corrections in the mart are picked up.
# Both statements take @as_of (DATE) and @w (window length in days) and cover
# [@as_of - @w days, @as_of - 1 day].
ACTUALS_DELETE_SQL = (
f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.actuals_daily` "
"WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)"
)
# Re-inserts the same window from `actuals`, replacing a NULL center_id or
# subcat_cgq with a placeholder label and stamping loaded_ts with the current
# timestamp.
ACTUALS_INSERT_SQL = f"""
INSERT INTO `{PROJECT_ID}.{DEST_DATASET}.actuals_daily`
(fecha, center_id, center_name, ambito, subcat_cgq, y_real, unidades, loaded_ts)
SELECT forecast_date, IFNULL(center_id, 'SIN_CENTRO'), center_name, ambito,
IFNULL(subcat_cgq, 'Sin subcategoria'),
y_real, unidades, CURRENT_TIMESTAMP()
FROM `{PROJECT_ID}.{DEST_DATASET}.actuals`
WHERE forecast_date BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)
"""
def _refresh_actuals(client, as_of: date, window_days: int = 10) -> None:
    """Rebuild the last `window_days` days of actuals_daily from the actuals view.

    Runs ACTUALS_DELETE_SQL and then ACTUALS_INSERT_SQL through `bq_query`,
    both with the same typed parameters: `as_of` (DATE) and `w` (INT64, the
    window length in days).

    Args:
        client: BigQuery client handed to `bq_query`.
        as_of: reference date; the refreshed window ends at as_of - 1 day.
        window_days: number of days to go back from `as_of`. Default 10.
    """
    query_params = [
        {"name": name, "type": bq_type, "value": value}
        for name, bq_type, value in (
            ("as_of", "DATE", as_of),
            ("w", "INT64", window_days),
        )
    ]
    # Order matters: clear the window first, then repopulate it.
    for statement in (ACTUALS_DELETE_SQL, ACTUALS_INSERT_SQL):
        bq_query(client, statement, params=query_params)
def _as_date(value) -> date:
    """Coerce a `date`, a `datetime` or a 'YYYY-MM-DD...' string into a plain `date`.

    Anything that is neither a `date` nor a `datetime` is converted with
    `str()` and its first 10 characters are parsed as '%Y-%m-%d'.
    """
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    text = str(value)
    return datetime.strptime(text[:10], "%Y-%m-%d").date()
def run_sales_forecast(
    as_of: str = "",
    horizon: int = 7,
    model: str = "baseline_v1",
    author: str = "egutierrez",
    dry_run: bool = False,
) -> dict:
    """Generate the daily sales forecast and write it to BigQuery.

    Reads the aggregated history with HISTORY_SQL, keeps the series with
    positive sales in the last 8 weeks of history, asks
    `forecast_seasonal_median` for all predictions in one call and, unless
    `dry_run` is set, deletes the rows previously stored for
    (run_date, model, author), loads the new rows and refreshes the actuals
    table.

    Args:
        as_of: cut-off date 'YYYY-MM-DD' (the day of the run). Empty = today.
            Usable history ends at as_of - 1 day; predictions cover
            as_of + 1 .. as_of + horizon. run_date = as_of.
        horizon: number of future days to predict; must be an integer >= 1.
            Default 7.
        model: model label written to every row (column model). Default
            'baseline_v1'.
        author: author of the run (column author). Default 'egutierrez'.
        dry_run: when True nothing is written to BigQuery; the summary plus a
            sample of rows is returned.

    Returns:
        A dict; this function does not raise. On success
        {status:'ok', run_date, series, rows, model, author}, plus `sample`
        on a dry run, or `rows_loaded`, `job_id`, `actuals_refreshed` (and
        `actuals_error` when the refresh failed) after a real load. On error
        {status:'error', error, stage} where stage is 'validate' (bad
        horizon), 'load' (load job did not finish DONE) or 'unexpected'.
    """
    try:
        # Reject a non-positive horizon before touching BigQuery: it yields no
        # forecast dates, and a non-dry run could otherwise still reach the
        # delete of the predictions already stored for this run.
        if not isinstance(horizon, int) or horizon < 1:
            return {
                "status": "error",
                "error": f"horizon must be an integer >= 1, got {horizon!r}",
                "stage": "validate",
            }

        run_d = _as_date(as_of) if as_of else date.today()
        # Last usable day of history (inclusive): as_of - 1 day.
        hist_as_of = run_d - timedelta(days=1)
        horizon_dates = [
            (run_d + timedelta(days=k)).isoformat() for k in range(1, horizon + 1)
        ]

        # 1) BigQuery client without quota project (avoids 403 USER_PROJECT_DENIED).
        client = bq_auth(PROJECT_ID, drop_quota_project=True)

        # 2) Aggregated history from the mart (up to run_d - 1 via the query's WHERE).
        q = bq_query(
            client,
            HISTORY_SQL,
            params=[{"name": "as_of", "type": "DATE", "value": run_d}],
        )
        cols = {name: i for i, name in enumerate(q["columns"])}

        # History per series + latest known center_name/ambito + 8-week sales.
        history = []
        last_meta = {}  # series_id -> (max_date, center_name, ambito, center_id, subcat)
        recent_sum = {}  # series_id -> sales accumulated over the last 8 weeks
        active_cutoff = hist_as_of - timedelta(weeks=8)
        for row in q["rows"]:
            fecha = _as_date(row[cols["fecha"]])
            center_id = str(row[cols["idCentro"]])
            subcat = row[cols["subcat_cqq"]]
            center_name = row[cols["center_name"]]
            ambito = row[cols["ambito"]]
            venta = float(row[cols["venta"]] or 0.0)
            series_id = f"{center_id}|{subcat}"
            history.append(
                {"series_id": series_id, "date": fecha.isoformat(), "value": venta}
            )
            prev = last_meta.get(series_id)
            if prev is None or fecha > prev[0]:
                last_meta[series_id] = (fecha, center_name, ambito, center_id, subcat)
            if fecha > active_cutoff:
                recent_sum[series_id] = recent_sum.get(series_id, 0.0) + venta

        # 3) Active series: sales > 0 in the last 8 weeks.
        active = {sid for sid, s in recent_sum.items() if s > 0.0}
        history = [h for h in history if h["series_id"] in active]
        if not history:
            # Nothing to forecast: report an empty run without writing anything.
            result = {
                "status": "ok",
                "run_date": run_d.isoformat(),
                "series": 0,
                "rows": 0,
                "model": model,
                "author": author,
            }
            if dry_run:
                result["sample"] = []
            return result

        # 4) Pure model: all predictions in one call.
        preds = forecast_seasonal_median(
            history, horizon_dates, as_of=hist_as_of.isoformat()
        )

        # 5) Rows for the predictions table.
        run_ts = datetime.now(timezone.utc).isoformat()
        rows_out = []
        for p in preds:
            sid = p["series_id"]
            meta = last_meta.get(sid)
            if meta is None:
                # Fail with a readable message instead of an unpacking error.
                raise ValueError(f"prediction for unknown series_id: {sid!r}")
            _, center_name, ambito, center_id, subcat = meta
            forecast_date = _as_date(p["date"])
            rows_out.append(
                {
                    "run_ts": run_ts,
                    "run_date": run_d.isoformat(),
                    "forecast_date": forecast_date.isoformat(),
                    "lag_days": (forecast_date - run_d).days,
                    "center_id": center_id,
                    "center_name": center_name,
                    "ambito": ambito,
                    "subcat_cgq": subcat,
                    "model": model,
                    "author": author,
                    "y_pred": round(float(p["y_pred"]), 4),
                }
            )

        summary = {
            "status": "ok",
            "run_date": run_d.isoformat(),
            "series": len(active),
            "rows": len(rows_out),
            "model": model,
            "author": author,
        }

        # 6) Dry run: write nothing; return the summary plus a sample.
        if dry_run:
            summary["sample"] = rows_out[:5]
            return summary

        # 7) Idempotency: delete the previous predictions of (run_date, model, author).
        bq_query(
            client,
            DELETE_SQL,
            params=[
                {"name": "d", "type": "DATE", "value": run_d},
                {"name": "m", "type": "STRING", "value": model},
                {"name": "a", "type": "STRING", "value": author},
            ],
        )

        # 8) Load the rows as JSONL into the table (WRITE_APPEND, no autodetect).
        tmp_path = None
        try:
            fd, tmp_path = tempfile.mkstemp(prefix="sales_forecast_", suffix=".jsonl")
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                for r in rows_out:
                    fh.write(json.dumps(r, ensure_ascii=False) + "\n")
            load = bq_load_from_file(
                client,
                tmp_path,
                DEST_DATASET,
                DEST_TABLE,
                source_format="NEWLINE_DELIMITED_JSON",
                write_disposition="WRITE_APPEND",
                autodetect=False,
            )
        finally:
            # Always remove the temporary file, even when the load raised.
            if tmp_path and os.path.exists(tmp_path):
                os.remove(tmp_path)

        if load.get("status") != "DONE":
            return {
                "status": "error",
                "error": f"load job no termino DONE: {load}",
                "stage": "load",
            }
        summary["rows_loaded"] = load.get("rows_loaded")
        summary["job_id"] = load.get("job_id")

        # 9) Refresh the physical actuals table (default rolling window) so
        #    that forecast_eval and the competition dashboard compare against
        #    the latest state of the mart.
        try:
            _refresh_actuals(client, run_d)
            summary["actuals_refreshed"] = True
        except Exception as e:  # noqa: BLE001
            # Does not invalidate the predictions already loaded: report and go on.
            summary["actuals_refreshed"] = False
            summary["actuals_error"] = str(e)
        return summary
    except Exception as e:  # noqa: BLE001
        # Deliberate catch-all: errors are reported in the result dict.
        return {"status": "error", "error": str(e), "stage": "unexpected"}
if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the flags, run the pipeline once, print
    # the result as one JSON line and exit with 0 on 'ok', 1 otherwise.
    cli = argparse.ArgumentParser(
        description="Forecast diario de ventas Aurgi -> BigQuery sales_forecast.predictions."
    )
    # Flag -> add_argument keyword arguments, registered in this order.
    cli_options = (
        ("--as-of", {"default": "", "help": "Fecha de corte YYYY-MM-DD (vacio = hoy)."}),
        ("--horizon", {"type": int, "default": 7, "help": "Dias a predecir. Default 7."}),
        ("--model", {"default": "baseline_v1", "help": "Etiqueta del modelo."}),
        ("--author", {"default": "egutierrez", "help": "Autor de la corrida."}),
        (
            "--dry-run",
            {"action": "store_true", "help": "No escribe en BigQuery; imprime muestra."},
        ),
    )
    for flag, option_kwargs in cli_options:
        cli.add_argument(flag, **option_kwargs)
    opts = cli.parse_args()

    # The parsed attributes (as_of, horizon, model, author, dry_run) carry the
    # same names as the keyword arguments of run_sales_forecast.
    result = run_sales_forecast(**vars(opts))
    print(json.dumps(result, ensure_ascii=False, default=str))
    exit_code = 0 if result.get("status") == "ok" else 1
    sys.exit(exit_code)