5a4f82cf76
- python/functions/bigquery/bq_auth.md - python/functions/bigquery/bq_load_from_file.md - python/functions/bigquery/bq_load_from_gcs.md - python/functions/bigquery/client.py - python/functions/bigquery/queries.py - python/functions/datascience/__init__.py - python/functions/datascience/decode_qr_image.py - python/functions/datascience/load_bq_table_to_duckdb.md - python/functions/datascience/load_bq_table_to_duckdb.py - python/functions/pipelines/profile_bq_table.md - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
299 lines
12 KiB
Python
299 lines
12 KiB
Python
"""run_sales_forecast — forecast diario de ventas Aurgi a BigQuery (one-shot).
|
|
|
|
Pipeline IMPURO que produce el forecast diario de ventas (dia x centro x
|
|
subcategoria CGQ) y lo escribe en `autingo-159109.sales_forecast.predictions`.
|
|
Compone funciones del registry sin reimplementar su logica:
|
|
|
|
- bq_auth(..., drop_quota_project=True): cliente BigQuery sin quota project ajeno
|
|
(evita el 403 USER_PROJECT_DENIED del ADC del usuario).
|
|
- bq_query: lee la historia agregada del mart `bi_ventas_mart.base_margenes_aa`
|
|
y ejecuta el DELETE de idempotencia (parametros tipados).
|
|
- forecast_seasonal_median: modelo PURO (mediana estacional + tendencia acotada)
|
|
que genera todas las predicciones de golpe.
|
|
- bq_load_from_file: carga las filas (JSONL) a la tabla de predicciones.
|
|
|
|
Cron previsto: 21:00. Por eso la historia utilizable llega hasta as_of - 1 dia
|
|
(el dia as_of aun esta parcial) y se predice as_of + 1 .. as_of + horizon.
|
|
|
|
Estilo dict-no-throw: nunca lanza; captura errores y devuelve
|
|
{status:'error', error, stage}. Idempotente por (run_date, model, author):
|
|
borra las predicciones previas de esa combinacion antes de cargar.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
from datetime import date, datetime, timedelta, timezone
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from bigquery import bq_auth, bq_query, bq_load_from_file
|
|
from datascience import forecast_seasonal_median
|
|
|
|
# BigQuery project id: used to build the client and to qualify the
# destination tables referenced by the SQL templates below.
PROJECT_ID = "autingo-159109"
# Fully-qualified source table read by HISTORY_SQL.
SOURCE_TABLE = "autingo-159109.bi_ventas_mart.base_margenes_aa"
# Destination dataset / table that receives the forecast rows.
DEST_DATASET = "sales_forecast"
DEST_TABLE = "predictions"

# Daily sales aggregated per (fecha, idCentro, subcat_cqq) over the window
# [@as_of - 18 weeks, @as_of - 1 day]. Rows with a NULL amount, an absolute
# amount >= 1e9, or a NULL subcategory / centre are excluded before summing.
# Expects one query parameter: @as_of (DATE).
# NOTE(review): the source column is spelled `subcat_cqq` here while the
# destination column is `subcat_cgq` -- confirm this matches the mart schema.
HISTORY_SQL = f"""
SELECT fecha, idCentro, subcat_cqq,
ANY_VALUE(NombreCentro) AS center_name, ANY_VALUE(Ambito) AS ambito,
SUM(CAST(venta_n AS FLOAT64)) AS venta
FROM `{SOURCE_TABLE}`
WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL 18 WEEK) AND DATE_SUB(@as_of, INTERVAL 1 DAY)
AND venta_n IS NOT NULL AND ABS(CAST(venta_n AS FLOAT64)) < 1e9
AND subcat_cqq IS NOT NULL AND idCentro IS NOT NULL
GROUP BY fecha, idCentro, subcat_cqq
"""

# Idempotency delete: removes the predictions previously written for one
# (run_date, model, author) combination. Parameters: @d (DATE), @m and @a
# (STRING).
DELETE_SQL = (
    f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.{DEST_TABLE}` "
    "WHERE run_date = @d AND model = @m AND author = @a"
)

# Refresh of the physical actuals table (sales_forecast.actuals_daily), which
# is consumed by the forecast_eval view and by the competition dashboards. A
# moving window is rewritten so retroactive corrections in the mart are
# picked up. Both statements take @as_of (DATE) and @w (INT64, window length
# in days) and cover [@as_of - @w days, @as_of - 1 day].
ACTUALS_DELETE_SQL = (
    f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.actuals_daily` "
    "WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)"
)

# Re-inserts the same window from `actuals`, replacing NULL centre ids and
# NULL subcategories with placeholder labels and stamping the load time.
ACTUALS_INSERT_SQL = f"""
INSERT INTO `{PROJECT_ID}.{DEST_DATASET}.actuals_daily`
(fecha, center_id, center_name, ambito, subcat_cgq, y_real, unidades, loaded_ts)
SELECT forecast_date, IFNULL(center_id, 'SIN_CENTRO'), center_name, ambito,
IFNULL(subcat_cgq, 'Sin subcategoria'),
y_real, unidades, CURRENT_TIMESTAMP()
FROM `{PROJECT_ID}.{DEST_DATASET}.actuals`
WHERE forecast_date BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)
"""
|
|
|
|
|
|
def _refresh_actuals(client, as_of: date, window_days: int = 10) -> None:
    """Rebuild the last `window_days` days of actuals_daily from `actuals`.

    Args:
        client: BigQuery client handed straight to ``bq_query``.
        as_of: run date; the refreshed window ends on ``as_of - 1 day``.
        window_days: length of the moving window in days. Default 10.

    Any exception raised by ``bq_query`` propagates to the caller.
    """
    window_params = [
        {"name": "as_of", "type": "DATE", "value": as_of},
        {"name": "w", "type": "INT64", "value": window_days},
    ]
    # Order matters: clear the window first, then repopulate it.
    for statement in (ACTUALS_DELETE_SQL, ACTUALS_INSERT_SQL):
        bq_query(client, statement, params=window_params)
|
|
|
|
|
|
def _as_date(value) -> date:
|
|
if isinstance(value, date) and not isinstance(value, datetime):
|
|
return value
|
|
if isinstance(value, datetime):
|
|
return value.date()
|
|
return datetime.strptime(str(value)[:10], "%Y-%m-%d").date()
|
|
|
|
|
|
def _aggregate_history(query_result: dict, hist_as_of: date):
    """Turn the history query result into model input plus per-series metadata.

    Args:
        query_result: mapping with "columns" (column names) and "rows" (row
            sequences), as returned by ``bq_query`` for HISTORY_SQL.
        hist_as_of: last usable history day (inclusive).

    Returns:
        Tuple ``(history, last_meta, active)``:
        - history: list of ``{"series_id", "date", "value"}`` dicts, limited
          to active series.
        - last_meta: series_id -> (latest date, center_name, ambito,
          center_id, subcat) taken from the most recent row of the series.
        - active: set of series ids whose summed sales over the 8 weeks
          ending at ``hist_as_of`` are strictly positive.
    """
    col = {name: i for i, name in enumerate(query_result["columns"])}
    history = []
    last_meta = {}
    recent_sum = {}
    active_cutoff = hist_as_of - timedelta(weeks=8)

    for row in query_result["rows"]:
        fecha = _as_date(row[col["fecha"]])
        center_id = str(row[col["idCentro"]])
        subcat = row[col["subcat_cqq"]]
        center_name = row[col["center_name"]]
        ambito = row[col["ambito"]]
        venta = float(row[col["venta"]] or 0.0)
        series_id = f"{center_id}|{subcat}"

        history.append(
            {"series_id": series_id, "date": fecha.isoformat(), "value": venta}
        )
        prev = last_meta.get(series_id)
        if prev is None or fecha > prev[0]:
            last_meta[series_id] = (fecha, center_name, ambito, center_id, subcat)
        if fecha > active_cutoff:
            recent_sum[series_id] = recent_sum.get(series_id, 0.0) + venta

    active = {sid for sid, total in recent_sum.items() if total > 0.0}
    history = [h for h in history if h["series_id"] in active]
    return history, last_meta, active


def _build_prediction_rows(preds, last_meta, run_d: date, run_ts: str, model: str, author: str):
    """Map model predictions to rows shaped for the predictions table.

    Raises:
        ValueError: if a prediction refers to a series that is absent from
            ``last_meta`` (instead of failing with an opaque unpacking error).
    """
    run_date_iso = run_d.isoformat()
    rows_out = []
    for p in preds:
        sid = p["series_id"]
        meta = last_meta.get(sid)
        if meta is None:
            raise ValueError(f"prediccion para una serie desconocida: {sid!r}")
        _, center_name, ambito, center_id, subcat = meta
        forecast_date = _as_date(p["date"])
        rows_out.append(
            {
                "run_ts": run_ts,
                "run_date": run_date_iso,
                "forecast_date": forecast_date.isoformat(),
                "lag_days": (forecast_date - run_d).days,
                "center_id": center_id,
                "center_name": center_name,
                "ambito": ambito,
                "subcat_cgq": subcat,
                "model": model,
                "author": author,
                "y_pred": round(float(p["y_pred"]), 4),
            }
        )
    return rows_out


def _write_jsonl(rows) -> str:
    """Write ``rows`` as newline-delimited JSON to a temp file and return its path.

    The caller owns the returned file and must delete it. If writing fails the
    partially written file is removed before the exception propagates.
    """
    fd, path = tempfile.mkstemp(prefix="sales_forecast_", suffix=".jsonl")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)
    except BaseException:
        # Cleanup only; the original exception is re-raised untouched.
        if os.path.exists(path):
            os.remove(path)
        raise
    return path


def run_sales_forecast(
    as_of: str = "",
    horizon: int = 7,
    model: str = "baseline_v1",
    author: str = "egutierrez",
    dry_run: bool = False,
) -> dict:
    """Generate the daily sales forecast and write it to BigQuery.

    Args:
        as_of: cut-off date 'YYYY-MM-DD' (the day of the run). Empty = today.
            Usable history ends at as_of - 1 day; the forecast covers
            as_of + 1 .. as_of + horizon. run_date = as_of.
        horizon: number of future days to predict; must be >= 1. Default 7.
        model: model label written on every row (column model). Default
            'baseline_v1'.
        author: author of the run (column author). Default 'egutierrez'.
        dry_run: if True nothing is written to BigQuery; the summary plus a
            sample of up to 5 rows is returned.

    Returns:
        dict, never raises. On success {status:'ok', run_date, series, rows,
        model, author} plus `sample` on dry runs, or plus rows_loaded, job_id
        and actuals_refreshed (and actuals_error when the refresh failed) on
        real runs. On failure {status:'error', error, stage}, where `stage`
        names the step that failed: validate, auth, history, forecast,
        build_rows, serialize, delete or load.
    """
    stage = "validate"
    try:
        # An empty horizon would still run the idempotency DELETE below and
        # wipe earlier predictions without writing anything back.
        if horizon < 1:
            return {
                "status": "error",
                "error": f"horizon debe ser >= 1 (recibido {horizon})",
                "stage": stage,
            }

        run_d = _as_date(as_of) if as_of else date.today()
        # Last usable history day (inclusive): as_of - 1 day.
        hist_as_of = run_d - timedelta(days=1)
        horizon_dates = [
            (run_d + timedelta(days=k)).isoformat() for k in range(1, horizon + 1)
        ]

        # 1) BigQuery client without quota project (avoids 403 USER_PROJECT_DENIED).
        stage = "auth"
        client = bq_auth(PROJECT_ID, drop_quota_project=True)

        # 2) Aggregated history from the mart (up to run_d - 1 via the WHERE clause).
        stage = "history"
        q = bq_query(
            client,
            HISTORY_SQL,
            params=[{"name": "as_of", "type": "DATE", "value": run_d}],
        )

        # 3) Keep only active series: sales > 0 in the last 8 weeks.
        history, last_meta, active = _aggregate_history(q, hist_as_of)

        if not history:
            result = {
                "status": "ok",
                "run_date": run_d.isoformat(),
                "series": 0,
                "rows": 0,
                "model": model,
                "author": author,
            }
            if dry_run:
                result["sample"] = []
            return result

        # 4) Pure model: every prediction in a single call.
        stage = "forecast"
        preds = forecast_seasonal_median(
            history, horizon_dates, as_of=hist_as_of.isoformat()
        )

        # 5) Rows for the predictions table.
        stage = "build_rows"
        run_ts = datetime.now(timezone.utc).isoformat()
        rows_out = _build_prediction_rows(
            preds, last_meta, run_d, run_ts, model, author
        )

        summary = {
            "status": "ok",
            "run_date": run_d.isoformat(),
            "series": len(active),
            "rows": len(rows_out),
            "model": model,
            "author": author,
        }

        # 6) Dry run: nothing is written; return summary + sample.
        if dry_run:
            summary["sample"] = rows_out[:5]
            return summary

        tmp_path = None
        try:
            # 7) Serialise BEFORE deleting, so a local write failure cannot
            #    leave the table without the previous predictions.
            stage = "serialize"
            tmp_path = _write_jsonl(rows_out)

            # 8) Idempotency: drop earlier predictions of (run_date, model, author).
            stage = "delete"
            bq_query(
                client,
                DELETE_SQL,
                params=[
                    {"name": "d", "type": "DATE", "value": run_d},
                    {"name": "m", "type": "STRING", "value": model},
                    {"name": "a", "type": "STRING", "value": author},
                ],
            )

            # 9) Load the JSONL file (WRITE_APPEND, fixed table schema).
            stage = "load"
            load = bq_load_from_file(
                client,
                tmp_path,
                DEST_DATASET,
                DEST_TABLE,
                source_format="NEWLINE_DELIMITED_JSON",
                write_disposition="WRITE_APPEND",
                autodetect=False,
            )
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.remove(tmp_path)

        if load.get("status") != "DONE":
            return {
                "status": "error",
                "error": f"load job no termino DONE: {load}",
                "stage": "load",
            }

        summary["rows_loaded"] = load.get("rows_loaded")
        summary["job_id"] = load.get("job_id")

        # 10) Refresh the physical actuals table (10-day moving window) so
        #     forecast_eval and the competition dashboard compare against the
        #     latest state of the mart.
        try:
            _refresh_actuals(client, run_d)
            summary["actuals_refreshed"] = True
        except Exception as e:  # noqa: BLE001
            # Does not invalidate the predictions already loaded: report and go on.
            summary["actuals_refreshed"] = False
            summary["actuals_error"] = str(e)

        return summary
    except Exception as e:  # noqa: BLE001
        # dict-no-throw boundary: report the failure and the step it happened in.
        return {"status": "error", "error": str(e), "stage": stage}
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line front end: each flag maps one-to-one onto a keyword
    # argument of run_sales_forecast.
    cli = argparse.ArgumentParser(
        description="Forecast diario de ventas Aurgi -> BigQuery sales_forecast.predictions."
    )
    cli.add_argument("--as-of", default="", help="Fecha de corte YYYY-MM-DD (vacio = hoy).")
    cli.add_argument("--horizon", type=int, default=7, help="Dias a predecir. Default 7.")
    cli.add_argument("--model", default="baseline_v1", help="Etiqueta del modelo.")
    cli.add_argument("--author", default="egutierrez", help="Autor de la corrida.")
    cli.add_argument(
        "--dry-run", action="store_true", help="No escribe en BigQuery; imprime muestra."
    )
    opts = cli.parse_args()

    # The argparse destinations (as_of, horizon, model, author, dry_run) are
    # exactly the parameter names of run_sales_forecast.
    outcome = run_sales_forecast(**vars(opts))

    # Print the result as a single JSON document; exit code 0 only on success.
    print(json.dumps(outcome, ensure_ascii=False, default=str))
    exit_code = 0 if outcome.get("status") == "ok" else 1
    sys.exit(exit_code)
|