"""run_sales_forecast — forecast diario de ventas Aurgi a BigQuery (one-shot). Pipeline IMPURO que produce el forecast diario de ventas (dia x centro x subcategoria CGQ) y lo escribe en `autingo-159109.sales_forecast.predictions`. Compone funciones del registry sin reimplementar su logica: - bq_auth(..., drop_quota_project=True): cliente BigQuery sin quota project ajeno (evita el 403 USER_PROJECT_DENIED del ADC del usuario). - bq_query: lee la historia agregada del mart `bi_ventas_mart.base_margenes_aa` y ejecuta el DELETE de idempotencia (parametros tipados). - forecast_seasonal_median: modelo PURO (mediana estacional + tendencia acotada) que genera todas las predicciones de golpe. - bq_load_from_file: carga las filas (JSONL) a la tabla de predicciones. Cron previsto: 21:00. Por eso la historia utilizable llega hasta as_of - 1 dia (el dia as_of aun esta parcial) y se predice as_of + 1 .. as_of + horizon. Estilo dict-no-throw: nunca lanza; captura errores y devuelve {status:'error', error, stage}. Idempotente por (run_date, model, author): borra las predicciones previas de esa combinacion antes de cargar. 
""" import json import os import sys import tempfile from datetime import date, datetime, timedelta, timezone sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from bigquery import bq_auth, bq_query, bq_load_from_file from datascience import forecast_seasonal_median PROJECT_ID = "autingo-159109" SOURCE_TABLE = "autingo-159109.bi_ventas_mart.base_margenes_aa" DEST_DATASET = "sales_forecast" DEST_TABLE = "predictions" HISTORY_SQL = f""" SELECT fecha, idCentro, subcat_cqq, ANY_VALUE(NombreCentro) AS center_name, ANY_VALUE(Ambito) AS ambito, SUM(CAST(venta_n AS FLOAT64)) AS venta FROM `{SOURCE_TABLE}` WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL 18 WEEK) AND DATE_SUB(@as_of, INTERVAL 1 DAY) AND venta_n IS NOT NULL AND ABS(CAST(venta_n AS FLOAT64)) < 1e9 AND subcat_cqq IS NOT NULL AND idCentro IS NOT NULL GROUP BY fecha, idCentro, subcat_cqq """ DELETE_SQL = ( f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.{DEST_TABLE}` " "WHERE run_date = @d AND model = @m AND author = @a" ) # Refresh de la tabla fisica de reales (sales_forecast.actuals_daily), consumida # por la vista forecast_eval y por los dashboards de competicion. Ventana movil # para recoger correcciones retroactivas del mart. 
ACTUALS_DELETE_SQL = (
    f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.actuals_daily` "
    "WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)"
)

# Re-inserts the same date window from `actuals`, defaulting NULL center and
# subcategory ids. Kept on a single line so the literal text is unchanged.
ACTUALS_INSERT_SQL = f""" INSERT INTO `{PROJECT_ID}.{DEST_DATASET}.actuals_daily` (fecha, center_id, center_name, ambito, subcat_cgq, y_real, unidades, loaded_ts) SELECT forecast_date, IFNULL(center_id, 'SIN_CENTRO'), center_name, ambito, IFNULL(subcat_cgq, 'Sin subcategoria'), y_real, unidades, CURRENT_TIMESTAMP() FROM `{PROJECT_ID}.{DEST_DATASET}.actuals` WHERE forecast_date BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY) """


def _refresh_actuals(client, as_of: date, window_days: int = 10) -> None:
    """Rebuild the last `window_days` days of actuals_daily from `actuals`.

    Deletes the rows of ``actuals_daily`` whose ``fecha`` lies in
    [as_of - window_days, as_of - 1] and re-inserts that same window from
    ``sales_forecast.actuals``. The delete and the insert are two separate
    queries, so a failure between them leaves the window empty until the next
    successful refresh.

    Args:
        client: BigQuery client as returned by ``bq_auth``.
        as_of: run date; the window ends the day before it.
        window_days: size of the rolling window, in days.
    """
    params = [
        {"name": "as_of", "type": "DATE", "value": as_of},
        {"name": "w", "type": "INT64", "value": window_days},
    ]
    bq_query(client, ACTUALS_DELETE_SQL, params=params)
    bq_query(client, ACTUALS_INSERT_SQL, params=params)


def _as_date(value) -> date:
    """Coerce a ``date``, ``datetime`` or 'YYYY-MM-DD...' string into a ``date``.

    Anything that is not a date/datetime is stringified and its first 10
    characters are parsed as ``%Y-%m-%d``; a malformed value raises ValueError.
    """
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    return datetime.strptime(str(value)[:10], "%Y-%m-%d").date()


def _index_history(columns, rows, hist_as_of: date):
    """Turn the raw history query result into model input plus series metadata.

    Args:
        columns: column names of the result, in row order.
        rows: result rows, indexable by column position.
        hist_as_of: last usable history day (inclusive).

    Returns:
        Tuple ``(history, last_meta, active)``:
        history is a list of ``{"series_id", "date", "value"}`` dicts restricted
        to active series; last_meta maps series_id to
        ``(max_date, center_name, ambito, center_id, subcat)`` taken from the
        most recent row of the series; active is the set of series ids whose
        summed sales over the 8 weeks ending at ``hist_as_of`` are > 0.
    """
    cols = {name: i for i, name in enumerate(columns)}
    active_cutoff = hist_as_of - timedelta(weeks=8)

    history = []
    last_meta = {}
    recent_sum = {}
    for row in rows:
        fecha = _as_date(row[cols["fecha"]])
        center_id = str(row[cols["idCentro"]])
        subcat = row[cols["subcat_cqq"]]
        center_name = row[cols["center_name"]]
        ambito = row[cols["ambito"]]
        venta = float(row[cols["venta"]] or 0.0)
        series_id = f"{center_id}|{subcat}"

        history.append(
            {"series_id": series_id, "date": fecha.isoformat(), "value": venta}
        )

        # Keep the descriptive fields of the most recent row of each series.
        prev = last_meta.get(series_id)
        if prev is None or fecha > prev[0]:
            last_meta[series_id] = (fecha, center_name, ambito, center_id, subcat)

        if fecha > active_cutoff:
            recent_sum[series_id] = recent_sum.get(series_id, 0.0) + venta

    active = {sid for sid, total in recent_sum.items() if total > 0.0}
    history = [h for h in history if h["series_id"] in active]
    return history, last_meta, active


def _build_rows(preds, last_meta, run_d: date, model: str, author: str) -> list:
    """Build the rows of the predictions table from the model output.

    Raises:
        LookupError: if a prediction refers to a series_id that is absent from
            ``last_meta`` (the original code failed here with an opaque
            "cannot unpack NoneType" TypeError).
    """
    run_ts = datetime.now(timezone.utc).isoformat()
    run_date = run_d.isoformat()
    rows_out = []
    for p in preds:
        sid = p["series_id"]
        meta = last_meta.get(sid)
        if meta is None:
            raise LookupError(f"prediccion para series_id desconocido: {sid!r}")
        _, center_name, ambito, center_id, subcat = meta
        forecast_date = _as_date(p["date"])
        rows_out.append(
            {
                "run_ts": run_ts,
                "run_date": run_date,
                "forecast_date": forecast_date.isoformat(),
                "lag_days": (forecast_date - run_d).days,
                "center_id": center_id,
                "center_name": center_name,
                "ambito": ambito,
                "subcat_cgq": subcat,
                "model": model,
                "author": author,
                "y_pred": round(float(p["y_pred"]), 4),
            }
        )
    return rows_out


def _load_rows(client, rows_out) -> dict:
    """Write the rows to a temporary JSONL file and append it to the table.

    The temporary file is always removed, whether or not the load succeeds.
    Returns whatever ``bq_load_from_file`` returns.
    """
    tmp_path = None
    try:
        fd, tmp_path = tempfile.mkstemp(prefix="sales_forecast_", suffix=".jsonl")
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.writelines(
                json.dumps(r, ensure_ascii=False) + "\n" for r in rows_out
            )
        return bq_load_from_file(
            client,
            tmp_path,
            DEST_DATASET,
            DEST_TABLE,
            source_format="NEWLINE_DELIMITED_JSON",
            write_disposition="WRITE_APPEND",
            autodetect=False,
        )
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)


def run_sales_forecast(
    as_of: str = "",
    horizon: int = 7,
    model: str = "baseline_v1",
    author: str = "egutierrez",
    dry_run: bool = False,
) -> dict:
    """Generate the daily sales forecast and write it to BigQuery.

    Args:
        as_of: cut-off date 'YYYY-MM-DD' (the day of the run). Empty = today.
            Usable history ends at as_of - 1 day; the forecast covers
            as_of + 1 .. as_of + horizon. run_date = as_of.
        horizon: number of future days to predict; must be >= 1. Default 7.
        model: model label written on every row (column ``model``).
        author: author of the run (column ``author``).
        dry_run: if True nothing is written to BigQuery; the summary plus a
            sample of up to 5 rows is returned.

    Returns:
        dict, never raises (dict-no-throw).
        On success ``{status:'ok', run_date, series, rows, model, author}``,
        plus ``sample`` on dry runs, plus ``rows_loaded``, ``job_id`` and
        ``actuals_refreshed`` (and ``actuals_error`` when the refresh failed)
        on real runs.
        On error ``{status:'error', error, stage}``, where ``stage`` is the
        pipeline step that failed: 'validate', 'auth', 'history', 'forecast',
        'build_rows', 'delete' or 'load'.
    """
    stage = "validate"
    try:
        # A non-positive horizon would produce no forecast dates but, on a real
        # run, would still delete the previous predictions of this
        # (run_date, model, author). Reject it before touching BigQuery.
        if horizon < 1:
            return {
                "status": "error",
                "error": f"horizon debe ser >= 1 (recibido {horizon})",
                "stage": stage,
            }

        run_d = _as_date(as_of) if as_of else date.today()
        # Last usable history day (inclusive): as_of - 1 day.
        hist_as_of = run_d - timedelta(days=1)
        horizon_dates = [
            (run_d + timedelta(days=k)).isoformat() for k in range(1, horizon + 1)
        ]

        # 1) BigQuery client without quota project (avoids 403 USER_PROJECT_DENIED).
        stage = "auth"
        client = bq_auth(PROJECT_ID, drop_quota_project=True)

        # 2) Aggregated mart history (bounded to run_d - 1 by the query's WHERE).
        stage = "history"
        q = bq_query(
            client,
            HISTORY_SQL,
            params=[{"name": "as_of", "type": "DATE", "value": run_d}],
        )

        # 3) Active series only: sales > 0 over the last 8 weeks.
        history, last_meta, active = _index_history(
            q["columns"], q["rows"], hist_as_of
        )
        if not history:
            result = {
                "status": "ok",
                "run_date": run_d.isoformat(),
                "series": 0,
                "rows": 0,
                "model": model,
                "author": author,
            }
            if dry_run:
                result["sample"] = []
            return result

        # 4) Pure model: all predictions in a single call.
        stage = "forecast"
        preds = forecast_seasonal_median(
            history, horizon_dates, as_of=hist_as_of.isoformat()
        )

        # 5) Rows for the predictions table.
        stage = "build_rows"
        rows_out = _build_rows(preds, last_meta, run_d, model, author)

        summary = {
            "status": "ok",
            "run_date": run_d.isoformat(),
            "series": len(active),
            "rows": len(rows_out),
            "model": model,
            "author": author,
        }

        # 6) Dry run: write nothing; return the summary plus a sample.
        if dry_run:
            summary["sample"] = rows_out[:5]
            return summary

        # 7) Idempotency: drop previous predictions of (run_date, model, author).
        stage = "delete"
        bq_query(
            client,
            DELETE_SQL,
            params=[
                {"name": "d", "type": "DATE", "value": run_d},
                {"name": "m", "type": "STRING", "value": model},
                {"name": "a", "type": "STRING", "value": author},
            ],
        )

        # 8) JSONL load into the table (WRITE_APPEND, fixed table schema).
        stage = "load"
        load = _load_rows(client, rows_out)
        if load.get("status") != "DONE":
            return {
                "status": "error",
                "error": f"load job no termino DONE: {load}",
                "stage": "load",
            }
        summary["rows_loaded"] = load.get("rows_loaded")
        summary["job_id"] = load.get("job_id")

        # 9) Refresh the physical actuals table (rolling 10-day window) so that
        #    forecast_eval and the competition dashboard compare against the
        #    latest state of the mart.
        try:
            _refresh_actuals(client, run_d)
            summary["actuals_refreshed"] = True
        except Exception as e:  # noqa: BLE001
            # Does not invalidate the predictions already loaded: report and go on.
            summary["actuals_refreshed"] = False
            summary["actuals_error"] = str(e)

        return summary
    except Exception as e:  # noqa: BLE001
        # dict-no-throw boundary: report the step that was running.
        return {"status": "error", "error": str(e), "stage": stage}


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Forecast diario de ventas Aurgi -> BigQuery sales_forecast.predictions."
    )
    parser.add_argument("--as-of", default="", help="Fecha de corte YYYY-MM-DD (vacio = hoy).")
    parser.add_argument("--horizon", type=int, default=7, help="Dias a predecir. Default 7.")
    parser.add_argument("--model", default="baseline_v1", help="Etiqueta del modelo.")
    parser.add_argument("--author", default="egutierrez", help="Autor de la corrida.")
    parser.add_argument(
        "--dry-run", action="store_true", help="No escribe en BigQuery; imprime muestra."
    )
    args = parser.parse_args()

    out = run_sales_forecast(
        as_of=args.as_of,
        horizon=args.horizon,
        model=args.model,
        author=args.author,
        dry_run=args.dry_run,
    )
    print(json.dumps(out, ensure_ascii=False, default=str))
    # Non-zero exit code so the scheduler can detect a failed run.
    sys.exit(0 if out.get("status") == "ok" else 1)