fn_registry/python/functions/pipelines/run_sales_forecast.py

"""run_sales_forecast — forecast diario de ventas Aurgi a BigQuery (one-shot).

Pipeline IMPURO que produce el forecast diario de ventas (dia x centro x
subcategoria CGQ) y lo escribe en `autingo-159109.sales_forecast.predictions`.
Compone funciones del registry sin reimplementar su logica:

  - bq_auth(..., drop_quota_project=True): cliente BigQuery sin quota project ajeno
    (evita el 403 USER_PROJECT_DENIED del ADC del usuario).
  - bq_query: lee la historia agregada del mart `bi_ventas_mart.base_margenes_aa`
    y ejecuta el DELETE de idempotencia (parametros tipados).
  - forecast_seasonal_median: modelo PURO (mediana estacional + tendencia acotada)
    que genera todas las predicciones de golpe.
  - bq_load_from_file: carga las filas (JSONL) a la tabla de predicciones.

Cron previsto: 21:00. Por eso la historia utilizable llega hasta as_of - 1 dia
(el dia as_of aun esta parcial) y se predice as_of + 1 .. as_of + horizon.

Estilo dict-no-throw: nunca lanza; captura errores y devuelve
{status:'error', error, stage}. Idempotente por (run_date, model, author):
borra las predicciones previas de esa combinacion antes de cargar.
"""

import json
import os
import sys
import tempfile
from datetime import date, datetime, timedelta, timezone

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from bigquery import bq_auth, bq_query, bq_load_from_file
from datascience import forecast_seasonal_median

# BigQuery project that hosts both the source mart and the forecast dataset.
PROJECT_ID = "autingo-159109"
# Fully qualified source table read by HISTORY_SQL.
SOURCE_TABLE = "autingo-159109.bi_ventas_mart.base_margenes_aa"
# Destination dataset/table for the forecast rows.
DEST_DATASET = "sales_forecast"
DEST_TABLE = "predictions"

# Daily history per (fecha, idCentro, subcat_cqq): sums venta_n from 18 weeks
# before @as_of up to @as_of - 1 day, dropping rows with NULL keys, NULL
# venta_n or |venta_n| >= 1e9. center_name/ambito are taken with ANY_VALUE.
# NOTE(review): the source column is spelled `subcat_cqq` here while the
# destination tables use `subcat_cgq` — confirm this matches the mart schema.
HISTORY_SQL = f"""
SELECT fecha, idCentro, subcat_cqq,
       ANY_VALUE(NombreCentro) AS center_name, ANY_VALUE(Ambito) AS ambito,
       SUM(CAST(venta_n AS FLOAT64)) AS venta
FROM `{SOURCE_TABLE}`
WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL 18 WEEK) AND DATE_SUB(@as_of, INTERVAL 1 DAY)
  AND venta_n IS NOT NULL AND ABS(CAST(venta_n AS FLOAT64)) < 1e9
  AND subcat_cqq IS NOT NULL AND idCentro IS NOT NULL
GROUP BY fecha, idCentro, subcat_cqq
"""

# Idempotency delete: removes the predictions previously written for the same
# (run_date, model, author) combination. Parameters: @d, @m, @a.
DELETE_SQL = (
    f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.{DEST_TABLE}` "
    "WHERE run_date = @d AND model = @m AND author = @a"
)

# Refresh of the physical table of actuals (sales_forecast.actuals_daily), which
# is consumed by the forecast_eval view and by the competition dashboards. A
# moving window is rewritten so retroactive corrections in the mart are picked up.
# Parameters for both statements: @as_of (DATE) and @w (window length in days).
ACTUALS_DELETE_SQL = (
    f"DELETE FROM `{PROJECT_ID}.{DEST_DATASET}.actuals_daily` "
    "WHERE fecha BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)"
)

# Re-inserts the same window from `actuals`, replacing NULL center_id /
# subcat_cgq with placeholder labels and stamping loaded_ts with the current time.
ACTUALS_INSERT_SQL = f"""
INSERT INTO `{PROJECT_ID}.{DEST_DATASET}.actuals_daily`
  (fecha, center_id, center_name, ambito, subcat_cgq, y_real, unidades, loaded_ts)
SELECT forecast_date, IFNULL(center_id, 'SIN_CENTRO'), center_name, ambito,
       IFNULL(subcat_cgq, 'Sin subcategoria'),
       y_real, unidades, CURRENT_TIMESTAMP()
FROM `{PROJECT_ID}.{DEST_DATASET}.actuals`
WHERE forecast_date BETWEEN DATE_SUB(@as_of, INTERVAL @w DAY) AND DATE_SUB(@as_of, INTERVAL 1 DAY)
"""


def _refresh_actuals(client, as_of: date, window_days: int = 10) -> None:
    """Rebuild the last `window_days` days of actuals_daily from `actuals`.

    Runs ACTUALS_DELETE_SQL followed by ACTUALS_INSERT_SQL, both with the same
    typed parameters: @as_of (DATE) and @w (INT64, the window length in days).

    Args:
        client: BigQuery client handed to bq_query.
        as_of: reference date; the SQL window ends at as_of - 1 day.
        window_days: number of days covered by the window. Default 10.
    """
    window_params = [
        {"name": name, "type": bq_type, "value": value}
        for name, bq_type, value in (
            ("as_of", "DATE", as_of),
            ("w", "INT64", window_days),
        )
    ]
    # Delete first, then re-insert the same window.
    for statement in (ACTUALS_DELETE_SQL, ACTUALS_INSERT_SQL):
        bq_query(client, statement, params=window_params)


def _as_date(value) -> date:
    if isinstance(value, date) and not isinstance(value, datetime):
        return value
    if isinstance(value, datetime):
        return value.date()
    return datetime.strptime(str(value)[:10], "%Y-%m-%d").date()


def run_sales_forecast(
    as_of: str = "",
    horizon: int = 7,
    model: str = "baseline_v1",
    author: str = "egutierrez",
    dry_run: bool = False,
) -> dict:
    """Generate the daily sales forecast and write it to BigQuery.

    Reads the aggregated history with HISTORY_SQL, keeps the series whose
    accumulated sales after the 8-week activity cutoff are positive, runs
    forecast_seasonal_median over them and, unless `dry_run` is set, replaces
    the rows of (run_date, model, author) in the predictions table and then
    refreshes actuals_daily.

    Args:
        as_of: cutoff date 'YYYY-MM-DD' (the day of the run). Empty = today.
            Usable history ends at as_of - 1 day; the forecast covers
            as_of + 1 .. as_of + horizon. run_date = as_of.
        horizon: number of future days to predict; must be >= 1. Default 7.
        model: model label written on every row (column model). Default
            'baseline_v1'.
        author: author of the run (column author). Default 'egutierrez'.
        dry_run: if True nothing is written to BigQuery; the summary plus a
            sample of rows is returned.

    Returns:
        A dict; every Exception is captured instead of propagating. On success
        {status:'ok', run_date, series, rows, model, author} plus `sample` on a
        dry run, or `rows_loaded`, `job_id`, `actuals_refreshed` (and
        `actuals_error` if the refresh failed) on a real run. On failure
        {status:'error', error, stage}, where stage is 'validate' for a
        rejected horizon, 'load' when the load job does not report DONE, and
        'unexpected' for any captured exception.
    """
    try:
        # A horizon below 1 produces no forecast dates; on a real run the
        # DELETE further down would still execute and wipe earlier predictions
        # for nothing, so it is rejected before touching BigQuery.
        if horizon < 1:
            return {
                "status": "error",
                "error": f"horizon debe ser >= 1 (recibido: {horizon!r})",
                "stage": "validate",
            }

        run_d = _as_date(as_of) if as_of else date.today()
        # Last usable day of history (inclusive): as_of - 1 day.
        hist_as_of = run_d - timedelta(days=1)
        horizon_dates = [
            (run_d + timedelta(days=k)).isoformat() for k in range(1, horizon + 1)
        ]

        # 1) BigQuery client without a quota project (avoids 403 USER_PROJECT_DENIED).
        client = bq_auth(PROJECT_ID, drop_quota_project=True)

        # 2) Aggregated history from the mart (up to run_d - 1 via the query's WHERE).
        q = bq_query(
            client,
            HISTORY_SQL,
            params=[{"name": "as_of", "type": "DATE", "value": run_d}],
        )
        cols = {name: i for i, name in enumerate(q["columns"])}

        # Per-series history, latest known center_name/ambito, and recent sales.
        history = []
        last_meta = {}          # series_id -> (max_date, center_name, ambito, center_id, subcat)
        recent_sum = {}         # series_id -> sales accumulated after the 8-week cutoff
        active_cutoff = hist_as_of - timedelta(weeks=8)
        for row in q["rows"]:
            fecha = _as_date(row[cols["fecha"]])
            center_id = str(row[cols["idCentro"]])
            subcat = row[cols["subcat_cqq"]]
            center_name = row[cols["center_name"]]
            ambito = row[cols["ambito"]]
            venta = float(row[cols["venta"]] or 0.0)
            series_id = f"{center_id}|{subcat}"

            history.append(
                {"series_id": series_id, "date": fecha.isoformat(), "value": venta}
            )
            # Keep the metadata of the most recent row seen for each series.
            prev = last_meta.get(series_id)
            if prev is None or fecha > prev[0]:
                last_meta[series_id] = (fecha, center_name, ambito, center_id, subcat)
            if fecha > active_cutoff:
                recent_sum[series_id] = recent_sum.get(series_id, 0.0) + venta

        # 3) Active series: positive accumulated sales after the cutoff.
        active = {sid for sid, s in recent_sum.items() if s > 0.0}
        history = [h for h in history if h["series_id"] in active]

        if not history:
            # Nothing to forecast: report an empty but successful run.
            result = {
                "status": "ok",
                "run_date": run_d.isoformat(),
                "series": 0,
                "rows": 0,
                "model": model,
                "author": author,
            }
            if dry_run:
                result["sample"] = []
            return result

        # 4) Model call: all predictions in one go.
        preds = forecast_seasonal_median(
            history, horizon_dates, as_of=hist_as_of.isoformat()
        )

        # 5) Rows for the predictions table.
        run_ts = datetime.now(timezone.utc).isoformat()
        rows_out = []
        for p in preds:
            sid = p["series_id"]
            meta = last_meta.get(sid)
            if meta is None:
                # The model returned a series that was not in the history;
                # fail with a readable message instead of an unpacking error.
                raise ValueError(f"prediccion para serie desconocida: {sid!r}")
            _, center_name, ambito, center_id, subcat = meta
            forecast_date = _as_date(p["date"])
            rows_out.append(
                {
                    "run_ts": run_ts,
                    "run_date": run_d.isoformat(),
                    "forecast_date": forecast_date.isoformat(),
                    "lag_days": (forecast_date - run_d).days,
                    "center_id": center_id,
                    "center_name": center_name,
                    "ambito": ambito,
                    "subcat_cgq": subcat,
                    "model": model,
                    "author": author,
                    "y_pred": round(float(p["y_pred"]), 4),
                }
            )

        summary = {
            "status": "ok",
            "run_date": run_d.isoformat(),
            "series": len(active),
            "rows": len(rows_out),
            "model": model,
            "author": author,
        }

        # 6) Dry run: nothing is written; return the summary plus a sample.
        if dry_run:
            summary["sample"] = rows_out[:5]
            return summary

        # 7) Idempotency: delete earlier predictions of (run_date, model, author).
        bq_query(
            client,
            DELETE_SQL,
            params=[
                {"name": "d", "type": "DATE", "value": run_d},
                {"name": "m", "type": "STRING", "value": model},
                {"name": "a", "type": "STRING", "value": author},
            ],
        )

        # 8) Load the rows as JSONL (WRITE_APPEND, schema autodetection off).
        tmp_path = None
        try:
            fd, tmp_path = tempfile.mkstemp(prefix="sales_forecast_", suffix=".jsonl")
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                for r in rows_out:
                    fh.write(json.dumps(r, ensure_ascii=False) + "\n")
            load = bq_load_from_file(
                client,
                tmp_path,
                DEST_DATASET,
                DEST_TABLE,
                source_format="NEWLINE_DELIMITED_JSON",
                write_disposition="WRITE_APPEND",
                autodetect=False,
            )
        finally:
            # The temp file is removed whether or not the load succeeded.
            if tmp_path and os.path.exists(tmp_path):
                os.remove(tmp_path)

        if load.get("status") != "DONE":
            return {
                "status": "error",
                "error": f"load job no termino DONE: {load}",
                "stage": "load",
            }

        summary["rows_loaded"] = load.get("rows_loaded")
        summary["job_id"] = load.get("job_id")

        # 9) Refresh the physical table of actuals (default 10-day moving
        # window) so forecast_eval and the competition dashboard compare
        # against the latest state of the mart.
        try:
            _refresh_actuals(client, run_d)
            summary["actuals_refreshed"] = True
        except Exception as e:  # noqa: BLE001
            # Does not invalidate the predictions already loaded: report and go on.
            summary["actuals_refreshed"] = False
            summary["actuals_error"] = str(e)

        return summary
    except Exception as e:  # noqa: BLE001
        return {"status": "error", "error": str(e), "stage": "unexpected"}


if __name__ == "__main__":
    # CLI entry point: parse the flags, run the pipeline once, print the result
    # as a single JSON line and exit non-zero unless the run reported 'ok'.
    import argparse

    cli = argparse.ArgumentParser(
        description="Forecast diario de ventas Aurgi -> BigQuery sales_forecast.predictions."
    )
    cli.add_argument(
        "--as-of", default="", help="Fecha de corte YYYY-MM-DD (vacio = hoy)."
    )
    cli.add_argument(
        "--horizon", type=int, default=7, help="Dias a predecir. Default 7."
    )
    cli.add_argument("--model", default="baseline_v1", help="Etiqueta del modelo.")
    cli.add_argument("--author", default="egutierrez", help="Autor de la corrida.")
    cli.add_argument(
        "--dry-run", action="store_true", help="No escribe en BigQuery; imprime muestra."
    )

    # The argparse destinations (as_of, horizon, model, author, dry_run) match
    # the keyword parameters of run_sales_forecast one to one.
    options = vars(cli.parse_args())
    result = run_sales_forecast(**options)

    print(json.dumps(result, ensure_ascii=False, default=str))
    succeeded = result.get("status") == "ok"
    sys.exit(0 if succeeded else 1)