# fn_registry/python/functions/pipelines/profile_bq_table.py

"""profile_bq_table — EDA one-shot de una tabla/vista BigQuery con el grupo `eda`.

Pipeline impuro: materializa una tabla o vista de BigQuery (por defecto COMPLETA —
todas las filas — o una muestra si se pasa `sample_frac`, con seudonimizacion PII
opcional, LOPDGDD/RGPD) a un DuckDB local con `load_bq_table_to_duckdb`, y la
perfila end-to-end con `profile_table` del grupo de capacidad `eda`, emitiendo el
informe AutomaticEDA (PDF A5 movil + PPTX 16:9), Markdown y JSON sidecar. Es el
adaptador BigQuery que faltaba en el grupo `eda`, resuelto por composicion
(BigQuery -> DuckDB local -> profile_table) sin duplicar la logica de perfilado ni
de render.

Modo por defecto = FULL: `sample_frac=None` perfila TODAS las filas del origen
(preferencia estandar del usuario: los EDA se corren sobre el total salvo que se
pida lo contrario). El muestreo es opt-in explicito: `sample_frac=0.05` perfila
~5 % de las filas; `max_rows` es un tope duro opcional (0 = sin tope).

Funciones del registry compuestas (NO se reimplementa su logica):
  - load_bq_table_to_duckdb : trae la tabla/vista BigQuery a un DuckDB local
                              (completa por defecto, o muestra si sample_frac).
  - profile_table           : orquestador one-shot del grupo `eda` que perfila la
                              DuckDB materializada y emite el informe AutomaticEDA.

Estilo dict-no-throw del grupo `eda`: nunca lanza; devuelve {status:'error', ...}.
"""

import os
import tempfile

from datascience import load_bq_table_to_duckdb
from pipelines.profile_table import profile_table


def profile_bq_table(
    table_fqn: str,
    sample_frac: float = None,
    max_rows: int = 0,
    pseudonymize_cols: list = None,
    run_models: bool = True,
    run_series: bool = False,
    run_llm: bool = False,
    project_id: str = "",
    report_dir: str = "reports",
    duckdb_path: str = "",
    keep_duckdb: bool = False,
) -> dict:
    """Run a one-shot EDA over a BigQuery table or view.

    The source is first materialized into a local DuckDB file with
    `load_bq_table_to_duckdb` (all rows by default, optionally sampled and/or
    with PII columns pseudonymized) and then profiled with `profile_table`
    from the `eda` group, which emits the AutomaticEDA report
    (mobile A5 PDF + 16:9 PPTX) plus the Markdown report and its JSON sidecar.

    Args:
        table_fqn: Fully qualified BigQuery table/view name
            ("project.dataset.table").
        sample_frac: None (default) = FULL mode, every row is profiled. A
            float in (0, 1) opts into sampling (`WHERE rand() < frac`,
            roughly that fraction of the rows).
        max_rows: Optional hard row cap (LIMIT). 0 (default) = no cap.
        pseudonymize_cols: PII columns to pseudonymize (hash) before the data
            is materialized.
        run_models: Run the cheap models (PCA/KMeans/IsolationForest/normality).
        run_series: Run the time-series analysis per numeric column.
        run_llm: Make one LLM call over the aggregated profile (never raw rows).
        project_id: GCP billing project. Empty = first segment of the FQN.
        report_dir: Output directory for the reports.
        duckdb_path: DuckDB file to materialize into. Empty = a self-managed
            temporary file.
        keep_duckdb: When True, the self-managed temporary DuckDB file is kept
            instead of being deleted on exit.

    Returns:
        A dict and never an exception (dict-no-throw style of the `eda` group).
        On success: `status == "ok"`, `table_fqn`, a `load` summary, the report
        paths taken from `profile_table`, its `profile`, and `duckdb_path`,
        which is the path of the DuckDB file when that file still exists after
        the call (caller-supplied path, or `keep_duckdb=True`) and None when
        the temporary file was removed.
        On failure: `status == "error"`, an `error` message and the `stage`
        that failed ("setup", "load" or "profile").
    """
    tmp_created = False
    # Tracks the step in progress so unexpected exceptions can be attributed
    # to a stage, like the explicit failure returns below.
    stage = "setup"
    try:
        # Use a temporary DuckDB file when the caller gives no path.
        if not duckdb_path:
            fd, duckdb_path = tempfile.mkstemp(prefix="eda_bq_", suffix=".duckdb")
            # Flag it right away so `finally` cleans up even if the two calls
            # below fail.
            tmp_created = True
            os.close(fd)
            os.remove(duckdb_path)  # let DuckDB create the file from scratch

        stage = "load"
        load = load_bq_table_to_duckdb(
            table_fqn,
            duckdb_path,
            sample_frac=sample_frac,
            max_rows=max_rows,
            project_id=project_id,
            pseudonymize_cols=pseudonymize_cols,
        )
        if load.get("status") != "ok":
            return {
                "status": "error",
                "error": load.get("error", "load fallo"),
                "stage": "load",
            }

        # Without the table name there is nothing to profile; report that
        # explicitly instead of surfacing a bare KeyError message.
        table_name = load.get("table")
        if not table_name:
            return {
                "status": "error",
                "error": "load no devolvio 'table'",
                "stage": "load",
            }

        stage = "profile"
        prof = profile_table(
            duckdb_path,
            table_name,
            backend="duckdb",
            run_models=run_models,
            run_series=run_series,
            run_llm=run_llm,
            emit_automatic=True,   # mobile A5 PDF + 16:9 PPTX
            emit_pdf=False,
            write_report=True,     # Markdown + JSON sidecar
            report_dir=report_dir,
        )
        if prof.get("status") != "ok":
            return {
                "status": "error",
                "error": prof.get("error", "profile fallo"),
                "stage": "profile",
                "load": load,
            }

        # Only a self-managed temp file is ever deleted (see `finally`), so a
        # caller-supplied database is still on disk and its path is reported.
        db_persists = keep_duckdb or not tmp_created
        load_summary_keys = (
            "n_rows_source",
            "n_rows_fetched",
            "sampled",
            "sample_frac",
            "pseudonymized",
            "table",
        )
        return {
            "status": "ok",
            "table_fqn": table_fqn,
            "load": {k: load[k] for k in load_summary_keys if k in load},
            "duckdb_path": duckdb_path if db_persists else None,
            "report_md_path": prof.get("report_md_path"),
            "report_json_path": prof.get("report_json_path"),
            "aeda_pdf_path": prof.get("aeda_pdf_path"),
            "aeda_pptx_path": prof.get("aeda_pptx_path"),
            "aeda_manifest_path": prof.get("aeda_manifest_path"),
            "profile": prof.get("profile"),
        }
    except Exception as e:  # noqa: BLE001 - dict-no-throw boundary
        # Some exceptions carry no message; fall back to the type name so the
        # error field is never empty.
        return {
            "status": "error",
            "error": str(e) or type(e).__name__,
            "stage": stage,
        }
    finally:
        # Best-effort removal of the temporary DuckDB file unless the caller
        # asked to keep it. The `.wal` sidecar is DuckDB's write-ahead log,
        # which can be left behind if the database was not closed cleanly.
        if tmp_created and not keep_duckdb and duckdb_path:
            for leftover in (duckdb_path, duckdb_path + ".wal"):
                try:
                    os.remove(leftover)
                except OSError:
                    pass