# fn_registry/python/functions/datascience/summarize_table_pg.py

"""summarize_table_pg — perfil base de una tabla PostgreSQL con SQL push-down.

Funcion impura: lee de un servidor PostgreSQL a traves de la primitiva read-only
del grupo `postgres`, `pg_query`. Es el adaptador PostgreSQL del corazon del grupo
de capacidad `eda` (exploratory data analysis), espejo de `summarize_table_duckdb`:
construye EXACTAMENTE el mismo esqueleto de TableProfile (mismas claves) usando
queries agregadas que hacen push-down en el motor de PostgreSQL y NO traen filas a
RAM (count, count(DISTINCT), min/max/avg/stddev, percentile_cont).

Lo que NO calcula aqui (a proposito, para ser barata): skew, kurtosis, histograma,
percentiles finos (p1/p5/p95/p99), moda, outliers, correlaciones, key_candidates,
quality_score ni el semantic_type. Esas claves quedan en None / [] para que las
rellenen luego otras funciones del grupo `eda` sobre una muestra. El contrato de
claves (TableProfile / ColumnProfile) es compartido por todo el grupo `eda` y es
identico al de `summarize_table_duckdb`, de modo que `profile_table` y el resto del
grupo consumen el resultado igual con fuente PostgreSQL.

Estilo dict-no-throw del grupo: nunca lanza; captura cualquier error y devuelve
{status:'error', error:str}.
"""

import re
from datetime import datetime, timezone

from infra import pg_query

# Valid SQL identifier. PostgreSQL does not accept positional parameters for
# table/column names in the body of a SELECT, so identifiers must be validated
# and then interpolated wrapped in double quotes.
# The pattern is anchored with \Z rather than $: "$" also matches just before
# a trailing newline, which would let a name such as "users\n" pass validation.
_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*\Z")

# Row-count threshold up to which an EXACT COUNT(DISTINCT) is computed. Above
# it no DISTINCT is issued and the distinct count is reported as a capped upper
# bound instead (no HLL estimate: per the original note PostgreSQL does not
# ship one without an extension). Documented in the accompanying .md.
_EXACT_DISTINCT_MAX_ROWS = 200_000

# PostgreSQL types (information_schema data_type) that map to "numeric".
_NUMERIC_TYPES = {
    "smallint", "integer", "bigint",
    "decimal", "numeric", "real", "double precision",
    "smallserial", "serial", "bigserial",
}
# PostgreSQL types that map to "datetime".
_DATETIME_TYPES = {
    "date", "time", "timestamp",
    "timestamp without time zone", "timestamp with time zone",
    "time without time zone", "time with time zone",
}
# Textual PostgreSQL types (candidates for categorical/text).
_TEXT_TYPES = {
    "text", "character varying", "varchar", "character", "char", "bpchar",
}

# Keys of the "numeric" sub-dict. The summarizer only fills a few of them; the
# rest stay None until a sampling-based function completes them.
_NUMERIC_SUB_KEYS = (
    "min", "max", "mean", "median", "mode", "std", "variance", "cv",
    "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
    "skew", "kurtosis", "n_outliers", "outlier_pct", "zero_pct",
    "negative_pct", "distribution_type", "histogram",
)


def _base_data_type(data_type: str) -> str:
    """Return *data_type* lower-cased with surrounding whitespace removed.

    A falsy input (None, empty string) yields the empty string. The original
    note states that information_schema.columns.data_type already arrives
    without type parameters (e.g. "numeric" rather than "numeric(10,2)"), so
    only case and padding are normalized here.
    """
    if not data_type:
        return ""
    trimmed = data_type.strip()
    return trimmed.lower()


def _infer_type(data_type: str, distinct_count, n_rows: int) -> str:
    """Translate a PostgreSQL data_type into the eda contract's inferred_type.

    Numeric, datetime and boolean types map directly. Textual types are split
    by cardinality: "categorical" when the table has no rows, when
    distinct_count <= 50, or when distinct_count / n_rows < 0.5; otherwise
    "text". A distinct_count of None is treated as 0. Any other type (json,
    jsonb, uuid, arrays, bytea, ...) is reported as "text".
    """
    kind = _base_data_type(data_type)

    # Types whose inferred_type follows directly from the physical type.
    direct_mappings = (
        (_NUMERIC_TYPES, "numeric"),
        (_DATETIME_TYPES, "datetime"),
        (("boolean", "bool"), "boolean"),
    )
    for members, label in direct_mappings:
        if kind in members:
            return label

    # Complex / unknown types are treated as free text.
    if kind not in _TEXT_TYPES:
        return "text"

    # Textual column: decide between categorical and text by cardinality.
    if n_rows <= 0:
        return "categorical"
    cardinality = 0 if distinct_count is None else distinct_count
    is_low_cardinality = cardinality <= 50 or (cardinality / n_rows) < 0.5
    return "categorical" if is_low_cardinality else "text"


def _to_float(value):
    """Convert an aggregate value returned by PostgreSQL to float, or None.

    Args:
        value: the raw aggregate (number, numeric string, Decimal, or None).
            Per the original note, pg_query is expected to have normalized
            Decimal to float already; this helper is the defensive fallback.

    Returns:
        float(value) when the conversion succeeds; None when value is None or
        cannot be converted.
    """
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large to be represented as a float. It is
        # just another "not convertible" case and must not escape.
        return None


def _to_int(value):
    """Defensively convert a count-like value to int, defaulting to 0.

    Args:
        value: the raw value (count(*) / count(col) normally arrive as int).

    Returns:
        int(value) when the conversion succeeds; 0 when value is None or
        cannot be converted.
    """
    if value is None:
        return 0
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # int(float("nan")) raises ValueError, but int(float("inf")) raises
        # OverflowError; both are treated as "no usable count".
        return 0


def summarize_table_pg(
    dsn: str,
    table: str,
    schema: str = "public",
    high_card_ratio: float = 0.9,
) -> dict:
    """Profile a PostgreSQL table using aggregate SQL only (no rows fetched).

    Issues one metadata query against information_schema, one count(*) and one
    aggregate query per column, then assembles a TableProfile dict. The key
    set is intended to match the one produced by summarize_table_duckdb so the
    rest of the `eda` group can consume either source. Follows the group's
    dict-no-throw style: every exception is caught and reported in the result.

    Args:
        dsn: PostgreSQL connection string, e.g.
            "postgresql://user:pass@localhost:5432/mydb". It is passed through
            to pg_query; a failing query yields {status:'error', ...}.
        table: name of the table to profile. Must fully match
            [A-Za-z_][A-Za-z0-9_]* and is double-quoted into the SQL, because
            identifiers cannot be bound as parameters.
        schema: schema holding the table (default "public"). Validated and
            quoted the same way.
        high_card_ratio: unique_pct threshold from which a categorical column
            receives the "high_cardinality" flag. Default 0.9.

    Returns:
        On success {status:'ok', profile: <TableProfile>}; on any failure
        {status:'error', error:str}. Never raises.
    """
    try:
        # fullmatch (not match) so that a trailing newline is rejected: with
        # match, a "$"-anchored pattern accepts "name\n".
        for label, ident in (("tabla", table), ("schema", schema)):
            if not _IDENT_RE.fullmatch(ident or ""):
                return {
                    "status": "error",
                    "error": (
                        f"nombre de {label} invalido: {ident!r} "
                        "(debe casar con ^[A-Za-z_][A-Za-z0-9_]*$)"
                    ),
                }

        qtable = f'"{schema}"."{table}"'

        # 1) Columns + types from information_schema (bound parameters).
        cols_res = pg_query(
            dsn,
            "SELECT column_name, data_type FROM information_schema.columns "
            "WHERE table_schema = %s AND table_name = %s "
            "ORDER BY ordinal_position",
            params=[schema, table],
        )
        if cols_res["status"] != "ok":
            return {"status": "error", "error": cols_res["error"]}
        col_rows = cols_res["rows"]
        if not col_rows:
            return {
                "status": "error",
                "error": (
                    f"tabla no encontrada o sin columnas: {schema}.{table}"
                ),
            }
        col_meta = [
            (r.get("column_name"), r.get("data_type")) for r in col_rows
        ]

        # 2) Total number of rows.
        count_res = pg_query(dsn, f"SELECT count(*) AS n FROM {qtable}")
        if count_res["status"] != "ok":
            return {"status": "error", "error": count_res["error"]}
        n_rows = _to_int(count_res["rows"][0]["n"]) if count_res["rows"] else 0

        # 3) One push-down aggregate query per column: non-null count, exact
        # count(DISTINCT) when the table is small enough and, for numeric
        # columns, min/max/avg/stddev_samp/quartiles.
        exact_distinct_ok = 0 < n_rows <= _EXACT_DISTINCT_MAX_ROWS
        columns = []
        for name, data_type in col_meta:
            if not _IDENT_RE.fullmatch(name or ""):
                # Column whose name does not fit the safe identifier pattern:
                # it is not interpolated into SQL, so it is profiled without
                # running any aggregate (non_null is assumed equal to n_rows).
                columns.append(
                    _build_column_profile(
                        name, data_type, n_rows, high_card_ratio,
                        non_null=n_rows, distinct=None, agg=None,
                    )
                )
                continue

            qcol = f'"{name}"'
            is_numeric = _base_data_type(data_type) in _NUMERIC_TYPES

            count_parts = [f"count({qcol}) AS non_null"]
            distinct_parts = [f"count(DISTINCT {qcol}) AS distinct_n"]
            numeric_parts = []
            if is_numeric:
                numeric_parts = [
                    f"min({qcol}) AS mn",
                    f"max({qcol}) AS mx",
                    f"avg({qcol}) AS av",
                    f"stddev_samp({qcol}) AS sd",
                    f"percentile_cont(0.25) WITHIN GROUP (ORDER BY {qcol}) AS p25",
                    f"percentile_cont(0.5)  WITHIN GROUP (ORDER BY {qcol}) AS p50",
                    f"percentile_cont(0.75) WITHIN GROUP (ORDER BY {qcol}) AS p75",
                ]

            has_distinct = exact_distinct_ok
            select_parts = (
                count_parts
                + (distinct_parts if has_distinct else [])
                + numeric_parts
            )
            agg_res = pg_query(
                dsn, f"SELECT {', '.join(select_parts)} FROM {qtable}"
            )
            if agg_res["status"] != "ok" and has_distinct:
                # count(DISTINCT ...) is rejected for types that have no
                # equality operator (e.g. json). Retry once without it so a
                # single such column does not abort the whole profile; the
                # column then takes the capped-distinct path used for large
                # tables. If the retry fails too, the first error is reported.
                retry_res = pg_query(
                    dsn,
                    f"SELECT {', '.join(count_parts + numeric_parts)} "
                    f"FROM {qtable}",
                )
                if retry_res["status"] == "ok":
                    agg_res = retry_res
                    has_distinct = False
            if agg_res["status"] != "ok":
                return {"status": "error", "error": agg_res["error"]}
            agg = agg_res["rows"][0] if agg_res["rows"] else {}

            non_null = _to_int(agg.get("non_null"))
            distinct = _to_int(agg.get("distinct_n")) if has_distinct else None

            columns.append(
                _build_column_profile(
                    name, data_type, n_rows, high_card_ratio,
                    non_null=non_null, distinct=distinct,
                    agg=agg if is_numeric else None,
                )
            )

        # Table-level roll-ups derived from the per-column profiles.
        type_breakdown = {
            "numeric": 0,
            "categorical": 0,
            "datetime": 0,
            "text": 0,
            "boolean": 0,
        }
        for col in columns:
            it = col["inferred_type"]
            if it in type_breakdown:
                type_breakdown[it] += 1

        constant_cols = [c["name"] for c in columns if "constant" in c["flags"]]
        all_null_cols = [c["name"] for c in columns if c["null_pct"] == 1.0]
        # Every column shares the same n_rows, so the mean of the per-column
        # null ratios equals the ratio of null cells over all cells.
        null_cell_pct = (
            sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
        )

        profile = {
            "table": table,
            "source": "postgres",
            "profiled_at": datetime.now(timezone.utc).isoformat(),
            "n_rows": n_rows,
            "n_cols": len(columns),
            "size_bytes": None,
            "duplicate_rows": None,
            "duplicate_pct": None,
            "constant_cols": constant_cols,
            "all_null_cols": all_null_cols,
            "null_cell_pct": null_cell_pct,
            "type_breakdown": type_breakdown,
            "columns": columns,
            "correlations": None,
            "key_candidates": [],
            "quality_score": None,
            "llm": None,
            "models": None,
        }
        return {"status": "ok", "profile": profile}
    except Exception as e:  # noqa: BLE001
        # dict-no-throw boundary: report instead of raising.
        return {"status": "error", "error": str(e)}


def _build_column_profile(
    name: str,
    data_type: str,
    n_rows: int,
    high_card_ratio: float,
    non_null: int,
    distinct,
    agg: dict = None,
) -> dict:
    """Assemble one ColumnProfile dict from the PostgreSQL aggregates.

    Args:
        name: column name as reported by information_schema.
        data_type: column data_type as reported by information_schema.
        n_rows: total number of rows in the table.
        high_card_ratio: unique_pct threshold for the "high_cardinality" flag
            on categorical columns.
        non_null: count(col), i.e. the number of non-null values.
        distinct: exact count(DISTINCT col), or None when it was not computed;
            in that case non_null is used as an upper bound.
        agg: aggregate row with the aliases mn/mx/av/sd/p25/p50/p75 for
            numeric columns, or None.

    Returns:
        The ColumnProfile dict. Its shape is intended to match the one built
        by the DuckDB adapter (per the original note; not verifiable here).
    """
    has_rows = n_rows > 0

    # Null accounting; clamped at zero in case non_null exceeds n_rows (the
    # two figures come from separate queries).
    null_count = max(n_rows - non_null, 0) if has_rows else 0
    null_pct = (null_count / n_rows) if has_rows else 0.0

    # Exact distinct when available, otherwise the non-null count as a
    # conservative upper bound; either way never more than n_rows.
    distinct_bound = non_null if distinct is None else distinct
    distinct_count = min(distinct_bound, n_rows) if has_rows else distinct_bound

    inferred_type = _infer_type(data_type, distinct_count, n_rows)
    unique_pct = min(distinct_count / n_rows, 1.0) if has_rows else 0.0

    numeric = None
    if inferred_type == "numeric":
        # Start with every contract key set to None, then fill the few that
        # the aggregate query provides.
        numeric = dict.fromkeys(_NUMERIC_SUB_KEYS)
        if agg is not None:
            alias_by_key = (
                ("min", "mn"),
                ("max", "mx"),
                ("mean", "av"),
                ("std", "sd"),
                ("p25", "p25"),
                ("p50", "p50"),
                ("p75", "p75"),
            )
            for key, alias in alias_by_key:
                numeric[key] = _to_float(agg.get(alias))

    flag_rules = (
        (distinct_count <= 1, "constant"),
        (unique_pct >= 0.99 and null_pct == 0, "possible_id"),
        (
            inferred_type == "categorical" and unique_pct >= high_card_ratio,
            "high_cardinality",
        ),
        (null_pct > 0.5, "mostly_null"),
    )
    flags = [flag for applies, flag in flag_rules if applies]

    column_profile = {
        "name": name,
        "physical_type": data_type,
        "inferred_type": inferred_type,
        "semantic_type": "",
        "count": non_null,
        "n_rows": n_rows,
        "null_count": null_count,
        "null_pct": null_pct,
        "empty_count": None,
        "empty_pct": None,
        "distinct_count": distinct_count,
        "unique_pct": unique_pct,
        "flags": flags,
        "quality_score": None,
        "numeric": numeric,
        "categorical": None,
        "datetime": None,
    }
    return column_profile