feat(infra): auto-commit con 56 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-21 14:22:55 +02:00
parent c1071a82b3
commit 32c7336bf6
56 changed files with 5307 additions and 100 deletions
@@ -0,0 +1,377 @@
+"""summarize_table_pg — base profile of a PostgreSQL table via SQL push-down.
+
+Impure function: it reads from a PostgreSQL server through `pg_query`, the
+read-only primitive of the `postgres` group. It is the PostgreSQL adapter at the
+core of the `eda` (exploratory data analysis) capability group and mirrors
+`summarize_table_duckdb`: it builds EXACTLY the same TableProfile skeleton (same
+keys) using aggregate queries that are pushed down to the PostgreSQL engine and
+do NOT bring rows into RAM (count, count(DISTINCT), min/max/avg/stddev,
+percentile_cont).
+
+What it deliberately does NOT compute here (to stay cheap): skew, kurtosis,
+histogram, fine percentiles (p1/p5/p95/p99), mode, outliers, correlations,
+key_candidates, quality_score and semantic_type. Those keys are left as
+placeholders (None, [] or, for semantic_type, the empty string) so that other
+functions of the `eda` group can fill them in later from a sample. The key
+contract (TableProfile / ColumnProfile) is shared by the whole `eda` group and
+is identical to that of `summarize_table_duckdb`, so `profile_table` and the
+rest of the group consume the result the same way with a PostgreSQL source.
+
+Dict-no-throw style of the group: it never raises; any error is caught and
+returned as {status:'error', error:str}.
+"""
+
+import re
+from datetime import datetime, timezone
+
+from infra import pg_query
+
+# Plain SQL identifier (letters, digits, underscore; no leading digit). Table
+# and column names cannot be passed as bound parameters inside the SELECT body,
+# so they are validated against this pattern and then interpolated wrapped in
+# double quotes.
+_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+
+# Row-count ceiling for issuing an EXACT COUNT(DISTINCT) per column. Above it no
+# distinct query is run and the distinct count falls back to an upper bound
+# (the non-null count, itself capped at n_rows). No HLL-style estimate is
+# attempted; the original author notes PostgreSQL does not ship one without an
+# extension. Documented in the accompanying .md.
+_EXACT_DISTINCT_MAX_ROWS = 200_000
+
+# information_schema data_type values treated as inferred type "numeric".
+_NUMERIC_TYPES = {
+    "smallint",
+    "integer",
+    "bigint",
+    "decimal",
+    "numeric",
+    "real",
+    "double precision",
+    "smallserial",
+    "serial",
+    "bigserial",
+}
+
+# information_schema data_type values treated as inferred type "datetime".
+_DATETIME_TYPES = {
+    "date",
+    "time",
+    "timestamp",
+    "timestamp without time zone",
+    "timestamp with time zone",
+    "time without time zone",
+    "time with time zone",
+}
+
+# Textual types; these are split into "categorical" vs "text" by cardinality.
+_TEXT_TYPES = {
+    "text",
+    "character varying",
+    "varchar",
+    "character",
+    "char",
+    "bpchar",
+}
+
+# Keys of the per-column `numeric` sub-dict, in output order. The summarizer
+# fills only a handful of them; the others stay None until a sampling-based
+# function completes them.
+_NUMERIC_SUB_KEYS = (
+    "min", "max", "mean", "median", "mode", "std", "variance", "cv",
+    "p1", "p5", "p25", "p50", "p75", "p95", "p99", "iqr",
+    "skew", "kurtosis", "n_outliers", "outlier_pct", "zero_pct",
+    "negative_pct", "distribution_type", "histogram",
+)
+
+
+def _base_data_type(data_type: str) -> str:
+    """Return *data_type* lower-cased and stripped of surrounding whitespace.
+
+    A falsy input (``None`` or an empty string) yields ``""``. The value is
+    expected to come from information_schema.columns.data_type, which the
+    original author notes already omits length/precision modifiers; this helper
+    only normalizes case and outer whitespace as a safety net.
+    """
+    if not data_type:
+        return ""
+    return data_type.strip().lower()
+
+
+def _infer_type(data_type: str, distinct_count, n_rows: int) -> str:
+    """Translate a PostgreSQL data_type into the eda contract's inferred_type.
+
+    Numeric, datetime and boolean types map directly. Textual types are split
+    by cardinality: "categorical" when the table has no rows, when
+    distinct_count <= 50, or when distinct_count / n_rows < 0.5; otherwise
+    "text". A distinct_count of None is treated as 0. Every other type (json,
+    jsonb, uuid, arrays, bytea, ...) is reported as "text".
+    """
+    kind = _base_data_type(data_type)
+
+    # Types whose inferred_type follows from the physical type alone.
+    direct_mappings = (
+        ("numeric", _NUMERIC_TYPES),
+        ("datetime", _DATETIME_TYPES),
+        ("boolean", ("boolean", "bool")),
+    )
+    for label, members in direct_mappings:
+        if kind in members:
+            return label
+
+    # Anything that is not a known textual type is handled as free text.
+    if kind not in _TEXT_TYPES:
+        return "text"
+
+    # Empty table: there is no ratio to compute, default to categorical.
+    if n_rows <= 0:
+        return "categorical"
+
+    cardinality = 0 if distinct_count is None else distinct_count
+    is_low_cardinality = cardinality <= 50 or (cardinality / n_rows) < 0.5
+    return "categorical" if is_low_cardinality else "text"
+
+
+def _to_float(value):
+    """Convert an aggregate value (Decimal, str, int, ...) to float, or None.
+
+    Returns None when *value* is None or cannot be represented as a float:
+    wrong type, unparsable text, or an integer too large for a float (which
+    makes ``float()`` raise OverflowError rather than ValueError).
+
+    Per the original author's note, pg_query is expected to have normalized
+    Decimal to float already; this helper is the safety net for min/max of
+    non-numeric columns and other unconvertible values.
+    """
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: e.g. float(10**400); treated like any other
+        # unconvertible value instead of aborting the whole profile.
+        return None
+
+
+def _to_int(value):
+    """Defensively convert a count-like value to int, falling back to 0.
+
+    count(*) and count(col) are expected to arrive as ints already. None,
+    values of the wrong type, unparsable text, NaN and infinities (the latter
+    make ``int()`` raise OverflowError) all yield 0.
+    """
+    if value is None:
+        return 0
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: int(float("inf")); keep the "never raises" contract.
+        return 0
+
+
+def summarize_table_pg(
+    dsn: str,
+    table: str,
+    schema: str = "public",
+    high_card_ratio: float = 0.9,
+) -> dict:
+    """Profile a PostgreSQL table with aggregate queries only (no rows fetched).
+
+    Builds the TableProfile skeleton used by the `eda` group: table-level
+    counters plus one ColumnProfile per column. All database access goes
+    through `pg_query`: one information_schema lookup, one ``count(*)`` and one
+    aggregate query per column.
+
+    An exact ``count(DISTINCT col)`` is requested only when
+    ``0 < n_rows <= _EXACT_DISTINCT_MAX_ROWS``. If the aggregate query that
+    includes it fails (for instance on a column whose type has no equality
+    operator, such as ``json``), the query is retried once without DISTINCT so
+    a single awkward column does not abort the whole profile; that column then
+    gets ``distinct=None`` exactly like columns of tables above the threshold.
+
+    Dict-no-throw: this function never raises.
+
+    Args:
+        dsn: PostgreSQL connection string, e.g.
+            "postgresql://user:pass@localhost:5432/mydb". It is handed to
+            `pg_query` as-is; whatever error `pg_query` reports is returned as
+            {status:'error', ...}.
+        table: name of the table to profile. It must fully match
+            ^[A-Za-z_][A-Za-z0-9_]*$ and is double-quoted in the SQL text
+            (identifiers cannot be bound as parameters).
+        schema: schema that holds the table (default "public"). Validated with
+            the same pattern and double-quoted.
+        high_card_ratio: unique_pct threshold forwarded to
+            `_build_column_profile` for the "high_cardinality" flag.
+            Default 0.9.
+
+    Returns:
+        dict. On success: {status:'ok', profile: <TableProfile>}. On any
+        failure: {status:'error', error:str}.
+    """
+    try:
+        # fullmatch (rather than match) so that a trailing newline, which the
+        # `$` anchor alone would tolerate, is rejected as well.
+        if not _IDENT_RE.fullmatch(table or ""):
+            return {
+                "status": "error",
+                "error": (
+                    f"nombre de tabla invalido: {table!r} "
+                    "(debe casar con ^[A-Za-z_][A-Za-z0-9_]*$)"
+                ),
+            }
+        if not _IDENT_RE.fullmatch(schema or ""):
+            return {
+                "status": "error",
+                "error": (
+                    f"nombre de schema invalido: {schema!r} "
+                    "(debe casar con ^[A-Za-z_][A-Za-z0-9_]*$)"
+                ),
+            }
+
+        # Safe to interpolate: both parts were validated just above.
+        qtable = f'"{schema}"."{table}"'
+
+        # 1) Column names and types from information_schema (bound parameters).
+        cols_res = pg_query(
+            dsn,
+            "SELECT column_name, data_type FROM information_schema.columns "
+            "WHERE table_schema = %s AND table_name = %s "
+            "ORDER BY ordinal_position",
+            params=[schema, table],
+        )
+        if cols_res["status"] != "ok":
+            return {"status": "error", "error": cols_res["error"]}
+        col_rows = cols_res["rows"]
+        if not col_rows:
+            return {
+                "status": "error",
+                "error": (
+                    f"tabla no encontrada o sin columnas: {schema}.{table}"
+                ),
+            }
+        col_meta = [
+            (r.get("column_name"), r.get("data_type")) for r in col_rows
+        ]
+
+        # 2) Total number of rows.
+        count_res = pg_query(dsn, f"SELECT count(*) AS n FROM {qtable}")
+        if count_res["status"] != "ok":
+            return {"status": "error", "error": count_res["error"]}
+        n_rows = _to_int(count_res["rows"][0]["n"]) if count_res["rows"] else 0
+
+        # 3) One aggregate query per column, evaluated inside the engine:
+        # non-null count, exact distinct count (small tables only) and, for
+        # numeric columns, min/max/avg/stddev_samp/quartiles.
+        exact_distinct_ok = 0 < n_rows <= _EXACT_DISTINCT_MAX_ROWS
+        columns = []
+        for name, data_type in col_meta:
+            if not _IDENT_RE.fullmatch(name or ""):
+                # Column name outside the plain-identifier pattern: it is not
+                # interpolated into SQL. It is still listed, with placeholder
+                # counts (non_null=n_rows) and no distinct/numeric aggregates.
+                columns.append(
+                    _build_column_profile(
+                        name, data_type, n_rows, high_card_ratio,
+                        non_null=n_rows, distinct=None, agg=None,
+                    )
+                )
+                continue
+
+            qcol = f'"{name}"'
+            is_numeric = _base_data_type(data_type) in _NUMERIC_TYPES
+
+            # Aggregates that work for any column type (plus numeric extras).
+            base_parts = [f"count({qcol}) AS non_null"]
+            if is_numeric:
+                base_parts.extend([
+                    f"min({qcol}) AS mn",
+                    f"max({qcol}) AS mx",
+                    f"avg({qcol}) AS av",
+                    f"stddev_samp({qcol}) AS sd",
+                    f"percentile_cont(0.25) WITHIN GROUP (ORDER BY {qcol}) AS p25",
+                    f"percentile_cont(0.5)  WITHIN GROUP (ORDER BY {qcol}) AS p50",
+                    f"percentile_cont(0.75) WITHIN GROUP (ORDER BY {qcol}) AS p75",
+                ])
+
+            with_distinct = exact_distinct_ok
+            select_parts = list(base_parts)
+            if with_distinct:
+                select_parts.insert(
+                    1, f"count(DISTINCT {qcol}) AS distinct_n"
+                )
+
+            agg_res = pg_query(
+                dsn, f"SELECT {', '.join(select_parts)} FROM {qtable}"
+            )
+            if agg_res["status"] != "ok" and with_distinct:
+                # DISTINCT needs an equality operator, which some types (e.g.
+                # json) lack. Retry once without it rather than failing the
+                # whole profile; a genuine outage simply fails again below.
+                with_distinct = False
+                agg_res = pg_query(
+                    dsn, f"SELECT {', '.join(base_parts)} FROM {qtable}"
+                )
+            if agg_res["status"] != "ok":
+                return {"status": "error", "error": agg_res["error"]}
+            agg = agg_res["rows"][0] if agg_res["rows"] else {}
+
+            non_null = _to_int(agg.get("non_null"))
+            distinct = (
+                _to_int(agg.get("distinct_n")) if with_distinct else None
+            )
+
+            columns.append(
+                _build_column_profile(
+                    name, data_type, n_rows, high_card_ratio,
+                    non_null=non_null, distinct=distinct,
+                    agg=agg if is_numeric else None,
+                )
+            )
+
+        # Table-level roll-ups derived from the column profiles.
+        type_breakdown = {
+            "numeric": 0,
+            "categorical": 0,
+            "datetime": 0,
+            "text": 0,
+            "boolean": 0,
+        }
+        for col in columns:
+            it = col["inferred_type"]
+            if it in type_breakdown:
+                type_breakdown[it] += 1
+
+        constant_cols = [c["name"] for c in columns if "constant" in c["flags"]]
+        all_null_cols = [c["name"] for c in columns if c["null_pct"] == 1.0]
+        # Mean of the per-column null fractions.
+        null_cell_pct = (
+            sum(c["null_pct"] for c in columns) / len(columns) if columns else 0.0
+        )
+
+        profile = {
+            "table": table,
+            "source": "postgres",
+            "profiled_at": datetime.now(timezone.utc).isoformat(),
+            "n_rows": n_rows,
+            "n_cols": len(columns),
+            # The keys below are placeholders; this function does not compute
+            # them.
+            "size_bytes": None,
+            "duplicate_rows": None,
+            "duplicate_pct": None,
+            "constant_cols": constant_cols,
+            "all_null_cols": all_null_cols,
+            "null_cell_pct": null_cell_pct,
+            "type_breakdown": type_breakdown,
+            "columns": columns,
+            "correlations": None,
+            "key_candidates": [],
+            "quality_score": None,
+            "llm": None,
+            "models": None,
+        }
+        return {"status": "ok", "profile": profile}
+    except Exception as e:  # noqa: BLE001
+        # Dict-no-throw boundary: any unexpected failure becomes an error dict.
+        return {"status": "error", "error": str(e)}
+
+
+def _build_column_profile(
+    name: str,
+    data_type: str,
+    n_rows: int,
+    high_card_ratio: float,
+    non_null: int,
+    distinct,
+    agg: dict = None,
+) -> dict:
+    """Assemble one ColumnProfile dict from the PostgreSQL aggregates.
+
+    Args:
+        name: column name as reported by information_schema.
+        data_type: column data_type as reported by information_schema; stored
+            verbatim as ``physical_type``.
+        n_rows: total row count of the table.
+        high_card_ratio: unique_pct threshold at or above which a categorical
+            column is flagged "high_cardinality".
+        non_null: count(col) from the aggregate query.
+        distinct: exact count(DISTINCT col), or None when it was not computed.
+            With None, ``distinct_count`` is only an UPPER BOUND: the non-null
+            count, capped at n_rows.
+        agg: row of numeric aggregates (keys mn/mx/av/sd/p25/p50/p75), or None
+            for non-numeric columns.
+
+    Returns:
+        The ColumnProfile dict. Its key set is intended to match the DuckDB
+        adapter's column profile (per the original author's note). Keys not
+        computed here are left as None, or "" for ``semantic_type``.
+
+    Flags:
+        "constant" and "mostly_null" are always evaluated ("constant" stays
+        valid on an upper bound: a bound <= 1 implies at most one distinct
+        value). "possible_id" and "high_cardinality" assert HIGH uniqueness,
+        which an upper bound cannot prove, so they are emitted only when an
+        exact distinct count is available. Otherwise every fully populated
+        column of a large table would be reported as a possible id.
+    """
+    null_count = n_rows - non_null if n_rows > 0 else 0
+    if null_count < 0:
+        # count(*) and count(col) come from separate queries, so non_null can
+        # exceed n_rows if rows were inserted in between; clamp.
+        null_count = 0
+    null_pct = (null_count / n_rows) if n_rows > 0 else 0.0
+
+    has_exact_distinct = distinct is not None
+    if has_exact_distinct:
+        distinct_count = min(distinct, n_rows) if n_rows > 0 else distinct
+    else:
+        # No exact figure: at most as many distinct values as non-null values.
+        distinct_count = min(non_null, n_rows) if n_rows > 0 else non_null
+
+    inferred_type = _infer_type(data_type, distinct_count, n_rows)
+
+    unique_pct = min(distinct_count / n_rows, 1.0) if n_rows > 0 else 0.0
+
+    numeric = None
+    if inferred_type == "numeric":
+        # Full key set up front; only the push-down aggregates are filled in.
+        numeric = {k: None for k in _NUMERIC_SUB_KEYS}
+        if agg is not None:
+            numeric["min"] = _to_float(agg.get("mn"))
+            numeric["max"] = _to_float(agg.get("mx"))
+            numeric["mean"] = _to_float(agg.get("av"))
+            numeric["std"] = _to_float(agg.get("sd"))
+            numeric["p25"] = _to_float(agg.get("p25"))
+            numeric["p50"] = _to_float(agg.get("p50"))
+            numeric["p75"] = _to_float(agg.get("p75"))
+
+    flags = []
+    if distinct_count <= 1:
+        flags.append("constant")
+    if has_exact_distinct and unique_pct >= 0.99 and null_pct == 0:
+        flags.append("possible_id")
+    if (
+        has_exact_distinct
+        and inferred_type == "categorical"
+        and unique_pct >= high_card_ratio
+    ):
+        flags.append("high_cardinality")
+    if null_pct > 0.5:
+        flags.append("mostly_null")
+
+    return {
+        "name": name,
+        "physical_type": data_type,
+        "inferred_type": inferred_type,
+        "semantic_type": "",
+        "count": non_null,
+        "n_rows": n_rows,
+        "null_count": null_count,
+        "null_pct": null_pct,
+        "empty_count": None,
+        "empty_pct": None,
+        "distinct_count": distinct_count,
+        "unique_pct": unique_pct,
+        "flags": flags,
+        "quality_score": None,
+        "numeric": numeric,
+        "categorical": None,
+        "datetime": None,
+    }