feat(infra): auto-commit con 56 cambios

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-21 14:22:55 +02:00
parent c1071a82b3
commit 32c7336bf6
56 changed files with 5307 additions and 100 deletions
@@ -0,0 +1,253 @@
"""Tests para summarize_table_pg sin servidor PostgreSQL.
Monkeypatchea el primitivo de lectura PG (`pg_query`, importado en el modulo) para
devolver filas simuladas: introspeccion de information_schema, count(*) y los
agregados por columna. Asserta el shape del TableProfile/ColumnProfile (claves,
tipos inferidos, flags, sub-dict numeric) — identico al de summarize_table_duckdb.
No requiere PostgreSQL real.
"""
import sys
import pytest
from .summarize_table_pg import summarize_table_pg
# The actual module object where the function lives (robust against the
# module-name/function-name shadowing in __init__ and against pytest's
# double import): it is the module whose global `pg_query` is used by
# summarize_table_pg, so the monkeypatch takes effect.
mod = sys.modules[summarize_table_pg.__module__]
# Simulated `ventas` table (same conceptual schema as the duckdb test):
#   id     INTEGER  -> unique, no nulls -> possible_id (numeric)
#   region TEXT     -> low-cardinality categorical (3 distinct values)
#   total  NUMERIC  -> numeric with one null (non-null count = 3)
#   pais   TEXT     -> constant ('ES')
#
# 4 rows. Non-null values of total: 120.5, 80.0, 45.25.
_N_ROWS = 4

# Rows shaped like an information_schema.columns result, in table order.
_COLUMNS = [
    {"column_name": name, "data_type": pg_type}
    for name, pg_type in (
        ("id", "integer"),
        ("region", "text"),
        ("total", "numeric"),
        ("pais", "character varying"),
    )
]

# Precomputed per-column aggregates (what PostgreSQL would return).
# Numeric columns carry min/max/mean/std/percentiles; the others only carry
# the non-null and distinct counts.
_AGG_BY_COL = {
    "id": dict(
        non_null=4,
        distinct_n=4,
        mn=1,
        mx=4,
        av=2.5,
        sd=1.2909944487358056,
        p25=1.75,
        p50=2.5,
        p75=3.25,
    ),
    "region": dict(non_null=4, distinct_n=3),
    "total": dict(
        non_null=3,
        distinct_n=3,
        mn=45.25,
        mx=120.5,
        av=81.91666666666667,
        sd=37.70159,
        p25=62.625,
        p50=80.0,
        p75=100.25,
    ),
    "pais": dict(non_null=4, distinct_n=1),
}
def _fake_pg_query(dsn, sql, params=None, max_rows=10000):
    """Simulate ``pg_query`` without a server by dispatching on the SQL text.

    Three query shapes are recognised, checked in this order:

    1. SQL mentioning ``information_schema.columns`` (case-insensitive)
       returns the simulated column list.
    2. SQL containing ``count(*) as n`` (case-insensitive) returns the total
       row count.
    3. SQL containing a double-quoted known column name returns that
       column's precomputed aggregates.

    ``dsn``, ``params`` and ``max_rows`` are accepted but not used.

    Raises:
        AssertionError: if the SQL matches none of the shapes above.
    """

    def _ok(columns, rows):
        # Envelope shared by every simulated successful read.
        return {
            "status": "ok",
            "columns": columns,
            "rows": rows,
            "row_count": len(rows),
            "truncated": False,
        }

    lowered = sql.lower()

    # 1) Column introspection.
    if "information_schema.columns" in lowered:
        return _ok(["column_name", "data_type"], list(_COLUMNS))

    # 2) Total row count.
    if "count(*) as n" in lowered:
        return _ok(["n"], [{"n": _N_ROWS}])

    # 3) Per-column aggregates: the column is identified by its quoted
    #    identifier, matched case-sensitively against the original SQL.
    aggregates = next(
        (agg for name, agg in _AGG_BY_COL.items() if f'"{name}"' in sql),
        None,
    )
    if aggregates is None:
        raise AssertionError(f"SQL inesperado en fake pg_query: {sql}")
    return _ok(list(aggregates), [dict(aggregates)])
@pytest.fixture(autouse=True)
def patch_pg_query(monkeypatch):
    """Replace the ``pg_query`` imported by the module with the simulated one."""
    # raising=True is the default: fail loudly if the module has no `pg_query`.
    monkeypatch.setattr(mod, "pg_query", _fake_pg_query, raising=True)
def test_shape_y_metadatos_tabla():
    """Check the TableProfile key set and the table-level metadata values."""
    result = summarize_table_pg("postgresql://x/y", "ventas")
    assert result["status"] == "ok"

    table_profile = result["profile"]

    # Every key of the TableProfile shape must be present.
    required_keys = (
        "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
        "duplicate_rows", "duplicate_pct", "constant_cols", "all_null_cols",
        "null_cell_pct", "type_breakdown", "columns", "correlations",
        "key_candidates", "quality_score", "llm", "models",
    )
    for key in required_keys:
        assert key in table_profile, f"falta clave {key} en TableProfile"

    # Metadata derived from the simulated table.
    expected_metadata = (
        ("table", "ventas"),
        ("source", "postgres"),
        ("n_rows", 4),
        ("n_cols", 4),
    )
    for field, expected in expected_metadata:
        assert table_profile[field] == expected
    assert len(table_profile["columns"]) == 4
    assert table_profile["key_candidates"] == []

    # Fields this test expects to be left unset.
    for field in ("quality_score", "correlations", "models", "llm"):
        assert table_profile[field] is None
def test_column_profile_shape():
    """Check the ColumnProfile key set and the per-column values and flags."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    columns = profile["columns"]
    by_name = {column["name"]: column for column in columns}

    required_keys = (
        "name", "physical_type", "inferred_type", "semantic_type", "count",
        "n_rows", "null_count", "null_pct", "empty_count", "empty_pct",
        "distinct_count", "unique_pct", "flags", "quality_score",
        "numeric", "categorical", "datetime",
    )
    for column in columns:
        for key in required_keys:
            assert key in column, (
                f"falta clave {key} en ColumnProfile {column['name']}"
            )
        assert column["semantic_type"] == ""
        for unset in ("quality_score", "categorical", "datetime"):
            assert column[unset] is None

    # id: numeric, no nulls, unique -> possible_id.
    id_col = by_name["id"]
    expected_id = (
        ("inferred_type", "numeric"),
        ("null_count", 0),
        ("count", 4),
        ("distinct_count", 4),
        ("unique_pct", 1.0),
    )
    for field, expected in expected_id:
        assert id_col[field] == expected
    assert "possible_id" in id_col["flags"]

    # region: low-cardinality categorical.
    region_col = by_name["region"]
    assert region_col["inferred_type"] == "categorical"
    assert region_col["distinct_count"] == 3
    assert region_col["numeric"] is None

    # total: numeric with one null, so the non-null count is 3.
    total_col = by_name["total"]
    assert total_col["inferred_type"] == "numeric"
    assert total_col["null_count"] == 1
    assert total_col["count"] == 3

    stats = total_col["numeric"]
    assert stats is not None
    assert stats["min"] == pytest.approx(45.25)
    assert stats["max"] == pytest.approx(120.5)
    for field in ("mean", "std"):
        assert stats[field] is not None
    for field, expected in (("p25", 62.625), ("p50", 80.0), ("p75", 100.25)):
        assert stats[field] == pytest.approx(expected)
    # Fine-grained keys stay None (another function of the eda group fills
    # them in).
    for field in ("skew", "kurtosis", "histogram", "p99"):
        assert stats[field] is None

    # pais: constant -> `constant` flag, and listed in constant_cols.
    assert "constant" in by_name["pais"]["flags"]
    assert "pais" in profile["constant_cols"]
def test_null_pct_total():
    """Check that ``total`` reports a null fraction of 1 out of 4 rows."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    total_col = next(
        filter(lambda column: column["name"] == "total", profile["columns"])
    )
    # 1 null over 4 rows.
    assert total_col["null_pct"] == pytest.approx(0.25)
def test_distinct_no_excede_filas():
    """Check that no column reports more distinct values than table rows."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    row_limit = profile["n_rows"]
    for column in profile["columns"]:
        distinct, unique_ratio = column["distinct_count"], column["unique_pct"]
        assert distinct <= row_limit
        assert unique_ratio <= 1.0
def test_type_breakdown():
    """Check the per-type column counts reported in ``type_breakdown``."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    breakdown = profile["type_breakdown"]

    expected_counts = {
        "numeric": 2,      # id, total
        "categorical": 2,  # region, pais
        "datetime": 0,
        "boolean": 0,
        "text": 0,
    }
    # Exactly these inferred-type buckets, no more and no fewer.
    assert set(breakdown) == set(expected_counts)
    for inferred_type, expected in expected_counts.items():
        assert breakdown[inferred_type] == expected
def test_tabla_invalida_devuelve_error():
    """Check that an injection-style table name yields an error result."""
    malicious_table = "ventas; DROP TABLE ventas"
    outcome = summarize_table_pg("postgresql://x/y", malicious_table)
    status, message = outcome["status"], outcome["error"]
    assert status == "error"
    assert "invalido" in message
def test_schema_invalido_devuelve_error():
    """Check that a schema name containing a space yields an error result."""
    outcome = summarize_table_pg(
        "postgresql://x/y",
        "ventas",
        schema="pub lic",
    )
    status, message = outcome["status"], outcome["error"]
    assert status == "error"
    assert "schema" in message
def test_tabla_inexistente_devuelve_error(monkeypatch):
    """An information_schema read with no rows yields an error (table not found)."""

    def empty_pg_query(dsn, sql, params=None, max_rows=10000):
        # Only the introspection query is expected; anything else is a bug.
        if "information_schema.columns" not in sql.lower():
            raise AssertionError("no deberia llegar aqui")
        return {
            "status": "ok",
            "columns": ["column_name", "data_type"],
            "rows": [],
            "row_count": 0,
            "truncated": False,
        }

    monkeypatch.setattr(mod, "pg_query", empty_pg_query)

    outcome = summarize_table_pg("postgresql://x/y", "no_existe")
    assert outcome["status"] == "error"
    assert "no encontrada" in outcome["error"]
def test_error_de_lectura_pg_se_propaga(monkeypatch):
    """A pg_query error on the count query comes back as an error dict, not a raise."""

    def failing_count(dsn, sql, params=None, max_rows=10000):
        lowered = sql.lower()
        # Introspection succeeds with the simulated column list.
        if "information_schema.columns" in lowered:
            simulated_rows = list(_COLUMNS)
            return {
                "status": "ok",
                "columns": ["column_name", "data_type"],
                "rows": simulated_rows,
                "row_count": len(simulated_rows),
                "truncated": False,
            }
        # Anything after the failed count must never be requested.
        if "count(*) as n" not in lowered:
            raise AssertionError("no deberia llegar a los agregados")
        return {"status": "error", "error": "connection refused"}

    monkeypatch.setattr(mod, "pg_query", failing_count)

    outcome = summarize_table_pg("postgresql://x/y", "ventas")
    assert outcome["status"] == "error"
    assert "connection refused" in outcome["error"]