feat(infra): auto-commit con 56 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,253 @@
|
||||
"""Tests para summarize_table_pg sin servidor PostgreSQL.
|
||||
|
||||
Monkeypatchea el primitivo de lectura PG (`pg_query`, importado en el modulo) para
|
||||
devolver filas simuladas: introspeccion de information_schema, count(*) y los
|
||||
agregados por columna. Asserta el shape del TableProfile/ColumnProfile (claves,
|
||||
tipos inferidos, flags, sub-dict numeric) — identico al de summarize_table_duckdb.
|
||||
No requiere PostgreSQL real.
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from .summarize_table_pg import summarize_table_pg
|
||||
|
||||
# The actual module object where the function lives (robust against the
# module-name/function-name shadowing in __init__ and against pytest's double
# import): it is the module whose global `pg_query` summarize_table_pg uses, so
# the monkeypatch takes effect.
mod = sys.modules[summarize_table_pg.__module__]
|
||||
|
||||
# Simulated `ventas` table (conceptually the same schema as the duckdb test):
#   id      integer            -> unique, no nulls -> possible_id (numeric)
#   region  text               -> low-cardinality categorical (3 distinct values)
#   total   numeric            -> numeric with one null (non-null count = 3)
#   pais    character varying  -> constant ('ES')
#
# Four rows. Non-null values of total: 120.5, 80.0, 45.25.
_N_ROWS = 4

# Rows shaped like the information_schema.columns introspection result.
_COLUMNS = [
    {"column_name": name, "data_type": pg_type}
    for name, pg_type in (
        ("id", "integer"),
        ("region", "text"),
        ("total", "numeric"),
        ("pais", "character varying"),
    )
]
|
||||
|
||||
# Precomputed per-column aggregates (what PostgreSQL would return), keyed by
# column name. Numeric columns carry min/max/mean, the *sample* standard
# deviation and the 25/50/75 percentiles; non-numeric columns only carry the
# non-null and distinct counts.
_AGG_BY_COL = {
    # id: values 1, 2, 3, 4.
    "id": {
        "non_null": 4, "distinct_n": 4,
        "mn": 1, "mx": 4, "av": 2.5, "sd": 1.2909944487358056,
        "p25": 1.75, "p50": 2.5, "p75": 3.25,
    },
    "region": {"non_null": 4, "distinct_n": 3},
    # total: non-null values 120.5, 80.0, 45.25 (one null row).
    # `sd` is the sample standard deviation of those three values, the same
    # convention the `id` entry follows.
    "total": {
        "non_null": 3, "distinct_n": 3,
        "mn": 45.25, "mx": 120.5, "av": 81.91666666666667,
        "sd": 37.66159627, "p25": 62.625, "p50": 80.0, "p75": 100.25,
    },
    "pais": {"non_null": 4, "distinct_n": 1},
}
|
||||
|
||||
|
||||
def _fake_pg_query(dsn, sql, params=None, max_rows=10000):
    """Stand in for pg_query by dispatching on the text of the SQL statement.

    Three statement shapes are recognised, checked in this order:

    1. column introspection (mentions ``information_schema.columns``),
    2. the total row count (contains ``count(*) as n``, case-insensitive),
    3. per-column aggregates, identified by the double-quoted column name.

    ``dsn``, ``params`` and ``max_rows`` are accepted but ignored.

    Returns a dict with ``status``, ``columns``, ``rows``, ``row_count`` and
    ``truncated`` keys.

    Raises:
        AssertionError: if the SQL matches none of the shapes above.
    """
    def _ok(column_names, rows):
        # Success envelope shared by all three simulated answers.
        return {
            "status": "ok",
            "columns": column_names,
            "rows": rows,
            "row_count": len(rows),
            "truncated": False,
        }

    lowered = sql.lower()

    # 1) Column introspection.
    if "information_schema.columns" in lowered:
        return _ok(["column_name", "data_type"], list(_COLUMNS))

    # 2) Total row count.
    if "count(*) as n" in lowered:
        return _ok(["n"], [{"n": _N_ROWS}])

    # 3) Per-column aggregates: the first known column whose quoted identifier
    #    appears in the (case-preserved) SQL wins.
    matched = next((name for name in _AGG_BY_COL if f'"{name}"' in sql), None)
    if matched is None:
        raise AssertionError(f"SQL inesperado en fake pg_query: {sql}")

    aggregates = _AGG_BY_COL[matched]
    return _ok(list(aggregates.keys()), [dict(aggregates)])
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def patch_pg_query(monkeypatch):
    """Swap the module's imported `pg_query` for the simulated one in every test."""
    monkeypatch.setattr(mod, name="pg_query", value=_fake_pg_query)
|
||||
|
||||
|
||||
def test_shape_y_metadatos_tabla():
    """The call succeeds and the TableProfile has the expected keys and metadata."""
    result = summarize_table_pg("postgresql://x/y", "ventas")
    assert result["status"] == "ok"
    table_profile = result["profile"]

    expected_keys = (
        "table", "source", "profiled_at", "n_rows", "n_cols", "size_bytes",
        "duplicate_rows", "duplicate_pct", "constant_cols", "all_null_cols",
        "null_cell_pct", "type_breakdown", "columns", "correlations",
        "key_candidates", "quality_score", "llm", "models",
    )
    for key in expected_keys:
        assert key in table_profile, f"falta clave {key} en TableProfile"

    # Table-level metadata for the simulated 4x4 table.
    assert table_profile["table"] == "ventas"
    assert table_profile["source"] == "postgres"
    assert table_profile["n_rows"] == 4
    assert table_profile["n_cols"] == 4
    assert len(table_profile["columns"]) == 4

    # Sections this function is expected to leave empty.
    assert table_profile["key_candidates"] == []
    for unset in ("quality_score", "correlations", "models", "llm"):
        assert table_profile[unset] is None
|
||||
|
||||
|
||||
def test_column_profile_shape():
    """Every ColumnProfile has the full key set; spot-check each simulated column."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    columns = profile["columns"]
    by_name = {entry["name"]: entry for entry in columns}

    required_keys = (
        "name", "physical_type", "inferred_type", "semantic_type", "count",
        "n_rows", "null_count", "null_pct", "empty_count", "empty_pct",
        "distinct_count", "unique_pct", "flags", "quality_score",
        "numeric", "categorical", "datetime",
    )
    for col in columns:
        for key in required_keys:
            assert key in col, f"falta clave {key} en ColumnProfile {col['name']}"
        assert col["semantic_type"] == ""
        for unset in ("quality_score", "categorical", "datetime"):
            assert col[unset] is None

    # id: numeric, no nulls, unique -> possible_id.
    id_col = by_name["id"]
    assert id_col["inferred_type"] == "numeric"
    assert id_col["null_count"] == 0
    assert id_col["count"] == 4
    assert id_col["distinct_count"] == 4
    assert id_col["unique_pct"] == 1.0
    assert "possible_id" in id_col["flags"]

    # region: low-cardinality categorical, so no numeric sub-dict.
    region_col = by_name["region"]
    assert region_col["inferred_type"] == "categorical"
    assert region_col["distinct_count"] == 3
    assert region_col["numeric"] is None

    # total: numeric with one null, so the non-null count is 3.
    total_col = by_name["total"]
    assert total_col["inferred_type"] == "numeric"
    assert total_col["null_count"] == 1
    assert total_col["count"] == 3

    numeric_stats = total_col["numeric"]
    assert numeric_stats is not None
    assert numeric_stats["min"] == pytest.approx(45.25)
    assert numeric_stats["max"] == pytest.approx(120.5)
    assert numeric_stats["mean"] is not None
    assert numeric_stats["std"] is not None
    for quantile, expected in (("p25", 62.625), ("p50", 80.0), ("p75", 100.25)):
        assert numeric_stats[quantile] == pytest.approx(expected)
    # Fine-grained keys stay None (another function of the eda group fills them).
    for fine_key in ("skew", "kurtosis", "histogram", "p99"):
        assert numeric_stats[fine_key] is None

    # pais: constant -> `constant` flag, and listed in constant_cols.
    assert "constant" in by_name["pais"]["flags"]
    assert "pais" in profile["constant_cols"]
|
||||
|
||||
|
||||
def test_null_pct_total():
    """`total` has 1 null out of 4 rows, so its null_pct is 0.25."""
    columns = summarize_table_pg("postgresql://x/y", "ventas")["profile"]["columns"]
    total_col = next(entry for entry in columns if entry["name"] == "total")
    assert total_col["null_pct"] == pytest.approx(0.25)
|
||||
|
||||
|
||||
def test_distinct_no_excede_filas():
    """No column reports more distinct values than rows, nor unique_pct above 1."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    row_limit = profile["n_rows"]
    for column in profile["columns"]:
        assert column["distinct_count"] <= row_limit
        assert column["unique_pct"] <= 1.0
|
||||
|
||||
|
||||
def test_type_breakdown():
    """type_breakdown has exactly the five type buckets with the expected counts."""
    breakdown = summarize_table_pg("postgresql://x/y", "ventas")["profile"][
        "type_breakdown"
    ]
    assert set(breakdown.keys()) == {
        "numeric", "categorical", "datetime", "text", "boolean"
    }

    expected_counts = (
        ("numeric", 2),      # id, total
        ("categorical", 2),  # region, pais
        ("datetime", 0),
        ("boolean", 0),
        ("text", 0),
    )
    for bucket, expected in expected_counts:
        assert breakdown[bucket] == expected
|
||||
|
||||
|
||||
def test_tabla_invalida_devuelve_error():
    """A table name carrying an injected statement yields an error result."""
    malicious_table = "ventas; DROP TABLE ventas"
    outcome = summarize_table_pg("postgresql://x/y", malicious_table)
    assert outcome["status"] == "error"
    assert "invalido" in outcome["error"]
|
||||
|
||||
|
||||
def test_schema_invalido_devuelve_error():
    """A schema name containing a space yields an error that mentions the schema."""
    outcome = summarize_table_pg("postgresql://x/y", "ventas", schema="pub lic")
    assert outcome["status"] == "error"
    assert "schema" in outcome["error"]
|
||||
|
||||
|
||||
def test_tabla_inexistente_devuelve_error(monkeypatch):
    """An information_schema lookup with no rows yields a "table not found" error."""
    def empty_pg_query(dsn, sql, params=None, max_rows=10000):
        # Only the introspection query is expected; anything else is a test failure.
        if "information_schema.columns" not in sql.lower():
            raise AssertionError("no deberia llegar aqui")
        return {
            "status": "ok",
            "columns": ["column_name", "data_type"],
            "rows": [],
            "row_count": 0,
            "truncated": False,
        }

    monkeypatch.setattr(mod, "pg_query", empty_pg_query)
    outcome = summarize_table_pg("postgresql://x/y", "no_existe")
    assert outcome["status"] == "error"
    assert "no encontrada" in outcome["error"]
|
||||
|
||||
|
||||
def test_error_de_lectura_pg_se_propaga(monkeypatch):
    """A pg_query error on the row count comes back as an error dict, not an exception."""
    def failing_count(dsn, sql, params=None, max_rows=10000):
        lowered = sql.lower()
        # Introspection succeeds so the function proceeds to the row count.
        if "information_schema.columns" in lowered:
            return {
                "status": "ok",
                "columns": ["column_name", "data_type"],
                "rows": list(_COLUMNS),
                "row_count": len(_COLUMNS),
                "truncated": False,
            }
        # Anything other than the row count means the error was not propagated.
        if "count(*) as n" not in lowered:
            raise AssertionError("no deberia llegar a los agregados")
        return {"status": "error", "error": "connection refused"}

    monkeypatch.setattr(mod, "pg_query", failing_count)
    outcome = summarize_table_pg("postgresql://x/y", "ventas")
    assert outcome["status"] == "error"
    assert "connection refused" in outcome["error"]
|
||||
Reference in New Issue
Block a user