"""Tests for summarize_table_pg that run without a PostgreSQL server.

The PG read primitive (`pg_query`, imported into the module under test) is
monkeypatched to return simulated rows: the information_schema introspection,
the count(*) and the per-column aggregates. The tests assert the shape of the
TableProfile/ColumnProfile (keys, inferred types, flags, numeric sub-dict),
which is identical to the one produced by summarize_table_duckdb. No real
PostgreSQL is required.
"""

import sys

import pytest

from .summarize_table_pg import summarize_table_pg

# The actual module object where the function lives (robust against the
# module-name/function shadowing in __init__ and against pytest's double
# import): it is the module whose global `pg_query` summarize_table_pg uses,
# so the monkeypatch takes effect.
mod = sys.modules[summarize_table_pg.__module__]

# Simulated table `ventas` (same conceptual schema as the duckdb test):
#   id      INTEGER -> unique, no nulls -> possible_id (numeric)
#   region  TEXT    -> low-cardinality categorical (3 distinct)
#   total   NUMERIC -> numeric with one null (non-null count = 3)
#   pais    TEXT    -> constant ('ES')
#
# 4 rows. Non-null values of total: 120.5, 80.0, 45.25.
_N_ROWS = 4

# Rows the fake returns for the information_schema.columns introspection.
_COLUMNS = [
    {"column_name": name, "data_type": pg_type}
    for name, pg_type in (
        ("id", "integer"),
        ("region", "text"),
        ("total", "numeric"),
        ("pais", "character varying"),
    )
]

# Precomputed per-column aggregates (what PostgreSQL would return).
_AGG_BY_COL = {
    # Keyed by column name. The numeric columns (id, total) additionally carry
    # the mn/mx/av/sd/p25/p50/p75 entries; the others only the two counts.
    "id": {
        "non_null": 4,
        "distinct_n": 4,
        "mn": 1,
        "mx": 4,
        "av": 2.5,
        "sd": 1.2909944487358056,
        "p25": 1.75,
        "p50": 2.5,
        "p75": 3.25,
    },
    "region": {"non_null": 4, "distinct_n": 3},
    "total": {
        "non_null": 3,
        "distinct_n": 3,
        "mn": 45.25,
        "mx": 120.5,
        "av": 81.91666666666667,
        "sd": 37.70159,
        "p25": 62.625,
        "p50": 80.0,
        "p75": 100.25,
    },
    "pais": {"non_null": 4, "distinct_n": 1},
}


def _ok_result(columns, rows):
    """Build a successful pg_query-style payload for *columns* and *rows*."""
    return {
        "status": "ok",
        "columns": columns,
        "rows": rows,
        "row_count": len(rows),
        "truncated": False,
    }


def _fake_pg_query(dsn, sql, params=None, max_rows=10000):
    """Simulate pg_query without a server by dispatching on the SQL text.

    Three query shapes are recognised: the information_schema column
    introspection, the total-row count(*), and a per-column aggregate query
    (identified by the column's double-quoted identifier). Anything else
    raises AssertionError so an unexpected query fails the test loudly.
    """
    lowered = sql.lower()

    # 1) Column introspection.
    if "information_schema.columns" in lowered:
        return _ok_result(["column_name", "data_type"], list(_COLUMNS))

    # 2) Total row count.
    if "count(*) as n" in lowered:
        return _ok_result(["n"], [{"n": _N_ROWS}])

    # 3) Per-column aggregates: the first column (in dict order) whose quoted
    #    identifier appears in the original-case SQL wins.
    matched = next(
        (name for name in _AGG_BY_COL if f'"{name}"' in sql),
        None,
    )
    if matched is None:
        raise AssertionError(f"SQL inesperado en fake pg_query: {sql}")

    aggregates = _AGG_BY_COL[matched]
    return _ok_result(list(aggregates), [aggregates.copy()])


@pytest.fixture(autouse=True)
def patch_pg_query(monkeypatch):
    """Swap the pg_query imported by the module for the simulated version."""
    monkeypatch.setattr(mod, "pg_query", _fake_pg_query)


def test_shape_y_metadatos_tabla():
    """The TableProfile exposes every expected key and the table metadata."""
    result = summarize_table_pg("postgresql://x/y", "ventas")
    assert result["status"] == "ok"

    profile = result["profile"]
    table_keys = (
        "table",
        "source",
        "profiled_at",
        "n_rows",
        "n_cols",
        "size_bytes",
        "duplicate_rows",
        "duplicate_pct",
        "constant_cols",
        "all_null_cols",
        "null_cell_pct",
        "type_breakdown",
        "columns",
        "correlations",
        "key_candidates",
        "quality_score",
        "llm",
        "models",
    )
    for key in table_keys:
        assert key in profile, f"falta clave {key} en TableProfile"

    expected_metadata = {
        "table": "ventas",
        "source": "postgres",
        "n_rows": 4,
        "n_cols": 4,
    }
    for field, expected in expected_metadata.items():
        assert profile[field] == expected
    assert len(profile["columns"]) == 4
    assert profile["key_candidates"] == []

    # Fields expected to be left unset (None) in this profile.
    for unset_field in ("quality_score", "correlations", "models", "llm"):
        assert profile[unset_field] is None


def test_column_profile_shape():
    """Each ColumnProfile has the expected keys, types, flags and numeric stats."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    columns_by_name = {entry["name"]: entry for entry in profile["columns"]}

    column_keys = (
        "name",
        "physical_type",
        "inferred_type",
        "semantic_type",
        "count",
        "n_rows",
        "null_count",
        "null_pct",
        "empty_count",
        "empty_pct",
        "distinct_count",
        "unique_pct",
        "flags",
        "quality_score",
        "numeric",
        "categorical",
        "datetime",
    )
    for col in profile["columns"]:
        for key in column_keys:
            assert key in col, f"falta clave {key} en ColumnProfile {col['name']}"
        assert col["semantic_type"] == ""
        for unset_field in ("quality_score", "categorical", "datetime"):
            assert col[unset_field] is None

    # id: numeric, no nulls, unique -> possible_id.
    id_col = columns_by_name["id"]
    expected_id = {
        "inferred_type": "numeric",
        "null_count": 0,
        "count": 4,
        "distinct_count": 4,
        "unique_pct": 1.0,
    }
    for field, expected in expected_id.items():
        assert id_col[field] == expected
    assert "possible_id" in id_col["flags"]

    # region: low-cardinality categorical.
    region_col = columns_by_name["region"]
    assert region_col["inferred_type"] == "categorical"
    assert region_col["distinct_count"] == 3
    assert region_col["numeric"] is None

    # total: numeric with one null, so the non-null count is 3.
    total_col = columns_by_name["total"]
    assert total_col["inferred_type"] == "numeric"
    assert total_col["null_count"] == 1
    assert total_col["count"] == 3

    stats = total_col["numeric"]
    assert stats is not None
    assert stats["min"] == pytest.approx(45.25)
    assert stats["max"] == pytest.approx(120.5)
    assert stats["mean"] is not None
    assert stats["std"] is not None
    for quantile, expected in (("p25", 62.625), ("p50", 80.0), ("p75", 100.25)):
        assert stats[quantile] == pytest.approx(expected)
    # Fine-grained keys stay None (another function of the eda group fills
    # them in).
    for fine_key in ("skew", "kurtosis", "histogram", "p99"):
        assert stats[fine_key] is None

    # pais: constant -> `constant` flag and listed in constant_cols.
    assert "constant" in columns_by_name["pais"]["flags"]
    assert "pais" in profile["constant_cols"]


def test_null_pct_total():
    """null_pct of `total` reflects one null over four rows."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    total_col = next(
        entry for entry in profile["columns"] if entry["name"] == "total"
    )
    # 1 null out of 4 rows.
    assert total_col["null_pct"] == pytest.approx(0.25)


def test_distinct_no_excede_filas():
    """No column reports more distinct values than rows, nor unique_pct > 1."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    row_total = profile["n_rows"]
    for entry in profile["columns"]:
        assert entry["distinct_count"] <= row_total
        assert entry["unique_pct"] <= 1.0


def test_type_breakdown():
    """type_breakdown has exactly the five type buckets with the right counts."""
    profile = summarize_table_pg("postgresql://x/y", "ventas")["profile"]
    breakdown = profile["type_breakdown"]
    assert set(breakdown) == {
        "numeric", "categorical", "datetime", "text", "boolean"
    }

    expected_counts = (
        ("numeric", 2),      # id, total
        ("categorical", 2),  # region, pais
        ("datetime", 0),
        ("boolean", 0),
        ("text", 0),
    )
    for bucket, expected in expected_counts:
        assert breakdown[bucket] == expected


def test_tabla_invalida_devuelve_error():
    """A table name that is not a plain identifier yields an error result."""
    outcome = summarize_table_pg("postgresql://x/y", "ventas; DROP TABLE ventas")
    assert outcome["status"] == "error"
    assert "invalido" in outcome["error"]


def test_schema_invalido_devuelve_error():
    """A schema name containing a space yields an error mentioning the schema."""
    outcome = summarize_table_pg("postgresql://x/y", "ventas", schema="pub lic")
    assert outcome["status"] == "error"
    assert "schema" in outcome["error"]


def test_tabla_inexistente_devuelve_error(monkeypatch):
    """information_schema returning no rows -> error (table not found)."""

    def empty_pg_query(dsn, sql, params=None, max_rows=10000):
        # Only the introspection query is expected; anything else is a bug.
        if "information_schema.columns" not in sql.lower():
            raise AssertionError("no deberia llegar aqui")
        return _ok_result(["column_name", "data_type"], [])

    monkeypatch.setattr(mod, "pg_query", empty_pg_query)
    outcome = summarize_table_pg("postgresql://x/y", "no_existe")
    assert outcome["status"] == "error"
    assert "no encontrada" in outcome["error"]


def test_error_de_lectura_pg_se_propaga(monkeypatch):
    """If pg_query errors on the count, summarize returns it as an error dict."""

    def failing_count(dsn, sql, params=None, max_rows=10000):
        lowered = sql.lower()
        if "information_schema.columns" in lowered:
            return _ok_result(["column_name", "data_type"], list(_COLUMNS))
        if "count(*) as n" in lowered:
            return {"status": "error", "error": "connection refused"}
        # The aggregate queries must never run once the count has failed.
        raise AssertionError("no deberia llegar a los agregados")

    monkeypatch.setattr(mod, "pg_query", failing_count)
    outcome = summarize_table_pg("postgresql://x/y", "ventas")
    assert outcome["status"] == "error"
    assert "connection refused" in outcome["error"]