Files
fn_registry/python/functions/datascience/build_column_dictionary_test.py
T
egutierrez 5a4f82cf76 chore: auto-commit (26 archivos)
- python/functions/bigquery/bq_auth.md
- python/functions/bigquery/bq_load_from_file.md
- python/functions/bigquery/bq_load_from_gcs.md
- python/functions/bigquery/client.py
- python/functions/bigquery/queries.py
- python/functions/datascience/__init__.py
- python/functions/datascience/decode_qr_image.py
- python/functions/datascience/load_bq_table_to_duckdb.md
- python/functions/datascience/load_bq_table_to_duckdb.py
- python/functions/pipelines/profile_bq_table.md
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-02 19:00:13 +02:00

194 lines
7.1 KiB
Python

"""Tests para build_column_dictionary.
Verifica el aplanado de un DatabaseProfile del grupo eda a un diccionario de
columnas buscable: entradas por columna, marca de PII desde el semantic_type,
deteccion de columnas compartidas por nombre (join keys), lectura defensiva y
que la funcion es pura (no muta el input).
"""
import copy
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
from build_column_dictionary import build_column_dictionary
def _col(name, inferred_type="categorical", semantic_type="", null_pct=0.0,
distinct_count=10, categorical=None) -> dict:
"""ColumnProfile minimo con las claves del contrato eda usadas por la funcion."""
return {
"name": name,
"physical_type": "VARCHAR",
"inferred_type": inferred_type,
"semantic_type": semantic_type,
"null_pct": null_pct,
"distinct_count": distinct_count,
"flags": [],
"numeric": None,
"categorical": categorical,
"datetime": None,
}
def _db_profile() -> dict:
    """Return a toy DatabaseProfile: two tables sharing one column name.

    ``clientes`` has ``customer_id``, ``email`` and ``ciudad`` (the latter
    with a categorical ``top`` block); ``pedidos`` has ``customer_id`` and
    ``iban``. A fresh structure is built on every call.
    """
    ciudad_top = {
        "top": [
            {"value": "Madrid", "count": 5, "pct": 0.5},
            {"value": "Bilbao", "count": 3, "pct": 0.3},
        ]
    }
    clientes = {
        "table": "clientes",
        "columns": [
            _col("customer_id", inferred_type="numeric", semantic_type="",
                 null_pct=0.0, distinct_count=1000),
            _col("email", inferred_type="text", semantic_type="email",
                 null_pct=0.05, distinct_count=990),
            _col("ciudad", inferred_type="categorical", semantic_type="",
                 null_pct=0.0, distinct_count=3, categorical=ciudad_top),
        ],
    }
    pedidos = {
        "table": "pedidos",
        "columns": [
            # Same column name as in "clientes": the shared-name case.
            _col("customer_id", inferred_type="numeric", semantic_type="",
                 null_pct=0.0, distinct_count=800),
            _col("iban", inferred_type="text", semantic_type="iban",
                 null_pct=0.1, distinct_count=795),
        ],
    }
    return {
        "db_path": "toy.duckdb",
        "n_tables": 2,
        "table_profiles": [clientes, pedidos],
    }
# --------------------------------------------------------------------------- #
# Golden
# --------------------------------------------------------------------------- #
def test_golden_flattens_two_tables():
    """Golden path: the two-table fixture flattens into five entries."""
    res = build_column_dictionary(_db_profile())
    assert res["status"] == "ok"
    assert res["n_tables"] == 2
    assert res["n_columns"] == 5
    # One entry per column, each carrying the contract keys.
    required_keys = {
        "table", "column", "inferred_type", "semantic_type",
        "is_pii", "null_pct", "n_distinct", "top_values",
    }
    for entry in res["entries"]:
        assert required_keys.issubset(entry.keys())
    # The markdown contains the columns table and the join-key section.
    assert "## Columnas" in res["markdown"]
    assert "candidatas a join key" in res["markdown"]
# --------------------------------------------------------------------------- #
# PII desde el semantic_type real del grupo
# --------------------------------------------------------------------------- #
def test_pii_flagged_from_semantic_type():
    """Columns with semantic_type email / iban are listed in pii_columns."""
    res = build_column_dictionary(_db_profile())
    pii_cols = {(e["table"], e["column"]) for e in res["pii_columns"]}
    assert ("clientes", "email") in pii_cols
    assert ("pedidos", "iban") in pii_cols
    # customer_id / ciudad are NOT PII.
    assert ("clientes", "customer_id") not in pii_cols
    assert ("clientes", "ciudad") not in pii_cols
    # is_pii on the entries must agree with the pii_columns list.
    assert res["pii_columns"] == [e for e in res["entries"] if e["is_pii"]]
def test_empty_semantic_type_maps_to_none_and_not_pii():
    """An empty-string semantic_type comes out as None and is not PII."""
    res = build_column_dictionary(_db_profile())
    ciudad = next(
        e for e in res["entries"]
        if e["table"] == "clientes" and e["column"] == "ciudad"
    )
    assert ciudad["semantic_type"] is None
    assert ciudad["is_pii"] is False
# --------------------------------------------------------------------------- #
# Columnas compartidas por nombre = candidatas a join key
# --------------------------------------------------------------------------- #
def test_shared_column_names_detected_as_join_keys():
    """A column name present in both tables is listed as a join-key candidate."""
    res = build_column_dictionary(_db_profile())
    md = res["markdown"]
    # customer_id appears in both tables -> listed in the join-key section.
    # NOTE(review): taking the text *before* "## Columnas" assumes the
    # join-key section is rendered ahead of the columns table -- confirm
    # against the markdown layout produced by build_column_dictionary.
    join_section = md.split("## Columnas\n")[0]
    assert "customer_id" in join_section
    assert "clientes" in join_section and "pedidos" in join_section
    # email lives in a single table -> must not show up in that section.
    assert "email" not in join_section
# --------------------------------------------------------------------------- #
# top_values desde el bloque categorical
# --------------------------------------------------------------------------- #
def test_top_values_from_categorical_block():
    """top_values is taken from the categorical block's "top" values."""
    res = build_column_dictionary(_db_profile())
    ciudad = next(e for e in res["entries"] if e["column"] == "ciudad")
    assert ciudad["top_values"] == ["Madrid", "Bilbao"]
    # Columns without a categorical block -> None.
    email = next(e for e in res["entries"] if e["column"] == "email")
    assert email["top_values"] is None
# --------------------------------------------------------------------------- #
# Entrada vacia / malformada -> resultado vacio en ok
# --------------------------------------------------------------------------- #
def test_empty_profile_returns_empty_ok():
    """An empty dict yields the exact empty result with status "ok"."""
    empty = build_column_dictionary({})
    assert empty == {
        "status": "ok",
        "n_tables": 0,
        "n_columns": 0,
        "entries": [],
        "pii_columns": [],
        "markdown": "",
    }
def test_malformed_input_returns_empty_ok():
    """Non-dict inputs and a non-list table_profiles yield an empty ok result."""
    for bad in (None, [], "nope", 42, {"table_profiles": "x"}):
        res = build_column_dictionary(bad)
        # repr(bad) as the message identifies which input failed.
        assert res["status"] == "ok", repr(bad)
        assert res["n_columns"] == 0, repr(bad)
        assert res["entries"] == [], repr(bad)
        assert res["markdown"] == "", repr(bad)
def test_missing_keys_read_defensively():
    """Missing keys and non-dict junk in the profile must not raise."""
    # TableProfiles and columns with missing keys / garbage must not break.
    profile = {
        "table_profiles": [
            {"table": "t1", "columns": [{"name": "a"}, "no-dict", None]},
            "no-dict",
            {"table": "t2"},  # no "columns" key
            {"columns": [{}]},  # no "table" key, empty column dict
        ]
    }
    res = build_column_dictionary(profile)
    assert res["status"] == "ok"
    # t1 (one valid dict column; "no-dict" and None are skipped) plus the
    # table without a "table" key (one {} column). t2 has no columns, so it
    # does not count as a table.
    assert res["n_tables"] == 2
    assert res["n_columns"] == 2
    a = next(e for e in res["entries"] if e["column"] == "a")
    assert a["semantic_type"] is None
    assert a["null_pct"] is None
    assert a["n_distinct"] is None
    assert a["top_values"] is None
# --------------------------------------------------------------------------- #
# Pureza
# --------------------------------------------------------------------------- #
def test_does_not_mutate_input():
    """Purity: the profile passed in is equal to a deep copy taken before the call."""
    profile = _db_profile()
    snapshot = copy.deepcopy(profile)
    build_column_dictionary(profile)
    assert profile == snapshot