5a4f82cf76
- python/functions/bigquery/bq_auth.md - python/functions/bigquery/bq_load_from_file.md - python/functions/bigquery/bq_load_from_gcs.md - python/functions/bigquery/client.py - python/functions/bigquery/queries.py - python/functions/datascience/__init__.py - python/functions/datascience/decode_qr_image.py - python/functions/datascience/load_bq_table_to_duckdb.md - python/functions/datascience/load_bq_table_to_duckdb.py - python/functions/pipelines/profile_bq_table.md - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
194 lines
7.1 KiB
Python
194 lines
7.1 KiB
Python
"""Tests for build_column_dictionary.

Checks the flattening of an eda-group DatabaseProfile into a searchable column
dictionary: one entry per column, the PII flag derived from semantic_type,
detection of columns shared by name (join keys), defensive reading of the
input, and that the function is pure (it does not mutate its input).
"""
|
|
|
|
import copy
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
|
|
|
from build_column_dictionary import build_column_dictionary
|
|
|
|
|
|
def _col(name, inferred_type="categorical", semantic_type="", null_pct=0.0,
|
|
distinct_count=10, categorical=None) -> dict:
|
|
"""ColumnProfile minimo con las claves del contrato eda usadas por la funcion."""
|
|
return {
|
|
"name": name,
|
|
"physical_type": "VARCHAR",
|
|
"inferred_type": inferred_type,
|
|
"semantic_type": semantic_type,
|
|
"null_pct": null_pct,
|
|
"distinct_count": distinct_count,
|
|
"flags": [],
|
|
"numeric": None,
|
|
"categorical": categorical,
|
|
"datetime": None,
|
|
}
|
|
|
|
|
|
def _db_profile() -> dict:
    """Return a toy DatabaseProfile: two tables sharing the column customer_id.

    "clientes" has customer_id, email and ciudad (the only column with a
    categorical block); "pedidos" has customer_id and iban.
    """
    ciudad_categorical = {
        "top": [
            {"value": "Madrid", "count": 5, "pct": 0.5},
            {"value": "Bilbao", "count": 3, "pct": 0.3},
        ]
    }
    clientes = {
        "table": "clientes",
        "columns": [
            _col("customer_id", "numeric", "", 0.0, 1000),
            _col("email", "text", "email", 0.05, 990),
            _col("ciudad", "categorical", "", 0.0, 3,
                 categorical=ciudad_categorical),
        ],
    }
    pedidos = {
        "table": "pedidos",
        "columns": [
            _col("customer_id", "numeric", "", 0.0, 800),
            _col("iban", "text", "iban", 0.1, 795),
        ],
    }
    tables = [clientes, pedidos]
    return {
        "db_path": "toy.duckdb",
        "n_tables": len(tables),
        "table_profiles": tables,
    }
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Golden
|
|
# --------------------------------------------------------------------------- #
|
|
def test_golden_flattens_two_tables():
    """The toy profile flattens to 2 tables / 5 columns with full entries."""
    result = build_column_dictionary(_db_profile())

    assert result["status"] == "ok"
    assert result["n_tables"] == 2
    assert result["n_columns"] == 5

    # One entry per column, each carrying at least the contract keys.
    required_keys = {
        "table", "column", "inferred_type", "semantic_type",
        "is_pii", "null_pct", "n_distinct", "top_values",
    }
    for entry in result["entries"]:
        assert not required_keys.difference(entry.keys())

    # The markdown contains the columns table and the join-key section.
    markdown = result["markdown"]
    for fragment in ("## Columnas", "candidatas a join key"):
        assert fragment in markdown
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# PII desde el semantic_type real del grupo
|
|
# --------------------------------------------------------------------------- #
|
|
def test_pii_flagged_from_semantic_type():
    """email and iban are reported as PII; customer_id and ciudad are not."""
    result = build_column_dictionary(_db_profile())

    flagged = set()
    for entry in result["pii_columns"]:
        flagged.add((entry["table"], entry["column"]))

    expected_pii = {("clientes", "email"), ("pedidos", "iban")}
    assert expected_pii.issubset(flagged)

    # customer_id / ciudad must NOT be reported as PII.
    expected_clean = {("clientes", "customer_id"), ("clientes", "ciudad")}
    assert flagged.isdisjoint(expected_clean)

    # pii_columns must be exactly the entries whose is_pii flag is truthy.
    pii_from_entries = list(
        filter(lambda entry: entry["is_pii"], result["entries"])
    )
    assert result["pii_columns"] == pii_from_entries
|
|
|
|
|
|
def test_empty_semantic_type_maps_to_none_and_not_pii():
    """An empty semantic_type comes out as None and the column is not PII."""
    entries = build_column_dictionary(_db_profile())["entries"]

    def is_clientes_ciudad(entry):
        return entry["table"] == "clientes" and entry["column"] == "ciudad"

    ciudad = next(filter(is_clientes_ciudad, entries))
    assert ciudad["semantic_type"] is None
    assert ciudad["is_pii"] is False
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Columnas compartidas por nombre = candidatas a join key
|
|
# --------------------------------------------------------------------------- #
|
|
def test_shared_column_names_detected_as_join_keys():
    """customer_id (present in both tables) is listed as a join-key candidate."""
    markdown = build_column_dictionary(_db_profile())["markdown"]

    # Everything before the "## Columnas" heading is treated as the join-key
    # section of the document.
    join_section = markdown.partition("## Columnas\n")[0]

    # customer_id appears in both tables -> listed together with both names.
    assert "customer_id" in join_section
    assert all(table in join_section for table in ("clientes", "pedidos"))

    # email lives in a single table -> must not show up in that section.
    assert "email" not in join_section
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# top_values desde el bloque categorical
|
|
# --------------------------------------------------------------------------- #
|
|
def test_top_values_from_categorical_block():
    """top_values lists the categorical block's values, or is None without one."""
    entries = build_column_dictionary(_db_profile())["entries"]

    def first_entry(column):
        return next(entry for entry in entries if entry["column"] == column)

    assert first_entry("ciudad")["top_values"] == ["Madrid", "Bilbao"]

    # Columns without a categorical block -> None.
    assert first_entry("email")["top_values"] is None
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Entrada vacia / malformada -> resultado vacio en ok
|
|
# --------------------------------------------------------------------------- #
|
|
def test_empty_profile_returns_empty_ok():
    """An empty dict yields the empty result with status "ok"."""
    expected = dict(
        status="ok",
        n_tables=0,
        n_columns=0,
        entries=[],
        pii_columns=[],
        markdown="",
    )
    assert build_column_dictionary({}) == expected
|
|
|
|
|
|
def test_malformed_input_returns_empty_ok():
    """Non-dict inputs and a non-list table_profiles give an empty ok result."""
    malformed_inputs = (None, [], "nope", 42, {"table_profiles": "x"})

    # map() is lazy, so each input is processed and checked in turn.
    for result in map(build_column_dictionary, malformed_inputs):
        assert result["status"] == "ok"
        assert result["n_columns"] == 0
        assert result["entries"] == []
        assert result["markdown"] == ""
|
|
|
|
|
|
def test_missing_keys_read_defensively():
    """Table/column profiles with missing or junk keys must not break the call."""
    table_with_junk_columns = {
        "table": "t1",
        "columns": [{"name": "a"}, "no-dict", None],
    }
    table_without_columns = {"table": "t2"}
    unnamed_table_with_empty_column = {"columns": [{}]}
    profile = {
        "table_profiles": [
            table_with_junk_columns,
            "no-dict",
            table_without_columns,
            unnamed_table_with_empty_column,
        ]
    }

    result = build_column_dictionary(profile)
    assert result["status"] == "ok"

    # t1 contributes one valid dict column ("no-dict" and None are skipped)
    # and the table without a "table" key contributes one ({}). t2 has no
    # columns, so it is not counted as a table.
    assert result["n_tables"] == 2
    assert result["n_columns"] == 2

    column_a = next(
        filter(lambda entry: entry["column"] == "a", result["entries"])
    )
    for field in ("semantic_type", "null_pct", "n_distinct", "top_values"):
        assert column_a[field] is None
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# Pureza
|
|
# --------------------------------------------------------------------------- #
|
|
def test_does_not_mutate_input():
    """The input profile compares equal to a deep copy taken before the call."""
    original = _db_profile()
    pristine = copy.deepcopy(original)

    build_column_dictionary(original)

    assert original == pristine
|