chore: auto-commit (26 archivos)
- python/functions/bigquery/bq_auth.md - python/functions/bigquery/bq_load_from_file.md - python/functions/bigquery/bq_load_from_gcs.md - python/functions/bigquery/client.py - python/functions/bigquery/queries.py - python/functions/datascience/__init__.py - python/functions/datascience/decode_qr_image.py - python/functions/datascience/load_bq_table_to_duckdb.md - python/functions/datascience/load_bq_table_to_duckdb.py - python/functions/pipelines/profile_bq_table.md - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
"""Tests for build_column_dictionary.

Verifies the flattening of an eda-group DatabaseProfile into a searchable
column dictionary: one entry per column, PII flagging from the semantic_type,
detection of columns shared by name (join keys), defensive reading, and
that the function is pure (does not mutate its input).
"""
|
||||
|
||||
import copy
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
from build_column_dictionary import build_column_dictionary
|
||||
|
||||
|
||||
def _col(name, inferred_type="categorical", semantic_type="", null_pct=0.0,
|
||||
distinct_count=10, categorical=None) -> dict:
|
||||
"""ColumnProfile minimo con las claves del contrato eda usadas por la funcion."""
|
||||
return {
|
||||
"name": name,
|
||||
"physical_type": "VARCHAR",
|
||||
"inferred_type": inferred_type,
|
||||
"semantic_type": semantic_type,
|
||||
"null_pct": null_pct,
|
||||
"distinct_count": distinct_count,
|
||||
"flags": [],
|
||||
"numeric": None,
|
||||
"categorical": categorical,
|
||||
"datetime": None,
|
||||
}
|
||||
|
||||
|
||||
def _db_profile() -> dict:
    """Build a toy DatabaseProfile with two tables sharing one column name.

    Table ``clientes`` has columns ``customer_id``, ``email`` and ``ciudad``
    (the only one carrying a categorical block); table ``pedidos`` has
    ``customer_id`` and ``iban``. ``customer_id`` appears in both tables.

    Returns:
        A freshly built dict on every call, so tests can mutate or compare
        it without affecting each other.
    """
    return {
        "db_path": "toy.duckdb",
        "n_tables": 2,
        "table_profiles": [
            {
                "table": "clientes",
                "columns": [
                    _col("customer_id", "numeric", "", 0.0, 1000),
                    _col("email", "text", "email", 0.05, 990),
                    _col(
                        "ciudad",
                        "categorical",
                        "",
                        0.0,
                        3,
                        categorical={
                            "top": [
                                {"value": "Madrid", "count": 5, "pct": 0.5},
                                {"value": "Bilbao", "count": 3, "pct": 0.3},
                            ]
                        },
                    ),
                ],
            },
            {
                "table": "pedidos",
                "columns": [
                    _col("customer_id", "numeric", "", 0.0, 800),
                    _col("iban", "text", "iban", 0.1, 795),
                ],
            },
        ],
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Golden
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_golden_flattens_two_tables():
    """Golden path: the two-table fixture flattens into five entries."""
    res = build_column_dictionary(_db_profile())
    assert res["status"] == "ok"
    assert res["n_tables"] == 2
    assert res["n_columns"] == 5
    # One entry per column, each carrying at least the contract keys.
    keys = {
        "table", "column", "inferred_type", "semantic_type",
        "is_pii", "null_pct", "n_distinct", "top_values",
    }
    for e in res["entries"]:
        assert keys.issubset(e.keys())
    # The markdown contains the columns table and the join-key section.
    assert "## Columnas" in res["markdown"]
    assert "candidatas a join key" in res["markdown"]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# PII desde el semantic_type real del grupo
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_pii_flagged_from_semantic_type():
    """Columns with semantic_type ``email`` / ``iban`` are reported as PII."""
    res = build_column_dictionary(_db_profile())
    pii_cols = {(e["table"], e["column"]) for e in res["pii_columns"]}
    assert ("clientes", "email") in pii_cols
    assert ("pedidos", "iban") in pii_cols
    # customer_id / ciudad are NOT PII.
    assert ("clientes", "customer_id") not in pii_cols
    assert ("clientes", "ciudad") not in pii_cols
    # is_pii in entries must agree with the pii_columns list (same order).
    assert res["pii_columns"] == [e for e in res["entries"] if e["is_pii"]]
|
||||
|
||||
|
||||
def test_empty_semantic_type_maps_to_none_and_not_pii():
    """An empty-string semantic_type yields ``None`` and ``is_pii`` False."""
    res = build_column_dictionary(_db_profile())
    ciudad = next(
        e for e in res["entries"]
        if e["table"] == "clientes" and e["column"] == "ciudad"
    )
    assert ciudad["semantic_type"] is None
    assert ciudad["is_pii"] is False
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Columnas compartidas por nombre = candidatas a join key
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_shared_column_names_detected_as_join_keys():
    """A column name present in both tables is listed as a join-key candidate."""
    res = build_column_dictionary(_db_profile())
    md = res["markdown"]
    # Everything before the "## Columnas" heading is treated as the join-key
    # section; customer_id is in both tables, so it must be listed there.
    join_section = md.split("## Columnas\n")[0]
    assert "customer_id" in join_section
    assert "clientes" in join_section and "pedidos" in join_section
    # email lives in a single table -> absent from the join-key section.
    assert "email" not in join_section
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# top_values desde el bloque categorical
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_top_values_from_categorical_block():
    """top_values lists the ``value`` of each categorical ``top`` item."""
    res = build_column_dictionary(_db_profile())
    ciudad = next(e for e in res["entries"] if e["column"] == "ciudad")
    assert ciudad["top_values"] == ["Madrid", "Bilbao"]
    # Columns without a categorical block -> None.
    email = next(e for e in res["entries"] if e["column"] == "email")
    assert email["top_values"] is None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Entrada vacia / malformada -> resultado vacio en ok
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_empty_profile_returns_empty_ok():
    """An empty dict produces the exact empty-but-ok result."""
    empty = build_column_dictionary({})
    assert empty == {
        "status": "ok", "n_tables": 0, "n_columns": 0,
        "entries": [], "pii_columns": [], "markdown": "",
    }
|
||||
|
||||
|
||||
def test_malformed_input_returns_empty_ok():
    """Non-dict inputs and a non-list ``table_profiles`` give an empty ok result."""
    for bad in (None, [], "nope", 42, {"table_profiles": "x"}):
        res = build_column_dictionary(bad)
        # The message names the offending input, since the loop would
        # otherwise hide which case failed.
        assert res["status"] == "ok", repr(bad)
        assert res["n_columns"] == 0, repr(bad)
        assert res["entries"] == [], repr(bad)
        assert res["markdown"] == "", repr(bad)
|
||||
|
||||
|
||||
def test_missing_keys_read_defensively():
    """Table profiles and columns with missing or junk keys do not break."""
    profile = {
        "table_profiles": [
            {"table": "t1", "columns": [{"name": "a"}, "no-dict", None]},
            "no-dict",
            {"table": "t2"},  # no columns
            {"columns": [{}]},  # no table name, empty column
        ]
    }
    res = build_column_dictionary(profile)
    assert res["status"] == "ok"
    # t1 (1 valid dict column; "no-dict" and None are skipped) + the table
    # without a name (1 column, {}). t2 has no columns -> not counted as a
    # table.
    assert res["n_tables"] == 2
    assert res["n_columns"] == 2
    a = next(e for e in res["entries"] if e["column"] == "a")
    assert a["semantic_type"] is None
    assert a["null_pct"] is None
    assert a["n_distinct"] is None
    assert a["top_values"] is None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Pureza
|
||||
# --------------------------------------------------------------------------- #
|
||||
def test_does_not_mutate_input():
    """Purity: the profile passed in compares equal to a deep copy taken before the call."""
    profile = _db_profile()
    snapshot = copy.deepcopy(profile)
    build_column_dictionary(profile)
    assert profile == snapshot
|
||||
Reference in New Issue
Block a user