"""Tests for build_column_dictionary.

Checks the flattening of an eda-group DatabaseProfile into a searchable
column dictionary: one entry per column, the PII flag derived from
semantic_type, detection of columns shared by name (join keys), defensive
reading of malformed input, and that the function is pure (it does not
mutate its input).
"""

import copy
import os
import sys

# Make the module under test importable when it sits next to this file.
sys.path.insert(0, os.path.dirname(__file__))

from build_column_dictionary import build_column_dictionary  # noqa: E402


def _col(name, inferred_type="categorical", semantic_type="", null_pct=0.0,
         distinct_count=10, categorical=None) -> dict:
    """Return a minimal ColumnProfile dict with the keys these tests rely on."""
    column_profile = {
        "name": name,
        "physical_type": "VARCHAR",
        "inferred_type": inferred_type,
        "semantic_type": semantic_type,
        "null_pct": null_pct,
        "distinct_count": distinct_count,
        "flags": [],
        "numeric": None,
        "categorical": categorical,
        "datetime": None,
    }
    return column_profile


def _db_profile() -> dict:
    """Return a toy DatabaseProfile: two tables sharing a `customer_id` column."""
    ciudad_top = [
        {"value": "Madrid", "count": 5, "pct": 0.5},
        {"value": "Bilbao", "count": 3, "pct": 0.3},
    ]
    clientes = {
        "table": "clientes",
        "columns": [
            _col("customer_id", inferred_type="numeric", semantic_type="",
                 null_pct=0.0, distinct_count=1000),
            _col("email", inferred_type="text", semantic_type="email",
                 null_pct=0.05, distinct_count=990),
            _col("ciudad", inferred_type="categorical", semantic_type="",
                 null_pct=0.0, distinct_count=3,
                 categorical={"top": ciudad_top}),
        ],
    }
    pedidos = {
        "table": "pedidos",
        "columns": [
            _col("customer_id", inferred_type="numeric", semantic_type="",
                 null_pct=0.0, distinct_count=800),
            _col("iban", inferred_type="text", semantic_type="iban",
                 null_pct=0.1, distinct_count=795),
        ],
    }
    return {
        "db_path": "toy.duckdb",
        "n_tables": 2,
        "table_profiles": [clientes, pedidos],
    }


def _first_entry(result, column, table=None):
    """Return the first entry of `result` matching `column` (and `table`, if given).

    The scan stops at the first match, so later entries are never inspected.
    """
    return next(
        entry
        for entry in result["entries"]
        if (table is None or entry["table"] == table)
        and entry["column"] == column
    )


# --------------------------------------------------------------------------- #
# Golden
# --------------------------------------------------------------------------- #
def test_golden_flattens_two_tables():
    """The toy profile flattens to 2 tables / 5 columns with contract keys."""
    result = build_column_dictionary(_db_profile())

    assert result["status"] == "ok"
    assert result["n_tables"] == 2
    assert result["n_columns"] == 5

    # One entry per column, each carrying the contract keys.
    required_keys = {
        "table",
        "column",
        "inferred_type",
        "semantic_type",
        "is_pii",
        "null_pct",
        "n_distinct",
        "top_values",
    }
    assert all(
        required_keys.issubset(entry.keys()) for entry in result["entries"]
    )

    # The markdown holds both the column table and the join-key section.
    markdown = result["markdown"]
    assert "## Columnas" in markdown
    assert "candidatas a join key" in markdown


# --------------------------------------------------------------------------- #
# PII derived from the group's real semantic_type
# --------------------------------------------------------------------------- #
def test_pii_flagged_from_semantic_type():
    """Columns with a PII semantic_type are flagged; the others are not."""
    result = build_column_dictionary(_db_profile())
    flagged = {
        (entry["table"], entry["column"]) for entry in result["pii_columns"]
    }

    assert {("clientes", "email"), ("pedidos", "iban")} <= flagged
    # customer_id / ciudad are NOT PII.
    assert flagged.isdisjoint(
        {("clientes", "customer_id"), ("clientes", "ciudad")}
    )

    # is_pii on the entries must agree with the pii_columns list.
    pii_entries = [entry for entry in result["entries"] if entry["is_pii"]]
    assert result["pii_columns"] == pii_entries


def test_empty_semantic_type_maps_to_none_and_not_pii():
    """An empty semantic_type is surfaced as None and never marked as PII."""
    result = build_column_dictionary(_db_profile())
    ciudad = _first_entry(result, "ciudad", table="clientes")

    assert ciudad["semantic_type"] is None
    assert ciudad["is_pii"] is False


# --------------------------------------------------------------------------- #
# Columns shared by name = join-key candidates
# --------------------------------------------------------------------------- #
def test_shared_column_names_detected_as_join_keys():
    """A column name present in both tables is listed as a join-key candidate."""
    result = build_column_dictionary(_db_profile())

    # Everything before the column table is treated as the join-key section.
    join_section, _, _ = result["markdown"].partition("## Columnas\n")

    # customer_id appears in both tables -> listed in the join-key section.
    assert "customer_id" in join_section
    assert "clientes" in join_section
    assert "pedidos" in join_section
    # email lives in a single table -> absent from the join-key section.
    assert "email" not in join_section


# --------------------------------------------------------------------------- #
# top_values taken from the categorical block
# --------------------------------------------------------------------------- #
def test_top_values_from_categorical_block():
    """top_values mirrors the categorical block and is None without one."""
    result = build_column_dictionary(_db_profile())

    assert _first_entry(result, "ciudad")["top_values"] == ["Madrid", "Bilbao"]
    # Columns without a categorical block -> None.
    assert _first_entry(result, "email")["top_values"] is None


# --------------------------------------------------------------------------- #
# Empty / malformed input -> empty result with status ok
# --------------------------------------------------------------------------- #
def test_empty_profile_returns_empty_ok():
    """An empty profile yields the canonical empty result."""
    expected = {
        "status": "ok",
        "n_tables": 0,
        "n_columns": 0,
        "entries": [],
        "pii_columns": [],
        "markdown": "",
    }
    assert build_column_dictionary({}) == expected


def test_malformed_input_returns_empty_ok():
    """Non-dict or ill-typed input is tolerated and yields an empty result."""
    malformed_inputs = (None, [], "nope", 42, {"table_profiles": "x"})
    for candidate in malformed_inputs:
        result = build_column_dictionary(candidate)
        assert result["status"] == "ok"
        assert result["n_columns"] == 0
        assert result["entries"] == []
        assert result["markdown"] == ""


def test_missing_keys_read_defensively():
    """TableProfiles / columns with missing keys or junk do not break the run."""
    tables = [
        {"table": "t1", "columns": [{"name": "a"}, "no-dict", None]},
        "no-dict",
        {"table": "t2"},  # no columns
        {"columns": [{}]},  # no table, empty column
    ]
    result = build_column_dictionary({"table_profiles": tables})

    assert result["status"] == "ok"
    # t1 (one valid dict column; "no-dict" and None are skipped) + the table
    # without a "table" key (one {} column). t2 has no columns -> not counted
    # as a table.
    assert result["n_tables"] == 2
    assert result["n_columns"] == 2

    col_a = _first_entry(result, "a")
    assert col_a["semantic_type"] is None
    assert col_a["null_pct"] is None
    assert col_a["n_distinct"] is None
    assert col_a["top_values"] is None


# --------------------------------------------------------------------------- #
# Purity
# --------------------------------------------------------------------------- #
def test_does_not_mutate_input():
    """The input profile is left untouched by the call."""
    profile = _db_profile()
    before = copy.deepcopy(profile)

    build_column_dictionary(profile)

    assert profile == before