"""Trace the recursive lineage of the customer_marts views down to source tables.

For each object: fetch its type (VIEW / BASE TABLE / EXTERNAL / MATERIALIZED
VIEW) and its DDL from INFORMATION_SCHEMA.TABLES, extract the references to
other tables from the DDL, and recurse into the referenced objects (only views
contribute further references). The complete graph is dumped to a JSON file in
scratchpad.
"""

import json
import re
import sys
import warnings

# Silence all warnings before the Google client libraries are imported.
warnings.filterwarnings("ignore")

import google.auth
from google.cloud import bigquery

PROJECT = "autingo-159109"

creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/bigquery"])
# Return a copy of the default credentials with the quota project set to None.
creds = creds.with_quota_project(None)
client = bigquery.Client(project=PROJECT, credentials=creds)

# Per-dataset metadata cache: {dataset: {table_name: {"type": ..., "ddl": ...}}}
dataset_cache: dict[str, dict] = {}


def load_dataset(dataset: str) -> dict:
    """Load every table/view of a dataset (one query per dataset).

    Args:
        dataset: Dataset name inside ``PROJECT``.

    Returns:
        A mapping ``table_name -> {"type": table_type, "ddl": ddl_or_empty}``.
        If the query fails, a warning is written to stderr and whatever was
        collected so far (possibly nothing) is cached and returned, so the
        same dataset is never queried twice.
    """
    cached = dataset_cache.get(dataset)
    if cached is not None:
        return cached

    tables: dict[str, dict] = {}
    try:
        sql = f"""
            SELECT table_name, table_type, ddl
            FROM `{PROJECT}`.`{dataset}`.INFORMATION_SCHEMA.TABLES
        """
        for row in client.query(sql).result():
            tables[row.table_name] = {"type": row.table_type, "ddl": row.ddl or ""}
    except Exception as e:  # noqa: BLE001
        # Best effort: an unreadable dataset must not abort the whole traversal.
        print(f" [warn] no se pudo leer dataset {dataset}: {e}", file=sys.stderr)

    dataset_cache[dataset] = tables
    return tables


# In the DDL emitted by INFORMATION_SCHEMA, references to other tables ALWAYS
# appear inside backticks and fully qualified: `project.dataset.table`. CTE/JOIN
# aliases (dp, fcp, f...) never carry backticks, so restricting the search to
# backtick-quoted text removes all the noise.
BACKTICK_RE = re.compile(r"`([^`]+)`")

# Variant with each part in its own backticks: `proj`.`dataset`.`table`
MULTIPART_RE = re.compile(
    r"`([A-Za-z0-9_-]+)`\.`([A-Za-z0-9_-]+)`(?:\.`([A-Za-z0-9_-]+)`)?"
)


def _norm(proj: str, ds: str, tbl: str) -> tuple[str, str] | None:
    """Normalize a parsed reference to ``(dataset, table)``.

    Returns ``None`` for INFORMATION_SCHEMA references, which are metadata
    views rather than lineage nodes.

    NOTE(review): ``proj`` is accepted but not used, so the project component
    is dropped from the result and ``load_dataset`` later resolves every
    reference against ``PROJECT``.
    """
    if "INFORMATION_SCHEMA" in (ds.upper(), tbl.upper()):
        return None
    return (ds, tbl)


def extract_refs(ddl: str) -> set[tuple[str, str]]:
    """Return the set of ``(dataset, table)`` pairs referenced in the DDL body.

    Only the text after the first ``AS`` keyword (the SELECT) is scanned, so
    the name of the object being defined is not captured. If no ``AS`` is
    found, the whole DDL is scanned.
    """
    as_match = re.search(r"\bAS\b", ddl, flags=re.IGNORECASE)
    body = ddl[as_match.end():] if as_match else ddl

    found: set[tuple[str, str]] = set()

    # Style `project.dataset.table` (everything inside a single backtick pair).
    for token in BACKTICK_RE.findall(body):
        pieces = [piece for piece in token.split(".") if piece]
        if len(pieces) == 3:
            candidate = _norm(pieces[0], pieces[1], pieces[2])
        elif len(pieces) == 2:
            # `dataset.table` without a project: assume the current project.
            candidate = _norm(PROJECT, pieces[0], pieces[1])
        else:
            continue
        if candidate is not None:
            found.add(candidate)

    # Style `proj`.`dataset`.`table` (one backtick pair per part, 3 parts).
    # CAREFUL: `alias`.`column` (2 parts, each in its own backticks) is a
    # column reference, NOT a table reference; it is discarded by requiring
    # all 3 parts.
    for match in MULTIPART_RE.finditer(body):
        proj_part, ds_part, tbl_part = match.groups()
        if not tbl_part:
            continue
        candidate = _norm(proj_part, ds_part, tbl_part)
        if candidate is not None:
            found.add(candidate)

    return found


graph: dict[str, dict] = {}  # key "dataset.table" -> {type, ddl, refs: [...], depth}
visited: set[str] = set()


def visit(dataset: str, table: str, depth: int = 0) -> None:
    """Record ``dataset.table`` in ``graph`` and recurse into its references.

    Each object is processed once; ``depth`` is stored as the depth at which
    the object was first reached in this depth-first walk. Objects missing
    from their dataset's metadata are recorded with type ``"UNKNOWN"``.
    References are only extracted from VIEW / MATERIALIZED VIEW DDL.

    Args:
        dataset: Dataset of the object to visit.
        table: Table or view name within ``dataset``.
        depth: Distance from the seed along the current traversal path.
    """
    key = f"{dataset}.{table}"
    if key in visited:
        return
    visited.add(key)

    meta = load_dataset(dataset).get(table)
    if meta is None:
        graph[key] = {"type": "UNKNOWN", "ddl": "", "refs": [], "depth": depth}
        return

    object_type = meta["type"]
    ddl = meta["ddl"]

    children: list[tuple[str, str]] = []
    if object_type in ("VIEW", "MATERIALIZED VIEW"):
        # Sorted for deterministic output; skip self-references.
        children = [
            pair for pair in sorted(extract_refs(ddl)) if pair != (dataset, table)
        ]

    graph[key] = {
        "type": object_type,
        "ddl": ddl,
        "refs": [f"{child_ds}.{child_tbl}" for child_ds, child_tbl in children],
        "depth": depth,
    }

    for child_ds, child_tbl in children:
        visit(child_ds, child_tbl, depth + 1)


# Seeds: the 14 customer_marts views.
SEEDS = [
    "customer_brand_affinity",
    "customer_category_spend",
    "customer_channel",
    "customer_contactability",
    "customer_monetary",
    "customer_payment_method",
    "customer_predictive",
    "customer_product",
    "customer_profile",
    "customer_promo_tolerance",
    "customer_promo_usage",
    "customer_store_spend",
    "customer_temporal",
    "customer_vehicles",
]

# Walk the lineage of every seed view; `visit` fills the module-level `graph`.
for s in SEEDS:
    visit("customer_marts", s, 0)

out = {
    "project": PROJECT,
    "seeds": [f"customer_marts.{s}" for s in SEEDS],
    "graph": graph,
}

# Local import: only this output step needs it.
import os  # noqa: E402

# Create the output directory up front so a missing folder cannot discard the
# result of the whole traversal.
os.makedirs("scratchpad", exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly instead of relying on the locale default.
with open("scratchpad/lineage_graph.json", "w", encoding="utf-8") as f:
    json.dump(out, f, indent=2, ensure_ascii=False)

# Summary: tally the object types in a single pass over the graph.
type_counts: dict[str, int] = {}
for node in graph.values():
    type_counts[node["type"]] = type_counts.get(node["type"], 0) + 1

n_view = type_counts.get("VIEW", 0) + type_counts.get("MATERIALIZED VIEW", 0)
n_base = type_counts.get("BASE TABLE", 0)
n_ext = type_counts.get("EXTERNAL", 0)
n_unk = type_counts.get("UNKNOWN", 0)
print(f"objetos totales: {len(graph)} vistas: {n_view} base: {n_base} external: {n_ext} desconocidos: {n_unk}")

print("\n== objetos por dataset ==")
by_ds: dict[str, int] = {}
for k in graph:
    ds = k.split(".", 1)[0]
    by_ds[ds] = by_ds.get(ds, 0) + 1
# Largest datasets first; ties keep first-seen order (sorted() is stable).
for ds, n in sorted(by_ds.items(), key=lambda x: -x[1]):
    print(f" {n:3d} {ds}")