"""Trace how the clientes_intel dataset is built.

For every table, fetch the SQL of the most recent job that wrote it
(INFORMATION_SCHEMA.JOBS) together with its referenced_tables, then walk
backwards until reaching the source tables (TPV, customers, users, Navision,
Salesforce). Everything is dumped to scratchpad/intel_build.json.
"""
import json
import warnings

# Installed ahead of the Google imports below so the filter is already active
# when those libraries load.
warnings.filterwarnings("ignore")

import google.auth
from google.cloud import bigquery

PROJECT = "autingo-159109"
REGION = "region-europe-west1"

creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/bigquery"])
creds = creds.with_quota_project(None)
c = bigquery.Client(project=PROJECT, credentials=creds)

# Latest job per destination table in clientes_intel: query text,
# referenced_tables and statement type. Only successfully finished jobs from
# the last 120 days are considered; rn = 1 keeps the newest one per table.
sql = f"""
WITH j AS (
  SELECT
    dest.table_id AS tbl,
    query,
    statement_type AS stmt,
    creation_time,
    ARRAY(
      SELECT AS STRUCT rt.project_id, rt.dataset_id, rt.table_id
      FROM UNNEST(referenced_tables) rt
    ) AS refs,
    ROW_NUMBER() OVER (PARTITION BY dest.table_id ORDER BY creation_time DESC) AS rn
  FROM `{PROJECT}`.`{REGION}`.INFORMATION_SCHEMA.JOBS_BY_PROJECT,
       UNNEST([destination_table]) dest
  WHERE dest.dataset_id = 'clientes_intel'
    AND state = 'DONE'
    AND error_result IS NULL
    AND statement_type IS NOT NULL
    AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 120 DAY)
)
SELECT tbl, query, stmt, creation_time, refs
FROM j
WHERE rn = 1
ORDER BY tbl
"""

# builds maps table name -> {"query", "stmt", "last_run", "refs"}, where refs
# is a sorted list of distinct "dataset.table" strings read by the build job.
builds = {}
for row in c.query(sql).result():
    # NOTE(review): project_id is dropped here, so a same-named dataset in
    # another project would be indistinguishable -- confirm that is acceptable.
    refs = {f"{rt['dataset_id']}.{rt['table_id']}" for rt in row.refs}
    # Drop only the table's reference to itself. A suffix match on the table
    # name would also discard same-named tables living in other datasets
    # (e.g. a staging copy), silently cutting real upstream sources.
    refs.discard(f"clientes_intel.{row.tbl}")
    builds[row.tbl] = {
        "query": row.query or "",
        "stmt": row.stmt,
        "last_run": str(row.creation_time),
        "refs": sorted(refs),
    }

# Explicit UTF-8 because ensure_ascii=False emits raw non-ASCII characters;
# the context manager guarantees the file is flushed and closed.
with open("scratchpad/intel_build.json", "w", encoding="utf-8") as fh:
    json.dump(builds, fh, indent=2, ensure_ascii=False)
print(f"tablas clientes_intel con SQL de construccion capturado: {len(builds)}\n")

# Walk backwards starting from the 12 tables used by customer_marts.
SEED = [
    "dim_persona",
    "dim_vehiculo",
    "fact_transaccion",
    "fact_campana_respuesta",
    "feat_cliente_persona",
    "feat_cliente_vehiculo",
    "seg_cliente_360",
    "score_clv",
    "reco_acciones",
    "map_persona_canal8",
    "map_persona_fuente",
    "map_persona_vehiculo",
]

# Iterative depth-first walk over the captured build metadata.
# intel_involved collects every clientes_intel table reached from SEED;
# external_sources collects "dataset.table" refs outside clientes_intel.
intel_involved = set()
external_sources = set()
stack = list(SEED)
while stack:
    t = stack.pop()
    if t in intel_involved:
        continue
    intel_involved.add(t)
    b = builds.get(t)
    if not b:
        # No build job was captured for this table, so it cannot be expanded.
        continue
    for ref in b["refs"]:
        ds, tbl = ref.split(".", 1)
        if ds != "clientes_intel":
            external_sources.add(ref)
        elif tbl not in intel_involved:
            stack.append(tbl)

print("== tablas clientes_intel implicadas en el linaje de customer_marts ==")
for t in sorted(intel_involved):
    b = builds.get(t, {})
    print(f" {t:26s} {b.get('stmt','(sin job)')}")

sorted_sources = sorted(external_sources)

print("\n== FUENTES EXTERNAS (fuera de clientes_intel) usadas por el pipeline ==")
for s in sorted_sources:
    print(f" {s}")

# Flag the CUSTOMER-related sources the user asked about: a source is listed
# when any of these substrings occurs in its lower-cased "dataset.table" name.
KEYS = [
    "customer",
    "customers",
    "cliente",
    "user",
    "usuario",
    "tpv",
    "salesforce",
    "sf_",
    "contact",
    "mkt_cloud",
    "persona",
]
print("\n== fuentes que parecen de CLIENTE/usuario ==")
for s in sorted_sources:
    low = s.lower()
    if any(k in low for k in KEYS):
        print(f" {s}")

# Explicit UTF-8 because ensure_ascii=False emits raw non-ASCII characters;
# the context manager guarantees the file is flushed and closed.
with open("scratchpad/intel_lineage.json", "w", encoding="utf-8") as fh:
    json.dump(
        {
            "intel_involved": sorted(intel_involved),
            "external_sources": sorted_sources,
        },
        fh,
        indent=2,
        ensure_ascii=False,
    )