2ebc9efeb2
- scratchpad/gen_docs.py - scratchpad/gen_intel.py - scratchpad/gen_verify.py - scratchpad/intel_build.json - scratchpad/intel_lineage.json - scratchpad/lineage_graph.json - scratchpad/trace_intel.py - scratchpad/trace_lineage.py Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
107 lines
3.5 KiB
Python
107 lines
3.5 KiB
Python
"""Traza la construccion de clientes_intel: para cada tabla, recupera el SQL del ultimo
|
|
job que la escribio (INFORMATION_SCHEMA.JOBS) + sus referenced_tables, y recorre hacia
|
|
atras hasta las tablas fuente (TPV, customers, users, Navision, Salesforce).
|
|
|
|
Vuelca todo a scratchpad/intel_build.json.
|
|
"""
|
|
import json
|
|
import warnings
|
|
|
|
warnings.filterwarnings("ignore")
|
|
import google.auth
|
|
from google.cloud import bigquery
|
|
|
|
# Project whose job history is inspected, and the region qualifier used to
# address its INFORMATION_SCHEMA views.
PROJECT = "autingo-159109"
REGION = "region-europe-west1"

# Application-default credentials limited to the BigQuery scope, rebound with
# the quota project set to None before the client is created.
default_creds, _ = google.auth.default(
    scopes=["https://www.googleapis.com/auth/bigquery"],
)
creds = default_creds.with_quota_project(None)
c = bigquery.Client(project=PROJECT, credentials=creds)
|
|
|
|
# Latest job per destination table in clientes_intel: query text, statement
# type and referenced_tables. Only successfully finished jobs created in the
# last 120 days are considered; ROW_NUMBER keeps the most recent one per table.
sql = f"""
WITH j AS (
  SELECT
    dest.table_id AS tbl,
    query,
    statement_type AS stmt,
    creation_time,
    ARRAY(
      SELECT AS STRUCT rt.project_id, rt.dataset_id, rt.table_id
      FROM UNNEST(referenced_tables) rt
    ) AS refs,
    ROW_NUMBER() OVER (PARTITION BY dest.table_id ORDER BY creation_time DESC) AS rn
  FROM `{PROJECT}`.`{REGION}`.INFORMATION_SCHEMA.JOBS_BY_PROJECT,
    UNNEST([destination_table]) dest
  WHERE dest.dataset_id = 'clientes_intel'
    AND state = 'DONE' AND error_result IS NULL
    AND statement_type IS NOT NULL
    AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 120 DAY)
)
SELECT tbl, query, stmt, creation_time, refs FROM j WHERE rn = 1
ORDER BY tbl
"""
|
|
|
|
# Map every clientes_intel table to the job that last wrote it:
#   "query"    -> SQL text of that job ("" when the job has none)
#   "stmt"     -> statement type reported by BigQuery
#   "last_run" -> creation time of the job, as a string
#   "refs"     -> sorted, de-duplicated "dataset.table" names the job read
builds = {}
for row in c.query(sql).result():
    upstream = {f"{rt['dataset_id']}.{rt['table_id']}" for rt in row.refs}
    # Drop only the table's reference to itself. Matching on the table name
    # alone would also discard same-named tables in other datasets, which
    # are genuine upstream sources.
    upstream.discard(f"clientes_intel.{row.tbl}")
    builds[row.tbl] = {
        "query": row.query or "",
        "stmt": row.stmt,
        "last_run": str(row.creation_time),
        "refs": sorted(upstream),
    }

# ensure_ascii=False emits raw non-ASCII characters, so the encoding is pinned
# to UTF-8 instead of relying on the platform default; the context manager
# guarantees the file is flushed and closed.
with open("scratchpad/intel_build.json", "w", encoding="utf-8") as fh:
    json.dump(builds, fh, indent=2, ensure_ascii=False)
print(f"tablas clientes_intel con SQL de construccion capturado: {len(builds)}\n")
|
|
|
|
# Walk the lineage starting from the 12 tables used by customer_marts.
SEED = [
    "dim_persona", "dim_vehiculo", "fact_transaccion", "fact_campana_respuesta",
    "feat_cliente_persona", "feat_cliente_vehiculo", "seg_cliente_360", "score_clv",
    "reco_acciones", "map_persona_canal8", "map_persona_fuente", "map_persona_vehiculo",
]
intel_involved = set()      # clientes_intel tables reached from SEED
external_sources = set()    # "dataset.table" references outside clientes_intel

# Iterative traversal with a LIFO work list; a table is expanded at most once.
pending = list(SEED)
while pending:
    table = pending.pop()
    if table in intel_involved:
        continue
    intel_involved.add(table)

    # Tables with no captured build job are recorded but cannot be expanded.
    build = builds.get(table)
    if not build:
        continue

    for ref in build["refs"]:
        dataset, name = ref.split(".", 1)
        if dataset != "clientes_intel":
            external_sources.add(ref)
        elif name not in intel_involved:
            pending.append(name)
|
|
|
|
# Report every clientes_intel table in the lineage with the statement type of
# its last build job, or a placeholder when no job was captured for it.
print("== tablas clientes_intel implicadas en el linaje de customer_marts ==")
for table in sorted(intel_involved):
    stmt = builds.get(table, {}).get("stmt", "(sin job)")
    print(f" {table:26s} {stmt}")

# Report the tables outside clientes_intel that the pipeline reads from.
print("\n== FUENTES EXTERNAS (fuera de clientes_intel) usadas por el pipeline ==")
for source in sorted(external_sources):
    print(f" {source}")
|
|
|
|
# Flag the CUSTOMER-related sources the user asked for: an external source is
# listed when its lower-cased name contains any of these substrings.
KEYS = ["customer", "customers", "cliente", "user", "usuario", "tpv", "salesforce",
        "sf_", "contact", "mkt_cloud", "persona"]
print("\n== fuentes que parecen de CLIENTE/usuario ==")
customer_like = [
    source
    for source in sorted(external_sources)
    if any(key in source.lower() for key in KEYS)
]
for source in customer_like:
    print(f" {source}")
|
|
|
|
# Persist the lineage summary: the clientes_intel tables involved and the
# external sources they depend on, both sorted for stable output.
lineage = {
    "intel_involved": sorted(intel_involved),
    "external_sources": sorted(external_sources),
}
# ensure_ascii=False emits raw non-ASCII characters, so the encoding is pinned
# to UTF-8; the context manager guarantees the file is flushed and closed.
with open("scratchpad/intel_lineage.json", "w", encoding="utf-8") as fh:
    json.dump(lineage, fh, indent=2, ensure_ascii=False)
|