chore: auto-commit (8 archivos)
- scratchpad/gen_docs.py - scratchpad/gen_intel.py - scratchpad/gen_verify.py - scratchpad/intel_build.json - scratchpad/intel_lineage.json - scratchpad/lineage_graph.json - scratchpad/trace_intel.py - scratchpad/trace_lineage.py Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,106 @@
|
||||
"""Traza la construccion de clientes_intel: para cada tabla, recupera el SQL del ultimo
|
||||
job que la escribio (INFORMATION_SCHEMA.JOBS) + sus referenced_tables, y recorre hacia
|
||||
atras hasta las tablas fuente (TPV, customers, users, Navision, Salesforce).
|
||||
|
||||
Vuelca todo a scratchpad/intel_build.json.
|
||||
"""
|
||||
import json
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
import google.auth
|
||||
from google.cloud import bigquery
|
||||
|
||||
# GCP project to query and the regional qualifier used to address its
# INFORMATION_SCHEMA views.
PROJECT = "autingo-159109"
REGION = "region-europe-west1"

# Application Default Credentials limited to the BigQuery scope. The quota
# project is explicitly reset to None before the client is built.
adc, _ = google.auth.default(
    scopes=["https://www.googleapis.com/auth/bigquery"],
)
creds = adc.with_quota_project(None)
c = bigquery.Client(credentials=creds, project=PROJECT)
|
||||
|
||||
# Latest job per destination table in clientes_intel: query text,
# referenced_tables and statement type. Only successful jobs (DONE, no
# error_result) with a statement type, created in the last 120 days, are
# considered; ROW_NUMBER keeps the most recent one per table.
_JOBS_VIEW = f"`{PROJECT}`.`{REGION}`.INFORMATION_SCHEMA.JOBS_BY_PROJECT"

sql = f"""
WITH j AS (
  SELECT
    dest.table_id AS tbl,
    query,
    statement_type AS stmt,
    creation_time,
    ARRAY(
      SELECT AS STRUCT rt.project_id, rt.dataset_id, rt.table_id
      FROM UNNEST(referenced_tables) rt
    ) AS refs,
    ROW_NUMBER() OVER (PARTITION BY dest.table_id ORDER BY creation_time DESC) AS rn
  FROM {_JOBS_VIEW},
  UNNEST([destination_table]) dest
  WHERE dest.dataset_id = 'clientes_intel'
    AND state = 'DONE' AND error_result IS NULL
    AND statement_type IS NOT NULL
    AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 120 DAY)
)
SELECT tbl, query, stmt, creation_time, refs FROM j WHERE rn = 1
ORDER BY tbl
"""
|
||||
|
||||
# Map each clientes_intel table to the metadata of the last job that wrote it:
# the SQL text, the statement type, the job creation time and the tables read.
builds = {}
for row in c.query(sql).result():
    # References are keyed as "dataset.table"; project_id is dropped.
    refs = {f"{rt['dataset_id']}.{rt['table_id']}" for rt in row.refs}
    # Exclude references whose table name equals the destination table
    # (e.g. a job that reads the table it writes).
    # NOTE(review): this suffix match also drops same-named tables that live
    # in other datasets -- confirm that is intended.
    self_suffix = f".{row.tbl}"
    builds[row.tbl] = {
        "query": row.query or "",
        "stmt": row.stmt,
        "last_run": str(row.creation_time),
        "refs": sorted(x for x in refs if not x.endswith(self_suffix)),
    }

# ensure_ascii=False emits raw non-ASCII characters, so the file is opened
# explicitly as UTF-8 (not the locale default) and closed deterministically.
with open("scratchpad/intel_build.json", "w", encoding="utf-8") as fh:
    json.dump(builds, fh, indent=2, ensure_ascii=False)
print(f"tablas clientes_intel con SQL de construccion capturado: {len(builds)}\n")
|
||||
|
||||
# Walk the lineage backwards starting from the 12 tables used by customer_marts.
SEED = [
    "dim_persona", "dim_vehiculo", "fact_transaccion", "fact_campana_respuesta",
    "feat_cliente_persona", "feat_cliente_vehiculo", "seg_cliente_360", "score_clv",
    "reco_acciones", "map_persona_canal8", "map_persona_fuente", "map_persona_vehiculo",
]

intel_involved = set()    # clientes_intel tables reached from SEED
external_sources = set()  # "dataset.table" refs outside clientes_intel

pending = list(SEED)
while pending:
    current = pending.pop()
    if current in intel_involved:
        continue
    intel_involved.add(current)

    build = builds.get(current)
    if not build:
        # No captured job for this table: nothing further to follow.
        continue

    for ref in build["refs"]:
        dataset, table = ref.split(".", 1)
        if dataset != "clientes_intel":
            external_sources.add(ref)
        elif table not in intel_involved:
            pending.append(table)
|
||||
|
||||
# Report every clientes_intel table reached by the walk together with the
# statement type of its last job, or a placeholder when no job was captured.
print("== tablas clientes_intel implicadas en el linaje de customer_marts ==")
for name in sorted(intel_involved):
    stmt = builds.get(name, {}).get('stmt', '(sin job)')
    print(f" {name:26s} {stmt}")

# Report the references that live outside clientes_intel.
print("\n== FUENTES EXTERNAS (fuera de clientes_intel) usadas por el pipeline ==")
for source in sorted(external_sources):
    print(f" {source}")
|
||||
|
||||
# Flag the external sources whose name contains one of these substrings
# (case-insensitive), i.e. the ones that look like customer/user data.
KEYS = ["customer", "customers", "cliente", "user", "usuario", "tpv", "salesforce",
        "sf_", "contact", "mkt_cloud", "persona"]

print("\n== fuentes que parecen de CLIENTE/usuario ==")
customer_like = [
    source
    for source in sorted(external_sources)
    if any(key in source.lower() for key in KEYS)
]
for source in customer_like:
    print(f" {source}")
|
||||
|
||||
# Persist the lineage summary. ensure_ascii=False emits raw non-ASCII
# characters, so the file is opened explicitly as UTF-8 (not the locale
# default), and the context manager guarantees the handle is flushed and closed.
lineage = {
    "intel_involved": sorted(intel_involved),
    "external_sources": sorted(external_sources),
}
with open("scratchpad/intel_lineage.json", "w", encoding="utf-8") as fh:
    json.dump(lineage, fh, indent=2, ensure_ascii=False)
|
||||
Reference in New Issue
Block a user