chore: auto-commit (8 archivos)

- scratchpad/gen_docs.py
- scratchpad/gen_intel.py
- scratchpad/gen_verify.py
- scratchpad/intel_build.json
- scratchpad/intel_lineage.json
- scratchpad/lineage_graph.json
- scratchpad/trace_intel.py
- scratchpad/trace_lineage.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-01 19:00:06 +02:00
parent fbdf80bd71
commit 2ebc9efeb2
8 changed files with 1724 additions and 0 deletions
+106
View File
@@ -0,0 +1,106 @@
"""Traza la construccion de clientes_intel: para cada tabla, recupera el SQL del ultimo
job que la escribio (INFORMATION_SCHEMA.JOBS) + sus referenced_tables, y recorre hacia
atras hasta las tablas fuente (TPV, customers, users, Navision, Salesforce).
Vuelca todo a scratchpad/intel_build.json.
"""
import json
import warnings
# Silence every warning category. The filter is installed before the google
# imports below, so warnings emitted while importing them are suppressed too.
warnings.filterwarnings("ignore")
import google.auth
from google.cloud import bigquery
# Project used for the BigQuery client and, together with REGION, to qualify
# the INFORMATION_SCHEMA view queried further down.
PROJECT = "autingo-159109"
REGION = "region-europe-west1"
# Application Default Credentials, requested with the BigQuery scope only.
creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/bigquery"])
# Swap in a copy of the credentials whose quota project is None.
# NOTE(review): presumably this stops requests from being attributed to the
# quota project configured in the local ADC file -- confirm.
creds = creds.with_quota_project(None)
# Shared client used by the job-history query below.
c = bigquery.Client(project=PROJECT, credentials=creds)
# Latest job per destination table in clientes_intel: query text,
# referenced_tables and statement type.
#
# What the query does:
#   * UNNEST([destination_table]) wraps the single destination struct in a
#     one-element array so it can be aliased as `dest`.
#   * Only jobs that finished (state = 'DONE') without error, that have a
#     statement_type, and that were created in the last 120 days are kept.
#   * ROW_NUMBER() ranks jobs per destination table by creation_time
#     descending; `rn = 1` keeps the most recent one.
#   * `refs` is an array of (project_id, dataset_id, table_id) structs built
#     from the job's referenced_tables.
# The string below is sent to BigQuery verbatim; do not reformat it casually.
sql = f"""
WITH j AS (
SELECT
dest.table_id AS tbl,
query,
statement_type AS stmt,
creation_time,
ARRAY(
SELECT AS STRUCT rt.project_id, rt.dataset_id, rt.table_id
FROM UNNEST(referenced_tables) rt
) AS refs,
ROW_NUMBER() OVER (PARTITION BY dest.table_id ORDER BY creation_time DESC) AS rn
FROM `{PROJECT}`.`{REGION}`.INFORMATION_SCHEMA.JOBS_BY_PROJECT,
UNNEST([destination_table]) dest
WHERE dest.dataset_id = 'clientes_intel'
AND state = 'DONE' AND error_result IS NULL
AND statement_type IS NOT NULL
AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 120 DAY)
)
SELECT tbl, query, stmt, creation_time, refs FROM j WHERE rn = 1
ORDER BY tbl
"""
# Build info per clientes_intel table, keyed by table id:
#   query    -> SQL text of the most recent successful job ("" when missing)
#   stmt     -> statement type of that job
#   last_run -> creation time of that job, as a string
#   refs     -> sorted, de-duplicated "dataset.table" names the job read
builds = {}
for r in c.query(sql).result():
    upstream = {f"{rt['dataset_id']}.{rt['table_id']}" for rt in r.refs}
    # A job may list its own destination among referenced_tables (e.g. DML
    # that reads the table it updates); that entry is not an upstream source.
    # Only the dataset-qualified destination is dropped: matching on the bare
    # table name would also discard a same-named table living in another
    # dataset, which is a genuine source and must stay in the lineage.
    upstream.discard(f"clientes_intel.{r.tbl}")
    builds[r.tbl] = {
        "query": r.query or "",
        "stmt": r.stmt,
        "last_run": str(r.creation_time),
        "refs": sorted(upstream),
    }
# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding
# is pinned to UTF-8 instead of depending on the platform locale; the context
# manager guarantees the handle is flushed and closed.
with open("scratchpad/intel_build.json", "w", encoding="utf-8") as fh:
    json.dump(builds, fh, indent=2, ensure_ascii=False)
print(f"tablas clientes_intel con SQL de construccion capturado: {len(builds)}\n")
# Walk the lineage backwards, starting from the 12 tables used by
# customer_marts.
SEED = [
    "dim_persona", "dim_vehiculo", "fact_transaccion", "fact_campana_respuesta",
    "feat_cliente_persona", "feat_cliente_vehiculo", "seg_cliente_360", "score_clv",
    "reco_acciones", "map_persona_canal8", "map_persona_fuente", "map_persona_vehiculo",
]
# clientes_intel tables reached from SEED (the seeds themselves included).
intel_involved = set()
# "dataset.table" references that live outside clientes_intel.
external_sources = set()
# Iterative depth-first walk: the list is used as a LIFO work queue.
pending = list(SEED)
while pending:
    current = pending.pop()
    if current in intel_involved:
        continue
    intel_involved.add(current)
    build = builds.get(current)
    # Tables without a captured job are recorded but cannot be expanded.
    upstream = build["refs"] if build else ()
    for ref in upstream:
        dataset, table = ref.split(".", 1)
        if dataset != "clientes_intel":
            external_sources.add(ref)
        elif table not in intel_involved:
            pending.append(table)
# Console report, part 1: every clientes_intel table in the lineage, with the
# statement type of its last job (or a placeholder when no job was captured).
print("== tablas clientes_intel implicadas en el linaje de customer_marts ==")
for name in sorted(intel_involved):
    stmt = builds.get(name, {}).get("stmt", "(sin job)")
    print(f" {name:26s} {stmt}")

# Part 2: every upstream reference that lives outside clientes_intel.
ordered_sources = sorted(external_sources)
print("\n== FUENTES EXTERNAS (fuera de clientes_intel) usadas por el pipeline ==")
for source in ordered_sources:
    print(f" {source}")

# Part 3: flag the CUSTOMER sources the user asked about, using a
# case-insensitive substring match against these keywords.
KEYS = ["customer", "customers", "cliente", "user", "usuario", "tpv", "salesforce",
        "sf_", "contact", "mkt_cloud", "persona"]
print("\n== fuentes que parecen de CLIENTE/usuario ==")
client_like = [
    source
    for source in ordered_sources
    if any(key in source.lower() for key in KEYS)
]
for source in client_like:
    print(f" {source}")
# Persist the lineage summary: the clientes_intel tables involved and the
# external sources they depend on, both sorted for stable output.
lineage = {
    "intel_involved": sorted(intel_involved),
    "external_sources": sorted(external_sources),
}
# ensure_ascii=False writes non-ASCII characters as-is, so the encoding is
# pinned to UTF-8 rather than left to the platform locale; the context manager
# guarantees the handle is flushed and closed.
with open("scratchpad/intel_lineage.json", "w", encoding="utf-8") as fh:
    json.dump(lineage, fh, indent=2, ensure_ascii=False)