"""Traza la construccion de clientes_intel: para cada tabla, recupera el SQL del ultimo
job que la escribio (INFORMATION_SCHEMA.JOBS) + sus referenced_tables, y recorre hacia
atras hasta las tablas fuente (TPV, customers, users, Navision, Salesforce).

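Vuelca el SQL de construccion por tabla a scratchpad/intel_build.json y el linaje
resultante (tablas implicadas + fuentes externas) a scratchpad/intel_lineage.json.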
"""
import json
import warnings

warnings.filterwarnings("ignore")
import google.auth
from google.cloud import bigquery

# Project whose job history is inspected, and the region qualifier used to
# address its INFORMATION_SCHEMA views in the query below.
PROJECT = "autingo-159109"
REGION = "region-europe-west1"

# Application Default Credentials restricted to the BigQuery scope; the quota
# project is explicitly cleared before the client is built.
_adc = google.auth.default(scopes=["https://www.googleapis.com/auth/bigquery"])
creds = _adc[0].with_quota_project(None)
c = bigquery.Client(project=PROJECT, credentials=creds)

# Latest job per destination table in the clientes_intel dataset.
# For every destination table the query keeps only the most recent job
# (ROW_NUMBER ordered by creation_time DESC, rn = 1) that finished without
# error in the last 120 days, and returns its SQL text, statement type,
# creation time and the list of tables it referenced.
# UNNEST([destination_table]) wraps the single STRUCT in a one-element array
# so it can be aliased as `dest`.
# NOTE(review): `statement_type IS NOT NULL` presumably filters out non-query
# jobs (loads/copies) that carry no SQL -- confirm.
sql = f"""
WITH j AS (
  SELECT
    dest.table_id AS tbl,
    query,
    statement_type AS stmt,
    creation_time,
    ARRAY(
      SELECT AS STRUCT rt.project_id, rt.dataset_id, rt.table_id
      FROM UNNEST(referenced_tables) rt
    ) AS refs,
    ROW_NUMBER() OVER (PARTITION BY dest.table_id ORDER BY creation_time DESC) AS rn
  FROM `{PROJECT}`.`{REGION}`.INFORMATION_SCHEMA.JOBS_BY_PROJECT,
       UNNEST([destination_table]) dest
  WHERE dest.dataset_id = 'clientes_intel'
    AND state = 'DONE' AND error_result IS NULL
    AND statement_type IS NOT NULL
    AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 120 DAY)
)
SELECT tbl, query, stmt, creation_time, refs FROM j WHERE rn = 1
ORDER BY tbl
"""

# Map each clientes_intel table name to the job that last wrote it:
#   query    -> SQL text of that job ("" when the job row has none)
#   stmt     -> statement type reported for the job
#   last_run -> creation time of the job, as a string
#   refs     -> sorted, de-duplicated "dataset.table" names the job referenced
builds = {}
for row in c.query(sql).result():
    referenced = {f"{rt['dataset_id']}.{rt['table_id']}" for rt in row.refs}
    # A job that rewrites its own destination lists that table among its
    # references. Drop only that exact self-reference: matching on the bare
    # table name would also discard a same-named table in another dataset
    # (e.g. a staging table feeding this one) and hide it from the lineage.
    referenced.discard(f"clientes_intel.{row.tbl}")
    builds[row.tbl] = {
        "query": row.query or "",
        "stmt": row.stmt,
        "last_run": str(row.creation_time),
        "refs": sorted(referenced),
    }

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 instead of depending on the platform locale; the context
# manager guarantees the file is flushed and closed.
with open("scratchpad/intel_build.json", "w", encoding="utf-8") as fh:
    json.dump(builds, fh, indent=2, ensure_ascii=False)
print(f"tablas clientes_intel con SQL de construccion capturado: {len(builds)}\n")

# Walk the lineage backwards starting from the 12 tables used by
# customer_marts. References inside clientes_intel are followed further;
# references to any other dataset are recorded as external sources.
SEED = [
    "dim_persona", "dim_vehiculo", "fact_transaccion", "fact_campana_respuesta",
    "feat_cliente_persona", "feat_cliente_vehiculo", "seg_cliente_360", "score_clv",
    "reco_acciones", "map_persona_canal8", "map_persona_fuente", "map_persona_vehiculo",
]
intel_involved = set()
external_sources = set()
pending = list(SEED)
while pending:
    current = pending.pop()
    if current in intel_involved:
        # Already expanded through another path.
        continue
    intel_involved.add(current)
    build = builds.get(current)
    if not build:
        # No captured build job for this table: nothing to expand.
        continue
    for ref in build["refs"]:
        dataset, table = ref.split(".", 1)
        if dataset != "clientes_intel":
            external_sources.add(ref)
        elif table not in intel_involved:
            pending.append(table)

# Report every clientes_intel table reached by the walk together with the
# statement type of its last build job, or a placeholder when none was captured.
print("== tablas clientes_intel implicadas en el linaje de customer_marts ==")
for name in sorted(intel_involved):
    stmt = builds.get(name, {}).get("stmt", "(sin job)")
    print(f"  {name:26s} {stmt}")

# Report every referenced table that lives outside clientes_intel.
ordered_sources = sorted(external_sources)
print("\n== FUENTES EXTERNAS (fuera de clientes_intel) usadas por el pipeline ==")
for source in ordered_sources:
    print(f"  {source}")

# Highlight the CUSTOMER sources the user asked for: any external source
# whose lower-cased name contains one of these substrings.
KEYS = ["customer", "customers", "cliente", "user", "usuario", "tpv", "salesforce",
        "sf_", "contact", "mkt_cloud", "persona"]
print("\n== fuentes que parecen de CLIENTE/usuario ==")
for source in ordered_sources:
    lowered = source.lower()
    if not any(key in lowered for key in KEYS):
        continue
    print(f"  {source}")

# Persist the lineage summary: the clientes_intel tables reached by the walk
# and the external tables they depend on, both sorted for stable output.
# The context manager closes the file deterministically, and the explicit
# UTF-8 encoding keeps the output independent of the platform locale
# (ensure_ascii=False writes non-ASCII characters verbatim).
with open("scratchpad/intel_lineage.json", "w", encoding="utf-8") as fh:
    json.dump(
        {
            "intel_involved": sorted(intel_involved),
            "external_sources": sorted(external_sources),
        },
        fh,
        indent=2,
        ensure_ascii=False,
    )