763e06c127
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
162 lines
6.3 KiB
Python
162 lines
6.3 KiB
Python
"""Descarga y parsea la Global Vendor List (GVL) de IAB Europe TCF.
|
|
|
|
La GVL es el catalogo maestro de "data brokers" (vendors) del Transparency &
|
|
Consent Framework de IAB Europe, con sus propositos de tratamiento de datos,
|
|
intereses legitimos, special purposes, features y categorias de datos.
|
|
|
|
Sin credenciales. Usa solo stdlib (urllib.request) para no anadir dependencias.
|
|
"""
|
|
|
|
import http.client
import json
import os
import urllib.error
import urllib.request
|
|
|
|
DEFAULT_URL_V3 = "https://vendor-list.consensu.org/v3/vendor-list.json"
|
|
FALLBACK_URL_V2 = "https://vendor-list.consensu.org/v2/vendor-list.json"
|
|
|
|
_USER_AGENT = "fn_registry-fetch_iab_gvl/1.0 (+recon)"
|
|
_TIMEOUT_S = 30
|
|
|
|
|
|
def _download_json(url: str) -> dict:
    """Fetch *url* with an HTTP GET and return the parsed JSON document.

    The response body is decoded as UTF-8 before parsing. Nothing is caught
    here: request, decoding and JSON errors all propagate to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", _USER_AGENT)
    with urllib.request.urlopen(request, timeout=_TIMEOUT_S) as response:
        text = response.read().decode("utf-8")
    return json.loads(text)
|
|
|
|
|
|
def _vendor_policy_url(vendor: dict) -> str:
|
|
"""Deriva la URL de politica de privacidad de un vendor de forma tolerante.
|
|
|
|
En GVL v3 los vendors no exponen `policyUrl` directo: la privacy URL vive
|
|
en `urls[].privacy` (lista por idioma). En v2 algunos vendors si traen
|
|
`policyUrl`. Esta funcion cubre ambos casos.
|
|
"""
|
|
direct = vendor.get("policyUrl")
|
|
if isinstance(direct, str) and direct:
|
|
return direct
|
|
urls = vendor.get("urls") or []
|
|
if isinstance(urls, list):
|
|
# Preferir el bloque en ingles si existe; si no, el primero con privacy.
|
|
for entry in urls:
|
|
if isinstance(entry, dict) and entry.get("langId") == "en" and entry.get("privacy"):
|
|
return str(entry["privacy"])
|
|
for entry in urls:
|
|
if isinstance(entry, dict) and entry.get("privacy"):
|
|
return str(entry["privacy"])
|
|
return ""
|
|
|
|
|
|
def _summarize_vendor(vendor: dict) -> dict:
    """Build the reduced view of one vendor record, tolerating missing keys.

    Missing ``id`` / ``name`` default to 0 / "". The list-valued fields
    default to an empty list when absent or falsy. ``policyUrl`` is resolved
    through ``_vendor_policy_url``.
    """
    summary: dict = {
        "id": vendor.get("id", 0),
        "name": vendor.get("name", ""),
    }
    list_fields = (
        "purposes",
        "legIntPurposes",
        "specialPurposes",
        "features",
        "dataDeclaration",
    )
    for field in list_fields:
        # Absent, None and empty values all collapse to a fresh empty list.
        summary[field] = vendor.get(field) or []
    summary["policyUrl"] = _vendor_policy_url(vendor)
    return summary
|
|
|
|
|
|
def _summarize_definitions(defs: dict) -> dict:
|
|
"""Resume un diccionario de definiciones (purposes, dataCategories, ...)."""
|
|
out: dict = {}
|
|
for key, item in (defs or {}).items():
|
|
if not isinstance(item, dict):
|
|
continue
|
|
out[str(key)] = {
|
|
"id": item.get("id", 0),
|
|
"name": item.get("name", ""),
|
|
"description": item.get("description", ""),
|
|
}
|
|
return out
|
|
|
|
|
|
def fetch_iab_gvl(out_path: str = "", url: str = "", lang: str = "") -> dict:
    """Download and summarize the IAB Europe TCF Global Vendor List (GVL).

    Args:
        out_path: When non-empty, the downloaded document is re-serialized as
            JSON to this path (parent directories are created if needed).
        url: GVL endpoint. When empty, ``DEFAULT_URL_V3`` is tried first and
            ``FALLBACK_URL_V2`` is tried if that attempt fails.
        lang: Optional ISO language code (e.g. "es"). Currently unused: it
            does not change the endpoint, so the summary always comes from
            the main endpoint. Per the original author's note, purpose
            translations live in separate ``purposes-<lang>.json`` endpoints.

    Returns:
        On success, a dict of the form
            {"status": "ok", "gvlSpecificationVersion": ...,
             "vendorListVersion": ..., "tcfPolicyVersion": ...,
             "lastUpdated": ..., "n_vendors": int, "n_purposes": int,
             "n_specialPurposes": int, "n_features": int,
             "n_dataCategories": int, "vendors": {...}, "purposes": {...},
             "dataCategories": {...}}.
        On failure, {"status": "error", "error": "..."}. Network, parsing and
        file-writing failures are reported this way instead of being raised.
    """
    candidates = [url] if url else [DEFAULT_URL_V3, FALLBACK_URL_V2]

    data = None
    last_error = ""
    for candidate in candidates:
        try:
            payload = _download_json(candidate)
        except (
            urllib.error.URLError,  # also covers HTTPError
            # Protocol-level failures such as IncompleteRead are not OSError
            # subclasses; without this entry they would escape the function.
            http.client.HTTPException,
            ValueError,  # invalid JSON, bad UTF-8, unknown URL type
            OSError,  # timeouts, connection and TLS errors
        ) as exc:
            last_error = f"{candidate}: {exc}"
            continue
        if not isinstance(payload, dict):
            # The summary code below requires a JSON object; treat anything
            # else as a failed attempt so the next candidate is still tried.
            last_error = (
                f"{candidate}: expected a JSON object, "
                f"got {type(payload).__name__}"
            )
            continue
        data = payload
        break

    if data is None:
        return {"status": "error", "error": last_error or "no url candidates"}

    try:
        if out_path:
            parent = os.path.dirname(out_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(out_path, "w", encoding="utf-8") as fh:
                json.dump(data, fh, ensure_ascii=False)

        # `or {}` guards against keys that are present but null.
        vendors_raw = data.get("vendors", {}) or {}
        purposes_raw = data.get("purposes", {}) or {}
        special_purposes_raw = data.get("specialPurposes", {}) or {}
        features_raw = data.get("features", {}) or {}
        data_categories_raw = data.get("dataCategories", {}) or {}

        vendors = {str(vid): _summarize_vendor(v) for vid, v in vendors_raw.items()}

        return {
            "status": "ok",
            "gvlSpecificationVersion": data.get("gvlSpecificationVersion"),
            "vendorListVersion": data.get("vendorListVersion"),
            "tcfPolicyVersion": data.get("tcfPolicyVersion"),
            "lastUpdated": data.get("lastUpdated"),
            "n_vendors": len(vendors_raw),
            "n_purposes": len(purposes_raw),
            "n_specialPurposes": len(special_purposes_raw),
            "n_features": len(features_raw),
            "n_dataCategories": len(data_categories_raw),
            "vendors": vendors,
            "purposes": _summarize_definitions(purposes_raw),
            "dataCategories": _summarize_definitions(data_categories_raw),
        }
    except Exception as exc:  # noqa: BLE001 - contract: never raise.
        return {"status": "error", "error": str(exc)}
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Optional first CLI argument: path where the downloaded JSON is saved.
    destination = sys.argv[1] if len(sys.argv) > 1 else ""
    result = fetch_iab_gvl(out_path=destination)

    # Print only the scalar metadata; the large mappings are left out.
    bulky_keys = ("vendors", "purposes", "dataCategories")
    compact = {key: value for key, value in result.items() if key not in bulky_keys}
    print(json.dumps(compact, indent=2))

    if result.get("status") == "ok":
        print(f"sample vendors: {list(result['vendors'].items())[:1]}")
|