Files
fn_registry/python/functions/cybersecurity/fetch_iab_gvl.py
T
egutierrez 763e06c127 feat(browser): auto-commit con 178 cambios
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-20 18:22:23 +02:00

162 lines
6.3 KiB
Python

"""Descarga y parsea la Global Vendor List (GVL) de IAB Europe TCF.
La GVL es el catalogo maestro de "data brokers" (vendors) del Transparency &
Consent Framework de IAB Europe, con sus propositos de tratamiento de datos,
intereses legitimos, special purposes, features y categorias de datos.
Sin credenciales. Usa solo stdlib (urllib.request) para no anadir dependencias.
"""
import json
import os
import urllib.error
import urllib.request
DEFAULT_URL_V3 = "https://vendor-list.consensu.org/v3/vendor-list.json"
FALLBACK_URL_V2 = "https://vendor-list.consensu.org/v2/vendor-list.json"
_USER_AGENT = "fn_registry-fetch_iab_gvl/1.0 (+recon)"
_TIMEOUT_S = 30
def _download_json(url: str) -> dict:
    """Fetch ``url`` and return its body parsed as JSON.

    The request carries the module-level ``_USER_AGENT`` header and uses
    ``_TIMEOUT_S`` as the ``urlopen`` timeout. The body is decoded as UTF-8
    before parsing.

    Nothing is caught here: errors raised by ``urlopen``, by reading the
    response, by UTF-8 decoding or by ``json.loads`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(request, timeout=_TIMEOUT_S) as response:
        body = response.read()
    text = body.decode("utf-8")
    return json.loads(text)
def _vendor_policy_url(vendor: dict) -> str:
"""Deriva la URL de politica de privacidad de un vendor de forma tolerante.
En GVL v3 los vendors no exponen `policyUrl` directo: la privacy URL vive
en `urls[].privacy` (lista por idioma). En v2 algunos vendors si traen
`policyUrl`. Esta funcion cubre ambos casos.
"""
direct = vendor.get("policyUrl")
if isinstance(direct, str) and direct:
return direct
urls = vendor.get("urls") or []
if isinstance(urls, list):
# Preferir el bloque en ingles si existe; si no, el primero con privacy.
for entry in urls:
if isinstance(entry, dict) and entry.get("langId") == "en" and entry.get("privacy"):
return str(entry["privacy"])
for entry in urls:
if isinstance(entry, dict) and entry.get("privacy"):
return str(entry["privacy"])
return ""
def _summarize_vendor(vendor: dict) -> dict:
    """Project a raw vendor record onto the fields kept in the summary.

    Missing keys are tolerated: ``id`` defaults to 0, ``name`` to "", and each
    list-valued field (``purposes``, ``legIntPurposes``, ``specialPurposes``,
    ``features``, ``dataDeclaration``) becomes ``[]`` when absent or falsy.
    ``policyUrl`` is resolved by ``_vendor_policy_url``.
    """
    list_fields = (
        "purposes",
        "legIntPurposes",
        "specialPurposes",
        "features",
        "dataDeclaration",
    )
    summary = {
        "id": vendor.get("id", 0),
        "name": vendor.get("name", ""),
    }
    # Insertion order matters for the serialized output: id, name, the list
    # fields in the order above, then policyUrl.
    for field in list_fields:
        summary[field] = vendor.get(field) or []
    summary["policyUrl"] = _vendor_policy_url(vendor)
    return summary
def _summarize_definitions(defs: dict) -> dict:
"""Resume un diccionario de definiciones (purposes, dataCategories, ...)."""
out: dict = {}
for key, item in (defs or {}).items():
if not isinstance(item, dict):
continue
out[str(key)] = {
"id": item.get("id", 0),
"name": item.get("name", ""),
"description": item.get("description", ""),
}
return out
def fetch_iab_gvl(out_path: str = "", url: str = "", lang: str = "") -> dict:
    """Download and summarize the IAB Europe TCF Global Vendor List (GVL).

    Args:
        out_path: if non-empty, the downloaded document is re-serialized as
            JSON (UTF-8, ``ensure_ascii=False``) to this path; parent
            directories are created when needed.
        url: GVL endpoint. When empty, ``DEFAULT_URL_V3`` is tried first and
            ``FALLBACK_URL_V2`` is used if that attempt fails.
        lang: optional ISO language code (e.g. "es"). Currently unused: it
            does not change the endpoint and the summary is always built from
            the main document. (Per the original author's note, purpose
            translations live in separate ``purposes-<lang>.json`` endpoints.)

    Returns:
        On success::

            {"status": "ok", "gvlSpecificationVersion": ...,
             "vendorListVersion": ..., "tcfPolicyVersion": ...,
             "lastUpdated": ..., "n_vendors": int, "n_purposes": int,
             "n_specialPurposes": int, "n_features": int,
             "n_dataCategories": int, "vendors": {...}, "purposes": {...},
             "dataCategories": {...}}

        On a download, parse, payload-shape or file-write failure:
        ``{"status": "error", "error": "..."}``. Those failures are reported
        through the return value instead of being raised.
    """
    # Function-scope import so the file's import block needs no change.
    # http.client.HTTPException (e.g. IncompleteRead while reading the body)
    # is not an OSError, so it has to be listed explicitly to honour the
    # "report, don't raise" contract.
    import http.client

    candidates = [url] if url else [DEFAULT_URL_V3, FALLBACK_URL_V2]
    data = None
    last_error = ""
    for candidate in candidates:
        try:
            payload = _download_json(candidate)
        except (OSError, ValueError, http.client.HTTPException) as exc:
            # OSError covers URLError/HTTPError and socket timeouts;
            # ValueError covers JSON/UTF-8 decoding and malformed URLs.
            last_error = f"{candidate}: {exc}"
            continue
        if not isinstance(payload, dict):
            # Valid JSON that is not an object (null, list, ...) cannot be a
            # GVL document: treat it as a failed candidate so the fallback
            # endpoint still gets a chance and nothing bogus is written.
            last_error = (
                f"{candidate}: unexpected JSON payload of type "
                f"{type(payload).__name__}"
            )
            continue
        data = payload
        break

    if data is None:
        return {"status": "error", "error": last_error or "no url candidates"}

    try:
        if out_path:
            parent = os.path.dirname(out_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(out_path, "w", encoding="utf-8") as fh:
                json.dump(data, fh, ensure_ascii=False)

        # "or {}" also normalizes explicit nulls in the document.
        vendors_raw = data.get("vendors", {}) or {}
        purposes_raw = data.get("purposes", {}) or {}
        special_purposes_raw = data.get("specialPurposes", {}) or {}
        features_raw = data.get("features", {}) or {}
        data_categories_raw = data.get("dataCategories", {}) or {}

        vendors = {str(vid): _summarize_vendor(v) for vid, v in vendors_raw.items()}
        return {
            "status": "ok",
            "gvlSpecificationVersion": data.get("gvlSpecificationVersion"),
            "vendorListVersion": data.get("vendorListVersion"),
            "tcfPolicyVersion": data.get("tcfPolicyVersion"),
            "lastUpdated": data.get("lastUpdated"),
            "n_vendors": len(vendors_raw),
            "n_purposes": len(purposes_raw),
            "n_specialPurposes": len(special_purposes_raw),
            "n_features": len(features_raw),
            "n_dataCategories": len(data_categories_raw),
            "vendors": vendors,
            "purposes": _summarize_definitions(purposes_raw),
            "dataCategories": _summarize_definitions(data_categories_raw),
        }
    except Exception as exc:  # noqa: BLE001 - contract: never raise.
        return {"status": "error", "error": str(exc)}
if __name__ == "__main__":
    import sys

    # Optional first CLI argument: path where the downloaded JSON is saved.
    cli_out_path = sys.argv[1] if len(sys.argv) > 1 else ""
    result = fetch_iab_gvl(out_path=cli_out_path)

    # Print only the scalar metadata; the bulky maps are left out.
    bulky_keys = ("vendors", "purposes", "dataCategories")
    header = {}
    for key, value in result.items():
        if key not in bulky_keys:
            header[key] = value
    print(json.dumps(header, indent=2))

    # On success, show a single vendor entry as a sanity check.
    if result.get("status") == "ok":
        first_vendor = list(result["vendors"].items())[:1]
        print(f"sample vendors: {first_vendor}")