"""Download and parse the IAB Europe TCF Global Vendor List (GVL).

The GVL is the master catalogue of vendors ("data brokers") registered in
IAB Europe's Transparency & Consent Framework, together with their data
processing purposes, legitimate-interest purposes, special purposes,
features and data categories.

No credentials are required. Only the standard library (urllib.request) is
used so that no extra dependencies are added.
"""

import http.client
import json
import os
import urllib.error
import urllib.request

DEFAULT_URL_V3 = "https://vendor-list.consensu.org/v3/vendor-list.json"
FALLBACK_URL_V2 = "https://vendor-list.consensu.org/v2/vendor-list.json"

_USER_AGENT = "fn_registry-fetch_iab_gvl/1.0 (+recon)"
_TIMEOUT_S = 30


def _download_json(url: str) -> dict:
    """Fetch ``url`` with an HTTP GET and return the decoded JSON object.

    Raises:
        OSError: transport failure (includes ``urllib.error.URLError`` and
            ``urllib.error.HTTPError``, which are ``OSError`` subclasses).
        http.client.HTTPException: protocol-level failure such as a
            truncated body (``IncompleteRead``).
        ValueError: the body is not valid UTF-8 / JSON, or its top-level
            value is not a JSON object.
    """
    req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp:
        raw = resp.read()
    payload = json.loads(raw.decode("utf-8"))
    # Everything downstream calls ``.get`` on the payload, so reject any other
    # JSON type here; the caller can then report it and try the next URL.
    if not isinstance(payload, dict):
        raise ValueError(
            f"top-level JSON value is not an object (got {type(payload).__name__})"
        )
    return payload


def _vendor_policy_url(vendor: dict) -> str:
    """Return the vendor's privacy-policy URL, or ``""`` if none is found.

    A non-empty string under ``policyUrl`` wins. Otherwise the ``urls`` list
    is searched for an entry with a truthy ``privacy`` value, preferring the
    entry whose ``langId`` is ``"en"`` and falling back to the first entry
    that has one.
    """
    direct = vendor.get("policyUrl")
    if isinstance(direct, str) and direct:
        return direct

    urls = vendor.get("urls") or []
    if not isinstance(urls, list):
        return ""

    with_privacy = [
        entry for entry in urls if isinstance(entry, dict) and entry.get("privacy")
    ]
    # Prefer the English block when present; otherwise the first with privacy.
    for entry in with_privacy:
        if entry.get("langId") == "en":
            return str(entry["privacy"])
    if with_privacy:
        return str(with_privacy[0]["privacy"])
    return ""


def _summarize_vendor(vendor: dict) -> dict:
    """Extract the useful fields of one vendor, tolerating missing keys.

    List-valued fields that are absent or falsy (e.g. ``None``) become ``[]``.
    """
    list_fields = (
        "purposes",
        "legIntPurposes",
        "specialPurposes",
        "features",
        "dataDeclaration",
    )
    summary = {"id": vendor.get("id", 0), "name": vendor.get("name", "")}
    for field in list_fields:
        summary[field] = vendor.get(field, []) or []
    summary["policyUrl"] = _vendor_policy_url(vendor)
    return summary


def _summarize_definitions(defs: dict) -> dict:
    """Summarize a definitions mapping (purposes, dataCategories, ...).

    Returns a dict keyed by ``str(key)`` holding ``id``, ``name`` and
    ``description`` for every dict-valued item; other items are skipped.
    """
    out: dict = {}
    for key, item in (defs or {}).items():
        if not isinstance(item, dict):
            continue
        out[str(key)] = {
            "id": item.get("id", 0),
            "name": item.get("name", ""),
            "description": item.get("description", ""),
        }
    return out


def fetch_iab_gvl(out_path: str = "", url: str = "", lang: str = "") -> dict:
    """Download and parse the IAB Europe TCF Global Vendor List (GVL).

    Args:
        out_path: if non-empty, the downloaded JSON is written to this path
            (re-serialized from the parsed payload; parent directories are
            created as needed).
        url: GVL endpoint. If empty, ``DEFAULT_URL_V3`` is tried first and
            ``FALLBACK_URL_V2`` is used if that fails.
        lang: optional ISO language code (e.g. ``"es"``). Currently unused:
            it does not change the endpoint, and the returned summary is
            whatever the main endpoint serves.

    Returns:
        On success::

            {"status": "ok", "gvlSpecificationVersion": ...,
             "vendorListVersion": ..., "tcfPolicyVersion": ...,
             "lastUpdated": ..., "n_vendors": int, "n_purposes": int,
             "n_specialPurposes": int, "n_features": int,
             "n_dataCategories": int, "vendors": {...}, "purposes": {...},
             "dataCategories": {...}}

        The ``n_*`` counters are the sizes of the raw mappings; malformed
        (non-dict) entries are counted there but left out of ``vendors``,
        ``purposes`` and ``dataCategories``.

        On any download, parse or file-write failure:
        ``{"status": "error", "error": "..."}``. This function never raises.
    """
    candidates = [url] if url else [DEFAULT_URL_V3, FALLBACK_URL_V2]

    data = None
    last_error = ""
    for candidate in candidates:
        try:
            data = _download_json(candidate)
            break
        except (http.client.HTTPException, ValueError, OSError) as exc:
            # OSError covers URLError/HTTPError; HTTPException covers protocol
            # errors (e.g. IncompleteRead) that are not OSError subclasses.
            last_error = f"{candidate}: {exc}"

    if data is None:
        return {"status": "error", "error": last_error or "no url candidates"}

    try:
        if out_path:
            parent = os.path.dirname(out_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(out_path, "w", encoding="utf-8") as fh:
                json.dump(data, fh, ensure_ascii=False)

        vendors_raw = data.get("vendors", {}) or {}
        purposes_raw = data.get("purposes", {}) or {}
        special_purposes_raw = data.get("specialPurposes", {}) or {}
        features_raw = data.get("features", {}) or {}
        data_categories_raw = data.get("dataCategories", {}) or {}

        # Skip malformed vendor entries instead of failing the whole list,
        # mirroring what _summarize_definitions does for definitions.
        vendors = {
            str(vid): _summarize_vendor(v)
            for vid, v in vendors_raw.items()
            if isinstance(v, dict)
        }

        return {
            "status": "ok",
            "gvlSpecificationVersion": data.get("gvlSpecificationVersion"),
            "vendorListVersion": data.get("vendorListVersion"),
            "tcfPolicyVersion": data.get("tcfPolicyVersion"),
            "lastUpdated": data.get("lastUpdated"),
            "n_vendors": len(vendors_raw),
            "n_purposes": len(purposes_raw),
            "n_specialPurposes": len(special_purposes_raw),
            "n_features": len(features_raw),
            "n_dataCategories": len(data_categories_raw),
            "vendors": vendors,
            "purposes": _summarize_definitions(purposes_raw),
            "dataCategories": _summarize_definitions(data_categories_raw),
        }
    except Exception as exc:  # noqa: BLE001 - contract: never raise.
        return {"status": "error", "error": str(exc)}


if __name__ == "__main__":
    import sys

    result = fetch_iab_gvl(out_path=sys.argv[1] if len(sys.argv) > 1 else "")
    # Print only the scalar metadata; the large mappings are omitted.
    print(json.dumps(
        {k: v for k, v in result.items() if k not in ("vendors", "purposes", "dataCategories")},
        indent=2,
    ))
    if result.get("status") == "ok":
        print(f"sample vendors: {list(result['vendors'].items())[:1]}")