"""Run a YAML recipe against a remote Chrome instance over CDP."""

import json
import re
import sys
import os
import time

# Put the parent of this file's directory on sys.path before the project-local
# import below (presumably so that the ``core`` package resolves -- confirm).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import urllib.request

import websocket

from core.validate_recipe_yaml import validate_recipe_yaml


def _ws_send_recv(ws, msg_id: int, method: str, params: dict, timeout: float = 10.0) -> dict:
    """Send one CDP command and wait for the response carrying the same id.

    ``ws.on_message`` is temporarily replaced by a wrapper that captures the
    matching response and then forwards every message to the original
    handler. The original handler is always restored, even if sending fails.

    Args:
        ws: Object exposing ``on_message`` and ``send`` (callback-style client).
        msg_id: Id placed in the request and expected in the response.
        method: CDP method name.
        params: CDP method parameters.
        timeout: Maximum seconds to wait for the response.

    Returns:
        The decoded response message, or ``{}`` if none arrived in time.
    """
    import threading

    result_holder = {}
    event = threading.Event()
    original_on_message = ws.on_message

    def on_message_wrapper(ws_app, message):
        try:
            msg = json.loads(message)
            if msg.get("id") == msg_id:
                result_holder["result"] = msg
                event.set()
        except (ValueError, TypeError, AttributeError):
            # Non-JSON or non-object frames are simply not our response.
            pass
        if original_on_message:
            original_on_message(ws_app, message)

    ws.on_message = on_message_wrapper
    try:
        ws.send(json.dumps({"id": msg_id, "method": method, "params": params}))
        event.wait(timeout=timeout)
    finally:
        # Restore the handler even when send() raises.
        ws.on_message = original_on_message
    return result_holder.get("result", {})


def _recv_response(ws, msg_id: int, deadline: float | None = None) -> dict:
    """Read frames until the CDP response whose ``id`` equals *msg_id* arrives.

    Frames that are empty, not valid JSON, or carry a different id (events,
    stale replies) are skipped.

    Args:
        ws: A synchronous connection exposing ``recv()``; if the object has no
            ``recv`` of its own, ``ws.sock.recv()`` is used instead.
        msg_id: Id of the request whose response is wanted.
        deadline: Optional ``time.monotonic()`` value after which to give up.

    Returns:
        The decoded response, or ``{}`` if *deadline* passed first.

    Raises:
        Whatever the underlying ``recv()`` raises (timeout, closed connection).
    """
    # The object returned by websocket.create_connection has recv() itself;
    # its .sock is the raw socket, whose recv() needs a buffer size.
    recv = ws.recv if callable(getattr(ws, "recv", None)) else ws.sock.recv
    while deadline is None or time.monotonic() < deadline:
        raw = recv()
        if not raw:
            continue
        try:
            msg = json.loads(raw)
        except ValueError:
            continue
        if isinstance(msg, dict) and msg.get("id") == msg_id:
            return msg
    return {}


def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
    """Poll until ``document.querySelector(selector)`` is non-null.

    One ``Runtime.evaluate`` request is sent per attempt, with roughly 200 ms
    between attempts. Only the response matching the request id is examined.

    Args:
        ws: Open synchronous websocket connection to the tab.
        selector: CSS selector to wait for.
        timeout_s: Maximum seconds to keep polling.

    Returns:
        True once the selector matches, False if the timeout elapses first.
    """
    deadline = time.monotonic() + timeout_s
    # The selector is JSON-encoded so quotes inside it cannot break the JS.
    expression = f"!!document.querySelector({json.dumps(selector)})"
    msg_id = 1000
    while time.monotonic() < deadline:
        ws.send(json.dumps({
            "id": msg_id,
            "method": "Runtime.evaluate",
            "params": {
                "expression": expression,
                "returnByValue": True,
            },
        }))
        try:
            reply = _recv_response(ws, msg_id, deadline)
        except websocket.WebSocketTimeoutException:
            # No answer within the socket timeout; retry with a fresh id.
            reply = {}
        if reply.get("result", {}).get("result", {}).get("value", False):
            return True
        msg_id += 1
        time.sleep(0.2)
    return False


def _evaluate_js(ws, msg_id: int, expression: str):
    """Evaluate *expression* in the page and return its value.

    Raises:
        RuntimeError: If CDP reports a protocol error or the script threw.
    """
    ws.send(json.dumps({
        "id": msg_id,
        "method": "Runtime.evaluate",
        "params": {
            "expression": expression,
            "returnByValue": True,
            "awaitPromise": True,
        },
    }))
    reply = _recv_response(ws, msg_id)
    if "error" in reply:
        err = reply["error"]
        detail = err.get("message", err) if isinstance(err, dict) else err
        raise RuntimeError(f"CDP error: {detail}")
    result = reply.get("result", {})
    details = result.get("exceptionDetails")
    if details:
        description = (
            details.get("exception", {}).get("description")
            or details.get("text", "unknown error")
        )
        raise RuntimeError(f"js exception: {description}")
    return result.get("result", {}).get("value")


def _error_result(message: str, duration_ms: int = 0) -> dict:
    """Build the result dict returned for any failure."""
    return {
        "status": "error",
        "rows_out": 0,
        "kb_out": 0.0,
        "duration_ms": duration_ms,
        "error": message,
        "sample_rows": [],
    }


def cdp_extract_recipe(
    recipe_path: str,
    debug_port: int = 9222,
    tab_id: str | None = None,
    record_run: bool = True,
) -> dict:
    """Run a YAML recipe against a remote Chrome instance over CDP.

    The recipe is read and validated, a tab is selected, each step is
    executed in order, and the value of the last ``js`` step is sent to the
    configured sink. Failures are reported through the returned dict.

    Args:
        recipe_path: Path to the recipe ``.yaml`` file.
        debug_port: Chrome remote-debugging port. Defaults to 9222.
        tab_id: Id of the tab to use. If None, the first tab whose URL
            matches the recipe's ``url_pattern`` is used.
        record_run: If True and ``output.sink == 'data_factory.runs'``,
            ``data_factory_record_run`` is called.

    Returns:
        A dict with keys ``status``, ``rows_out``, ``kb_out``,
        ``duration_ms``, ``error`` and ``sample_rows``.
    """
    start_ms = int(time.time() * 1000)

    # Read and validate the recipe.
    try:
        with open(recipe_path, "r", encoding="utf-8") as f:
            yaml_text = f.read()
    except OSError as e:
        return _error_result(str(e))

    validation = validate_recipe_yaml(yaml_text)
    if not validation["valid"]:
        return _error_result("recipe invalida: " + "; ".join(validation["errors"]))

    recipe = validation["parsed"]
    url_pattern = recipe["url_pattern"]
    steps = recipe["steps"]
    output_cfg = recipe.get("output", {})
    sink = output_cfg.get("sink", "stdout")

    # Fetch the list of open tabs from Chrome's HTTP debugging endpoint.
    try:
        with urllib.request.urlopen(
            f"http://127.0.0.1:{debug_port}/json/list", timeout=5
        ) as resp:
            tabs = json.loads(resp.read().decode())
    except Exception as e:
        return _error_result(
            f"no se pudo conectar a Chrome en port {debug_port}: {e}"
        )

    # Pick the tab: by explicit id if given, otherwise by URL pattern.
    if tab_id:
        ws_url = next(
            (t.get("webSocketDebuggerUrl") for t in tabs if t.get("id") == tab_id),
            None,
        )
        if not ws_url:
            return _error_result(f"no tab with id: {tab_id}")
    else:
        try:
            pattern = re.compile(url_pattern)
        except re.error as e:
            return _error_result(f"invalid url_pattern {url_pattern!r}: {e}")
        ws_url = next(
            (
                t.get("webSocketDebuggerUrl")
                for t in tabs
                if pattern.search(t.get("url", ""))
            ),
            None,
        )
        if not ws_url:
            return _error_result(f"no tab matching pattern: {url_pattern}")

    # Execute the steps; the value of the last ``js`` step is the output.
    last_result = None
    try:
        ws = websocket.create_connection(ws_url, timeout=10)
        try:
            for i, step in enumerate(steps):
                if "wait_selector" in step:
                    selector = step["wait_selector"]
                    if not _poll_selector(ws, selector, timeout_s=10.0):
                        raise RuntimeError(
                            f"step {i}: timeout esperando selector '{selector}'"
                        )
                elif "js" in step:
                    try:
                        last_result = _evaluate_js(ws, i + 1, step["js"])
                    except RuntimeError as e:
                        raise RuntimeError(f"step {i}: {e}") from e
        finally:
            ws.close()
    except Exception as e:
        return _error_result(str(e), int(time.time() * 1000) - start_ms)

    # Compute metrics. A non-list result is wrapped as a single row.
    if isinstance(last_result, list):
        rows = last_result
    elif last_result is not None:
        rows = [last_result]
    else:
        rows = []
    rows_out = len(rows)
    kb_out = len(json.dumps(rows, ensure_ascii=False).encode()) / 1024
    sample_rows = rows[:5]
    duration_ms = int(time.time() * 1000) - start_ms

    # Deliver the rows to the configured sink.
    if sink == "stdout":
        print(json.dumps(rows, ensure_ascii=False, indent=2))
    elif sink == "json_file":
        out_path = output_cfg.get("path", "output.json")
        try:
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(rows, f, ensure_ascii=False, indent=2)
        except OSError as e:
            # Report like every other failure instead of raising.
            return _error_result(f"could not write {out_path}: {e}", duration_ms)
    elif sink == "data_factory.runs" and record_run:
        try:
            from pipelines.data_factory_record_run import data_factory_record_run
            data_factory_record_run(
                node_id=recipe.get("name", "unknown"),
                function_id="cdp_extract_recipe_py_pipelines",
                args={"recipe_path": recipe_path, "debug_port": debug_port},
            )
        except Exception as e:
            # Not fatal: the data was already extracted. Warn on stderr so the
            # failure is visible without polluting stdout output.
            print(f"warning: data_factory_record_run failed: {e}", file=sys.stderr)

    return {
        "status": "ok",
        "rows_out": rows_out,
        "kb_out": round(kb_out, 2),
        "duration_ms": duration_ms,
        "error": "",
        "sample_rows": sample_rows,
    }