Files
fn_registry/python/functions/pipelines/cdp_extract_recipe.py
T
egutierrez a03675113a chore: auto-commit (286 archivos)
- .claude/agents/fn-orquestador/SKILL.md
- .claude/commands/fn_claude.md
- .claude/rules/INDEX.md
- .claude/rules/cpp_apps.md
- .claude/rules/ids_naming.md
- CHANGELOG.md
- apps/dag_engine/README.md
- apps/dag_engine/api.go
- apps/dag_engine/dags_migrated/example.yaml
- apps/dag_engine/dags_migrated/example_lineage_tracking.yaml
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 16:33:22 +02:00

211 lines
7.1 KiB
Python

"""Ejecuta una recipe YAML contra Chrome remoto via CDP."""
import json
import re
import sys
import os
import time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import urllib.request
import websocket
from core.validate_recipe_yaml import validate_recipe_yaml
def _ws_send_recv(ws, msg_id: int, method: str, params: dict, timeout: float = 10.0) -> dict:
    """Send one CDP command and wait for the reply that carries the same id.

    The reply is captured by temporarily replacing ``ws.on_message`` with an
    interceptor, so ``ws`` must deliver incoming frames through that callback
    (presumably a ``websocket.WebSocketApp`` running its receive loop in
    another thread -- confirm against callers).

    Args:
        ws: Connection object exposing ``send`` and an ``on_message`` attribute.
        msg_id: Id placed in the request and expected in the reply.
        method: CDP method name.
        params: CDP method parameters.
        timeout: Maximum number of seconds to wait for the reply.

    Returns:
        The decoded reply message, or an empty dict if none arrived in time.
    """
    import threading

    reply = {}
    got_reply = threading.Event()
    previous_handler = ws.on_message

    def _intercept(ws_app, message):
        # Frames that are not JSON objects cannot be the reply; they are
        # ignored here but still forwarded to the previous handler.
        try:
            parsed = json.loads(message)
        except (TypeError, ValueError):
            parsed = None
        if isinstance(parsed, dict) and parsed.get("id") == msg_id:
            reply["result"] = parsed
            got_reply.set()
        if previous_handler:
            previous_handler(ws_app, message)

    ws.on_message = _intercept
    try:
        ws.send(json.dumps({"id": msg_id, "method": method, "params": params}))
        got_reply.wait(timeout=timeout)
    finally:
        # Always put the caller's handler back, even when send() raises.
        ws.on_message = previous_handler
    return reply.get("result", {})
def _poll_selector(ws, selector: str, timeout_s: float = 10.0) -> bool:
    """Poll the page until ``document.querySelector(selector)`` is non-null.

    Each attempt sends a ``Runtime.evaluate`` request and reads frames with
    ``ws.recv()`` until the reply with the matching id arrives; frames with
    other ids (or without one) are skipped. Attempts are spaced 200 ms apart.

    Args:
        ws: Synchronous connection exposing ``send`` and ``recv``.
        selector: CSS selector to look for.
        timeout_s: Overall time budget in seconds.

    Returns:
        True as soon as the selector is found, False once the budget runs out.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout_s
    # json.dumps quotes and escapes the selector as a JS string literal.
    expression = f"!!document.querySelector({json.dumps(selector)})"
    msg_id = 1000
    while time.monotonic() < deadline:
        ws.send(json.dumps({
            "id": msg_id,
            "method": "Runtime.evaluate",
            "params": {
                "expression": expression,
                "returnByValue": True,
            },
        }))
        try:
            while True:
                msg = json.loads(ws.recv())
                if isinstance(msg, dict) and msg.get("id") == msg_id:
                    break
                if time.monotonic() >= deadline:
                    return False
            if msg.get("result", {}).get("result", {}).get("value", False):
                return True
        except Exception:
            # Best effort: a failed read or an unparseable frame counts as
            # "not found yet" and the next attempt retries until the deadline.
            pass
        msg_id += 1
        # Never sleep past the deadline.
        time.sleep(min(0.2, max(0.0, deadline - time.monotonic())))
    return False
def cdp_extract_recipe(
recipe_path: str,
debug_port: int = 9222,
tab_id: str | None = None,
record_run: bool = True,
) -> dict:
"""Ejecuta una recipe YAML contra Chrome remoto via CDP.
Args:
recipe_path: Ruta al archivo .yaml de la recipe.
debug_port: Puerto de depuracion remota de Chrome. Default 9222.
tab_id: ID del tab a usar. Si None, busca tab cuyo URL matchee url_pattern.
record_run: Si True y output.sink=='data_factory.runs', llama data_factory_record_run.
Returns:
{status, rows_out, kb_out, duration_ms, error, sample_rows}
"""
start_ms = int(time.time() * 1000)
# Leer y validar recipe
try:
with open(recipe_path, "r", encoding="utf-8") as f:
yaml_text = f.read()
except OSError as e:
return {"status": "error", "rows_out": 0, "kb_out": 0.0,
"duration_ms": 0, "error": str(e), "sample_rows": []}
validation = validate_recipe_yaml(yaml_text)
if not validation["valid"]:
return {"status": "error", "rows_out": 0, "kb_out": 0.0,
"duration_ms": 0, "error": "recipe invalida: " + "; ".join(validation["errors"]),
"sample_rows": []}
recipe = validation["parsed"]
url_pattern = recipe["url_pattern"]
steps = recipe["steps"]
output_cfg = recipe.get("output", {})
sink = output_cfg.get("sink", "stdout")
# Obtener lista de tabs
try:
with urllib.request.urlopen(
f"http://127.0.0.1:{debug_port}/json/list", timeout=5
) as resp:
tabs = json.loads(resp.read().decode())
except Exception as e:
return {"status": "error", "rows_out": 0, "kb_out": 0.0,
"duration_ms": 0,
"error": f"no se pudo conectar a Chrome en port {debug_port}: {e}",
"sample_rows": []}
# Encontrar tab
ws_url = None
if tab_id:
for tab in tabs:
if tab.get("id") == tab_id:
ws_url = tab.get("webSocketDebuggerUrl")
break
else:
for tab in tabs:
tab_url = tab.get("url", "")
if re.search(url_pattern, tab_url):
ws_url = tab.get("webSocketDebuggerUrl")
break
if not ws_url:
return {"status": "error", "rows_out": 0, "kb_out": 0.0,
"duration_ms": 0,
"error": f"no tab matching pattern: {url_pattern}",
"sample_rows": []}
# Ejecutar steps
last_result = None
try:
ws = websocket.create_connection(ws_url, timeout=10)
try:
for i, step in enumerate(steps):
if "wait_selector" in step:
selector = step["wait_selector"]
found = _poll_selector(ws, selector, timeout_s=10.0)
if not found:
raise RuntimeError(f"step {i}: timeout esperando selector '{selector}'")
elif "js" in step:
ws.send(json.dumps({
"id": i + 1,
"method": "Runtime.evaluate",
"params": {
"expression": step["js"],
"returnByValue": True,
"awaitPromise": True,
}
}))
raw = ws.recv()
msg = json.loads(raw)
result_obj = msg.get("result", {}).get("result", {})
last_result = result_obj.get("value")
finally:
ws.close()
except Exception as e:
return {"status": "error", "rows_out": 0, "kb_out": 0.0,
"duration_ms": int(time.time() * 1000) - start_ms,
"error": str(e), "sample_rows": []}
# Calcular metricas
rows = last_result if isinstance(last_result, list) else (
[last_result] if last_result is not None else []
)
rows_out = len(rows)
kb_out = len(json.dumps(rows, ensure_ascii=False).encode()) / 1024
sample_rows = rows[:5]
duration_ms = int(time.time() * 1000) - start_ms
# Sink
if sink == "stdout":
print(json.dumps(rows, ensure_ascii=False, indent=2))
elif sink == "json_file":
out_path = output_cfg.get("path", "output.json")
with open(out_path, "w", encoding="utf-8") as f:
json.dump(rows, f, ensure_ascii=False, indent=2)
elif sink == "data_factory.runs" and record_run:
try:
from pipelines.data_factory_record_run import data_factory_record_run
data_factory_record_run(
node_id=recipe.get("name", "unknown"),
function_id="cdp_extract_recipe_py_pipelines",
args={"recipe_path": recipe_path, "debug_port": debug_port},
)
except Exception as e:
# No fatal — el dato ya fue extraido
pass
return {
"status": "ok",
"rows_out": rows_out,
"kb_out": round(kb_out, 2),
"duration_ms": duration_ms,
"error": "",
"sample_rows": sample_rows,
}