"""Orquesta trim_ax_tree -> chunk_ax_tree -> N llamadas claude_cli_prompt -> merge schema.""" import json import re import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from core.trim_ax_tree import trim_ax_tree from core.chunk_ax_tree import chunk_ax_tree from infra.claude_cli_prompt import claude_cli_prompt def _parse_json_response(text: str) -> dict: """Extrae JSON de la respuesta de Claude, tolerante a fenced code blocks.""" # Intentar fenced ```json ... ``` m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL) if m: return json.loads(m.group(1)) # Intentar JSON directo m = re.search(r"\{.*\}", text, re.DOTALL) if m: return json.loads(m.group(0)) raise ValueError(f"No se encontro JSON valido en respuesta: {text[:200]}") def _build_prompt(url: str, chunk_json: str) -> str: return ( f"Analiza este accessibility tree de la pagina {url}.\n" "Identifica campos de datos extraibles (tablas, listas, valores estructurados).\n" "Para cada campo propone:\n" " - field (snake_case)\n" " - selector CSS robusto que se pueda usar con document.querySelector\n" " - sample_value (valor visible representativo)\n" " - type (string|int|float|bool|date)\n" " - source_role (role del AXNode origen)\n" "\n" 'Devuelve JSON valido SIN explicacion:\n' '{"schema": [...], "notes": "..."}\n' "\n" "AX tree:\n" f"{chunk_json}" ) def llm_propose_scraping_schema( url: str, ax_tree: list, max_chunks: int = 5, max_chars_per_chunk: int = 25000, ) -> dict: """Orquesta: trim_ax_tree -> chunk_ax_tree -> N llamadas claude_cli_prompt -> merge. Args: url: URL de la pagina (se incluye en el prompt para contexto). ax_tree: AX tree como lista de dicts obtenida via CDP. max_chunks: Maximo de chunks a procesar (trunca el resto). max_chars_per_chunk: Caracteres maximos por chunk antes de pasar a Claude. Returns: {schema: [{field, selector, sample_value, type, source_role}], notes: str, chunks_processed: int, truncated: bool} """ trimmed = trim_ax_tree(ax_tree) chunks = chunk_ax_tree(trimmed, max_chars=max_chars_per_chunk) truncated = len(chunks) > max_chunks chunks = chunks[:max_chunks] merged_schema: list = [] seen_fields: set = set() notes_parts: list = [] for chunk in chunks: chunk_json = json.dumps(chunk, ensure_ascii=False) prompt = _build_prompt(url, chunk_json) try: response = claude_cli_prompt(prompt, timeout_s=60) parsed = _parse_json_response(response) except Exception as e: notes_parts.append(f"[chunk error: {e}]") continue for item in parsed.get("schema", []): field = item.get("field", "") if field and field not in seen_fields: seen_fields.add(field) merged_schema.append(item) note = parsed.get("notes", "") if note: notes_parts.append(note) return { "schema": merged_schema, "notes": " | ".join(notes_parts), "chunks_processed": len(chunks), "truncated": truncated, }