// enrichers.cpp (patch hunk @@ -59,6 +59,57 @@, inserted right after
// parse_inline_list).

// Splits `s` on commas that sit at nesting level zero, i.e. outside any
// quoted string and outside any [] / {} pair.
//
// An inline YAML object such as `{ name: limit, type: int, default: 10 }`
// may carry quoted strings or nested lists that contain commas; a naive
// split on ',' would break those entries apart.
//
// Parameters:
//   s  Text between the outer braces of an inline object (the caller is
//      expected to have removed the outer `{` `}` already).
//
// Returns the pieces in order, untrimmed (leading/trailing whitespace and
// quotes are left for the caller to strip). Empty pieces between two
// consecutive commas are kept; a trailing empty piece is dropped.
std::vector<std::string> split_top_level(const std::string& s) {
    std::vector<std::string> out;
    std::string cur;
    int depth_b = 0;       // current [] nesting depth
    int depth_c = 0;       // current {} nesting depth
    char quote = 0;        // active quote character, 0 when outside quotes
    bool escaped = false;  // previous char was a backslash inside "..."

    for (char c : s) {
        if (quote) {
            cur.push_back(c);
            if (escaped) {
                // Character following a backslash is taken literally.
                escaped = false;
            } else if (c == '\\' && quote == '"') {
                // Backslash escapes exist only in double-quoted scalars;
                // single-quoted ones escape a quote by doubling it, which
                // the close/re-open sequence below already handles.
                escaped = true;
            } else if (c == quote) {
                quote = 0;
            }
            continue;
        }

        switch (c) {
        case '"':
        case '\'':
            quote = c;
            break;
        case '[':
            ++depth_b;
            break;
        case ']':
            // Clamp at zero: a stray closer must not push the depth
            // negative, otherwise no later comma would ever split.
            if (depth_b > 0) --depth_b;
            break;
        case '{':
            ++depth_c;
            break;
        case '}':
            if (depth_c > 0) --depth_c;
            break;
        case ',':
            if (depth_b == 0 && depth_c == 0) {
                out.push_back(cur);
                cur.clear();
                continue;  // the separator itself is not part of any piece
            }
            break;
        default:
            break;
        }
        cur.push_back(c);
    }

    if (!cur.empty()) out.push_back(cur);
    return out;
}

// Parses an inline YAML object `{ name: x, type: int, default: 10 }` into an
// EnricherParam. Returns true if at least `name` was resolved.
+bool parse_inline_param(const std::string& v, EnricherParam* out) { + std::string s = strip(v); + if (s.size() < 2 || s.front() != '{' || s.back() != '}') return false; + s = s.substr(1, s.size() - 2); + for (auto& kv : split_top_level(s)) { + size_t colon = kv.find(':'); + if (colon == std::string::npos) continue; + std::string k = strip(kv.substr(0, colon)); + std::string val = strip_quotes(strip(kv.substr(colon + 1))); + if (k == "name") out->name = val; + else if (k == "type") out->type = lower(val); + else if (k == "default") out->default_value = val; + else if (k == "description") out->description = val; + else if (k == "desc") out->description = val; + } + if (out->type.empty()) out->type = "string"; + return !out->name.empty(); +} + // Manifest YAML soportado (subset): // id: fetch_webpage // name: "Fetch web page" @@ -66,16 +117,19 @@ std::vector parse_inline_list(const std::string& v) { // applies_to: [Webpage, Url] // lang: python <- issue 0033: go|python|bash (default python) // exec: run <- basename del binario/script (default "run") -// params: <- v1 ignora bloque -// - { name: timeout_s, ... } +// params: +// - { name: timeout_s, type: int, default: 15 } +// - { name: region, type: string, default: "" } // -// Las claves anidadas bajo `params:` (y otros bloques con valor vacio -// seguido de lineas indentadas) se ignoran. +// Solo el bloque `params:` se parsea con detalle. Otros bloques con valor +// vacio seguido de lineas indentadas (`emits:`, `relations:`, +// `uses_functions:`) se ignoran como antes. bool parse_manifest(const std::string& path, EnricherSpec* out) { std::ifstream f(path); if (!f) return false; std::string line; - bool in_skip_block = false; + bool in_skip_block = false; + bool in_params_block = false; while (std::getline(f, line)) { // Strip CR de Windows. 
if (!line.empty() && line.back() == '\r') line.pop_back(); @@ -84,10 +138,27 @@ bool parse_manifest(const std::string& path, EnricherSpec* out) { std::string trim = strip(line); if (trim.empty() || trim.front() == '#') continue; - // Si la linea NO empieza con whitespace, salimos del bloque skip. + // Si la linea NO empieza con whitespace, salimos de los bloques + // anidados — el siguiente top-level reinicia el contexto. bool indented = !line.empty() && std::isspace((unsigned char)line.front()); - if (!indented) in_skip_block = false; + if (!indented) { + in_skip_block = false; + in_params_block = false; + } if (in_skip_block) continue; + if (in_params_block) { + // Linea esperada: ` - { name: x, type: int, default: 10 }`. + // Tolera variaciones de indent y comilla. + std::string body = trim; + if (!body.empty() && body.front() == '-') { + body = strip(body.substr(1)); + } + EnricherParam p; + if (parse_inline_param(body, &p)) { + out->params.push_back(std::move(p)); + } + continue; + } size_t colon = trim.find(':'); if (colon == std::string::npos) continue; @@ -101,9 +172,17 @@ bool parse_manifest(const std::string& path, EnricherSpec* out) { else if (key == "applies_to") out->applies_to = parse_inline_list(val); else if (key == "lang") out->lang = lower(strip_quotes(val)); else if (key == "exec") out->exec_basename = strip_quotes(val); - else if (key == "params" && val.empty()) in_skip_block = true; + else if (key == "params") { + // `params: []` — vacio explicito, nada que hacer. + // `params:` — siguiente bloque indentado son items. + std::string vs = strip(val); + if (vs.empty()) in_params_block = true; + // Si fuese inline (`params: [{...}]`) — formato no usado en + // los manifests actuales, lo ignoramos. 
+ } else if (key == "emits" && val.empty()) in_skip_block = true; else if (key == "relations" && val.empty()) in_skip_block = true; + else if (key == "uses_functions" && val.empty()) in_skip_block = true; } // Defaults — preservan retrocompat con manifests existentes que no diff --git a/enrichers.h b/enrichers.h index a082ad5..26bac80 100644 --- a/enrichers.h +++ b/enrichers.h @@ -10,11 +10,22 @@ // `enrichers_for_type(type_ref)` para mostrar el submenu filtrado por tipo // del nodo right-clickado. // -// Para v1 no parseamos `params` con detalle — solo lo necesario para -// presentar el item de menu y submitear el job con `{}`. +// Los parametros declarados en `params:` del manifest se parsean para que +// la UI pueda renderizar un dialog de configuracion antes de lanzar el +// job. Si la lista esta vacia, el job se submitea directamente con `{}`. namespace ge { +// Parametro declarado en `manifest.yaml` -> entrada `{ name, type, default }`. +// La UI de configuracion edita un buffer string por param y lo serializa a +// JSON segun el `type` al pulsar Run. +struct EnricherParam { + std::string name; // ej: "limit" + std::string type; // "int" | "float" | "string" | "bool" + std::string default_value; // valor por defecto en formato texto + std::string description; // opcional, para tooltip +}; + struct EnricherSpec { std::string id; // ej: "fetch_webpage" std::string name; // ej: "Fetch web page" @@ -34,6 +45,9 @@ struct EnricherSpec { // /{.exe} segun la plataforma. Default "run". std::string exec_basename; + // Parametros editables por el usuario antes de lanzar el job. + std::vector params; + // True si lang != "" y no se pudo resolver el ejecutable // correspondiente (ej: enricher Go sin compilar). 
El loader deja // el spec en el registro pero marcado como deshabilitado para diff --git a/enrichers/extract_domain/manifest.yaml b/enrichers/extract_domain/manifest.yaml index 7a14e75..ca568c9 100644 --- a/enrichers/extract_domain/manifest.yaml +++ b/enrichers/extract_domain/manifest.yaml @@ -1,7 +1,7 @@ id: extract_domain name: "Extract domain" description: "Saca el dominio de la url/email del nodo y crea/conecta una entidad Domain con relacion BELONGS_TO. No descarga nada." -applies_to: [Url, Webpage, Email] +applies_to: [Url, Email] emits: [Domain] relations: [BELONGS_TO] params: [] diff --git a/enrichers/extract_links/manifest.yaml b/enrichers/extract_links/manifest.yaml index e27e065..9a93a2b 100644 --- a/enrichers/extract_links/manifest.yaml +++ b/enrichers/extract_links/manifest.yaml @@ -1,7 +1,7 @@ id: extract_links name: "Extract links" -description: "Lee la markdown cacheada de un Webpage (metadata.markdown_path) y crea nodos Url para cada enlace encontrado, conectados con relacion LINKS_TO. Requiere haber ejecutado fetch_webpage antes." -applies_to: [Webpage] +description: "Lee la markdown cacheada del nodo Url (metadata.markdown_path) y crea nodos Url para cada enlace encontrado, conectados con relacion LINKS_TO. Requiere haber ejecutado fetch_webpage antes." +applies_to: [Url] emits: [Url] relations: [LINKS_TO] uses_functions: diff --git a/enrichers/extract_text_entities/manifest.yaml b/enrichers/extract_text_entities/manifest.yaml index e974411..8f4476b 100644 --- a/enrichers/extract_text_entities/manifest.yaml +++ b/enrichers/extract_text_entities/manifest.yaml @@ -1,7 +1,7 @@ id: extract_text_entities name: "Extract entities from text" -description: "Lee la markdown cacheada de un Webpage y extrae IoCs (IPs, emails, dominios, hashes, crypto wallets, CVEs, MAC, telefonos) creando entidades + relacion EXTRACTED_FROM. Sin coste — solo regex. Modelos ML (GLiNER/GLiREL) en futura iteracion." 
-applies_to: [Webpage] +description: "Lee la markdown cacheada de un Url y extrae IoCs (IPs, emails, dominios, hashes, crypto wallets, CVEs, MAC, telefonos) creando entidades + relacion EXTRACTED_FROM. Sin coste — solo regex. Modelos ML (GLiNER/GLiREL) en futura iteracion." +applies_to: [Url] emits: [Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone] relations: [EXTRACTED_FROM] uses_functions: diff --git a/enrichers/fetch_webpage/manifest.yaml b/enrichers/fetch_webpage/manifest.yaml index b967f0c..72b24dc 100644 --- a/enrichers/fetch_webpage/manifest.yaml +++ b/enrichers/fetch_webpage/manifest.yaml @@ -1,7 +1,7 @@ id: fetch_webpage name: "Fetch web page" -description: "Descarga HTML de una URL, extrae markdown limpio (readabilipy) y guarda los blobs en cache. Crea/actualiza el nodo Webpage con title/status_code/paths y crea el Domain con relacion BELONGS_TO." -applies_to: [Url, Webpage] +description: "Descarga HTML de una URL, extrae markdown limpio (readabilipy) y guarda los blobs en cache. Actualiza el nodo Url con title/status_code/paths/markdown en metadata y crea el Domain con relacion BELONGS_TO." +applies_to: [Url] emits: [Domain] relations: [BELONGS_TO] uses_functions: diff --git a/enrichers/fetch_webpage/run.py b/enrichers/fetch_webpage/run.py index 6d065e5..ca88f3f 100755 --- a/enrichers/fetch_webpage/run.py +++ b/enrichers/fetch_webpage/run.py @@ -3,7 +3,12 @@ Lee JSON de stdin, descarga la URL del nodo, convierte HTML a markdown, guarda blobs en `//.{html,md}`, actualiza el -nodo a tipo Webpage con metadata enriquecida y crea/conecta el Domain. +nodo (deja type_ref=Url) con metadata enriquecida y crea/conecta el Domain. + +Nota: historicamente fetch_webpage convertia Url -> Webpage, pero esos +dos tipos se han unificado en Url. Los campos de cuerpo cacheado +(html_path, markdown_path, status_code, fetched_at, text_length, ...) +viven en metadata. 
Wire protocol (issue 0026): - stdin: JSON con node_id, metadata, ops_db_path, app_dir, cache_dir, @@ -289,7 +294,14 @@ def main() -> int: log(f"node {node_id} disappeared") return 6 cur_type, cur_meta = row[0], row[1] or "{}" - new_type = "Webpage" if cur_type.lower() == "url" else cur_type or "Webpage" + # Webpage fue un tipo separado historicamente. Hoy se unifica en + # Url (mismo tipo, los campos de cuerpo cacheado viven en + # metadata): si el nodo entrante es Url o el legacy Webpage, lo + # dejamos como Url; si el nodo no tiene tipo, default Url. + if not cur_type or cur_type.lower() in ("url", "webpage"): + new_type = "Url" + else: + new_type = cur_type patch = { "url": url, diff --git a/enrichers/web_search/__pycache__/run.cpython-312.pyc b/enrichers/web_search/__pycache__/run.cpython-312.pyc deleted file mode 100644 index cef5a49..0000000 Binary files a/enrichers/web_search/__pycache__/run.cpython-312.pyc and /dev/null differ diff --git a/enrichers/web_search/run.py b/enrichers/web_search/run.py index b09b15d..3f4cbca 100755 --- a/enrichers/web_search/run.py +++ b/enrichers/web_search/run.py @@ -8,14 +8,20 @@ Wire protocol estandar (issue 0026): - stdout: una linea JSON al final con resumen. - exit code 0 = ok, !=0 = error. -DDG endpoint usado: https://html.duckduckgo.com/html/?q= -Devuelve HTML estatico, sin JavaScript. Los enlaces vienen envueltos en -redireccion `//duckduckgo.com/l/?uddg=` que hay que decodificar. +DDG endpoints usados: + 1. https://lite.duckduckgo.com/lite/ (POST) — endpoint primario. + HTML minimo (ano 2009-style), tabla con `` y + ``. Es el menos agresivo con bot + detection; suele responder 200 cuando el endpoint `html.` ya + devuelve un challenge "anomaly" desde IPs residenciales/Windows. + 2. https://html.duckduckgo.com/html/ (POST) — fallback. Su parser + usa `result__a` / `result__snippet`. DDG envuelve los enlaces en + `//duckduckgo.com/l/?uddg=` que hay que decodificar. 
-Para automatizar busquedas masivas en el futuro (sesion persistente, -cookies, JS, captchas) la fase 2 introducira un enricher `web_search_cdp` -que controle un Chromium remoto via DevTools Protocol. Este es el -fallback simple zero-infra. +Si ambos endpoints devuelven la pagina anti-bot ("anomaly", challenge +captcha), el enricher emite un error claro indicando que se necesita +`web_search_cdp` (issue 0029) — el fallback simple zero-infra no puede +resolver el challenge. """ from __future__ import annotations @@ -49,13 +55,33 @@ def now_ms() -> int: return int(time.time() * 1000) -def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str: - """Descarga la pagina HTML de resultados de DuckDuckGo. +def _ddg_post(url: str, params: dict, headers: dict, timeout: int) -> str: + try: + import requests # type: ignore + r = requests.post(url, data=params, headers=headers, timeout=timeout) + return r.text + except ImportError: + from urllib.parse import urlencode + from urllib.request import Request, urlopen + body = urlencode(params).encode() + req = Request(url, data=body, headers=headers) + with urlopen(req, timeout=timeout) as resp: # type: ignore + return resp.read().decode("utf-8", errors="replace") - El endpoint `html.duckduckgo.com` no requiere JS y respeta los - parametros `kl` (region) y `kp` (safe search: 1 strict, -1 off, - -2 moderate). Inyecta cookie para que el "moderate" se aplique sin - pantalla intermedia. + +def is_anomaly_page(htmltxt: str) -> bool: + """Detecta la pagina anti-bot de DDG (challenge captcha).""" + s = htmltxt.lower() + return "anomaly" in s and "challenge" in s + + +def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> tuple[str, str]: + """Descarga la pagina de resultados de DuckDuckGo. + + Intenta primero `lite.duckduckgo.com/lite/` (HTML minimo, ano-2009 + style, mucho menos agresivo con bot detection que `html.`). Si + ese endpoint devuelve la pagina anti-bot, cae al endpoint `html.`. 
+ Devuelve `(html, source)` donde source ∈ {"lite", "html"}. """ params = {"q": query} if region: @@ -66,29 +92,22 @@ def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str: headers = { "User-Agent": ( - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120 Safari/537.36" ), "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.7", } - try: - import requests # type: ignore - r = requests.post( - "https://html.duckduckgo.com/html/", - data=params, - headers=headers, - timeout=timeout, - ) - return r.text - except ImportError: - from urllib.parse import urlencode - from urllib.request import Request, urlopen - body = urlencode(params).encode() - req = Request("https://html.duckduckgo.com/html/", data=body, - headers=headers) - with urlopen(req, timeout=timeout) as resp: # type: ignore - return resp.read().decode("utf-8", errors="replace") + + htmltxt = _ddg_post("https://lite.duckduckgo.com/lite/", params, + headers, timeout) + if not is_anomaly_page(htmltxt): + return htmltxt, "lite" + + log("lite endpoint devolvio challenge — fallback a html endpoint") + htmltxt = _ddg_post("https://html.duckduckgo.com/html/", params, + headers, timeout) + return htmltxt, "html" def decode_ddg_href(href: str) -> str: @@ -195,7 +214,7 @@ class _DDGParser(HTMLParser): def parse_ddg_html(htmltxt: str) -> list[dict]: - """Parsea el HTML de DDG y devuelve [{url, title, snippet, rank}].""" + """Parsea el HTML del endpoint `html.duckduckgo.com`.""" p = _DDGParser() try: p.feed(htmltxt) @@ -221,6 +240,100 @@ def parse_ddg_html(htmltxt: str) -> list[dict]: return out +class _DDGLiteParser(HTMLParser): + """Parser para `lite.duckduckgo.com/lite/`. + + Estructura tipica: + title + ... 
+ snippet text + Los snippets vienen DESPUES del enlace (no hijo del mismo elemento), + asi que parea por orden: cada `result-link` consume el siguiente + `result-snippet`. + """ + + def __init__(self) -> None: + super().__init__(convert_charrefs=True) + self.results: list[dict] = [] + self._in_link = False + self._in_snippet = False + self._cur_href = "" + self._title_buf: list[str] = [] + self._snippet_buf: list[str] = [] + self._pending_snippet_for: int | None = None + + def _attrs_dict(self, attrs): + return {k: (v or "") for k, v in attrs} + + def handle_starttag(self, tag: str, attrs): + a = self._attrs_dict(attrs) + cls = a.get("class", "") + if tag == "a" and "result-link" in cls: + href = a.get("href", "") + self._in_link = True + self._cur_href = href + self._title_buf = [] + elif tag == "td" and "result-snippet" in cls: + self._in_snippet = True + self._snippet_buf = [] + + def handle_endtag(self, tag: str): + if self._in_link and tag == "a": + title = " ".join("".join(self._title_buf).split()) + self.results.append({ + "href": self._cur_href, + "title": title, + "snippet": "", + }) + self._pending_snippet_for = len(self.results) - 1 + self._in_link = False + elif self._in_snippet and tag == "td": + snippet = " ".join("".join(self._snippet_buf).split()) + if self._pending_snippet_for is not None: + self.results[self._pending_snippet_for]["snippet"] = snippet + self._pending_snippet_for = None + self._in_snippet = False + + def handle_data(self, data: str): + if self._in_link: + self._title_buf.append(data) + elif self._in_snippet: + self._snippet_buf.append(data) + + +def parse_ddg_lite(htmltxt: str) -> list[dict]: + """Parsea el HTML del endpoint `lite.duckduckgo.com/lite/`.""" + p = _DDGLiteParser() + try: + p.feed(htmltxt) + p.close() + except Exception as e: + log(f"DDG lite parser failed: {e}") + + out: list[dict] = [] + seen: set[str] = set() + for r in p.results: + href = r.get("href") or "" + # lite envia URLs absolutas directas; aun asi pasamos 
por + # decode_ddg_href por si en algun caso DDG envuelve. + url = decode_ddg_href(href) + if not url or not url.startswith(("http://", "https://")): + continue + # Excluir auto-promociones de DDG (paginas de ayuda). + if "duckduckgo.com/duckduckgo-help-pages/" in url: + continue + if url in seen: + continue + seen.add(url) + out.append({ + "url": url, + "title": r.get("title") or "", + "snippet": r.get("snippet") or "", + "rank": len(out) + 1, + }) + return out + + def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None: """Busca un nodo Url existente con la misma url en metadata.""" cur = conn.execute( @@ -384,18 +497,40 @@ def main() -> int: progress(0.10, "fetching") try: - htmltxt = fetch_ddg(query, timeout=timeout_s, region=region, safe=safe) + htmltxt, source = fetch_ddg(query, timeout=timeout_s, + region=region, safe=safe) except Exception as e: log(f"DDG fetch failed: {e}") print(json.dumps({"error": str(e), "query": query, "entities_added": 0, "relations_added": 0})) return 4 + if is_anomaly_page(htmltxt): + log("DDG devolvio challenge captcha en ambos endpoints — " + "usar web_search_cdp (issue 0029) para resolver") + print(json.dumps({ + "error": "DDG bot challenge — captcha required", + "query": query, + "engine": "duckduckgo", + "source": source, + "results": 0, + "entities_added": 0, + "relations_added": 0, + }, ensure_ascii=False)) + return 4 + progress(0.55, "parsing") - results = parse_ddg_html(htmltxt) + # El parser se elige por contenido — si el endpoint y el markup no + # coinciden (tests con stub que sirve cualquier URL, o un cambio + # futuro de DDG), aun extraemos resultados. Probamos ambos y nos + # quedamos con el que devuelva mas. 
+ results_lite = parse_ddg_lite(htmltxt) if "result-link" in htmltxt else [] + results_html = parse_ddg_html(htmltxt) if "result__a" in htmltxt else [] + results = results_lite if len(results_lite) >= len(results_html) else results_html if limit > 0: results = results[:limit] - log(f"DDG returned {len(results)} results") + log(f"DDG ({source}) returned {len(results)} results " + f"(lite_parsed={len(results_lite)} html_parsed={len(results_html)})") progress(0.80, "applying") conn = sqlite3.connect(ops_db_path) diff --git a/examples/types.yaml b/examples/types.yaml index 7e4f882..5b1ca3f 100644 --- a/examples/types.yaml +++ b/examples/types.yaml @@ -101,25 +101,18 @@ entities: - { name: country, type: string } - { name: postcode, type: string } + # Url — unifica el viejo Url (solo metadata) y Webpage (cuerpo + # cacheado). Tras fetch_webpage, los campos `*_path`, `status_code`, + # `fetched_at`, `text_length`, etc. tienen valor; sin haber corrido + # fetch siguen vacios pero el nodo sigue siendo un Url valido. - name: Url color: "#89E0FC" icon: ti-link principal_field: url - fields: - - { name: url, type: url, required: true } - - { name: title, type: string } - - { name: domain, type: string } - - # Documento web descargado. Issue 0027: tipo separado de Url para nodos - # con cuerpo cacheado (HTML+markdown+screenshot). Los enrichers - # fetch_webpage / extract_links / extract_text_entities lo pueblan. - - name: Webpage - color: "#89E0FC" - icon: ti-file-text - principal_field: url fields: - { name: url, type: url, required: true } - { name: title, type: string } + - { name: domain, type: string } - { name: status_code, type: int } - { name: content_type, type: string } - { name: fetched_at, type: date } diff --git a/jobs.cpp b/jobs.cpp index 184a764..ba0d841 100644 --- a/jobs.cpp +++ b/jobs.cpp @@ -378,15 +378,20 @@ std::string read_entity_field(const char* db_path, const char* id, return out; } -// JSON entregado al subprocess. 
Todos los paths se normalizan a WSL en -// Windows; en POSIX los respeta tal cual. +// JSON entregado al subprocess. En Windows, los paths se normalizan a +// forma WSL solo cuando el subprocess corre dentro de WSL (lang=bash, o +// python con runtime registry_venv). Para subprocesses nativos Windows +// (lang=go, o python embedded/FN_PYTHON/system) se mantienen los paths +// Windows-nativos — pasarlos como /mnt/c/... haria que fallen al abrir. +// En POSIX la conversion es no-op y siempre se respetan los paths. std::string build_stdin_json(const std::string& job_id, const std::string& enricher_id, const std::string& node_id, const std::string& params_json, const std::string& ops_db, const std::string& app_dir, - const std::string& registry_root) + const std::string& registry_root, + const std::string& lang) { std::string node_type, node_name, node_metadata = "{}"; if (!node_id.empty()) { @@ -420,10 +425,25 @@ std::string build_stdin_json(const std::string& job_id, std::string app_dir_abs = absify(app_dir); std::string root_abs = absify(registry_root); - std::string ops_db_wsl = to_wsl_path(ops_db_abs); - std::string app_dir_wsl = to_wsl_path(app_dir_abs); - std::string root_wsl = to_wsl_path(root_abs); - std::string cache_dir = app_dir_wsl + "/cache"; + // Decidir si convertir paths a forma WSL. Solo se hace cuando el + // subprocess vive dentro de WSL — si no, los paths /mnt/c/... no + // existen para el proceso Windows-nativo. + bool use_wsl_paths = false; +#ifdef _WIN32 + if (lang == "bash") { + use_wsl_paths = true; + } else if (lang == "python") { + use_wsl_paths = cached_python_runtime().needs_wsl; + } + // lang == "go": siempre nativo Windows. +#else + (void)lang; +#endif + + std::string ops_db_out = use_wsl_paths ? to_wsl_path(ops_db_abs) : ops_db_abs; + std::string app_dir_out = use_wsl_paths ? to_wsl_path(app_dir_abs) : app_dir_abs; + std::string root_out = use_wsl_paths ? 
to_wsl_path(root_abs) : root_abs; + std::string cache_dir = app_dir_out + "/cache"; std::ostringstream o; o << '{' @@ -434,10 +454,10 @@ std::string build_stdin_json(const std::string& job_id, << "\"node_name\":\"" << json_escape(node_name) << "\"," << "\"metadata\":" << (node_metadata.empty() ? "{}" : node_metadata) << "," << "\"params\":" << (params_json.empty() ? "{}" : params_json) << "," - << "\"ops_db_path\":\"" << json_escape(ops_db_wsl) << "\"," - << "\"app_dir\":\"" << json_escape(app_dir_wsl) << "\"," + << "\"ops_db_path\":\"" << json_escape(ops_db_out) << "\"," + << "\"app_dir\":\"" << json_escape(app_dir_out) << "\"," << "\"cache_dir\":\"" << json_escape(cache_dir) << "\"," - << "\"registry_root\":\"" << json_escape(root_wsl) << "\"" + << "\"registry_root\":\"" << json_escape(root_out) << "\"" << '}'; return o.str(); } @@ -1030,7 +1050,7 @@ void worker_loop() { } std::string stdin_payload = build_stdin_json( ctx.id, ctx.enricher_id, ctx.node_id, ctx.params_json, - ops_db, g_state->app_dir, g_state->registry_root); + ops_db, g_state->app_dir, g_state->registry_root, lang); ProcResult res = run_subprocess(job_id, run_path, lang, stdin_payload, ctrl); diff --git a/main.cpp b/main.cpp index 6c397a9..43cd088 100644 --- a/main.cpp +++ b/main.cpp @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #ifndef _WIN32 @@ -318,27 +320,91 @@ static void place_orphans_near_neighbors(GraphData& g, float min_dist, int park_n = 0; int placed_neighbor = 0, placed_camera = 0, parked = 0; - for (int i = 0; i < g.node_count; ++i) { - GraphNode& n = g.nodes[i]; - if (n.x != 0.0f || n.y != 0.0f) continue; + // ----- Pase 1: agrupar orphans por su anchor (vecino con posicion) ----- + // Cuando un enricher crea N nodos todos conectados al mismo source + // (caso tipico: web_search → N Urls SEARCH_RESULT_OF source), queremos + // que los N nodos clustereen MUY apretados alrededor del source en + // un solo anillo, no que se desperdiguen por anillos 
concentricos + // hasta encontrar slot libre. La busqueda anti-colision individual + // los empuja hacia fuera cuando ya hay vecinos preexistentes; aqui + // les damos a los hermanos del mismo anchor angulos repartidos en + // un anillo unico cerca del padre. + std::unordered_map> orphans_by_anchor; + std::vector orphans_no_anchor; + for (int i = 0; i < g.node_count; ++i) { + const GraphNode& n = g.nodes[i]; + if (n.x != 0.0f || n.y != 0.0f) continue; int parent = layout_first_placed_neighbor(g, i); - if (parent >= 0) { - float ox, oy; - if (find_collision_free_slot( - g, i, g.nodes[parent].x, g.nodes[parent].y, - min_dist, n.user_data, - neighbor_radii, n_neighbor_radii, &ox, &oy)) { - n.x = ox; n.y = oy; - } else { - // Acepta solape como ultimo recurso. - n.x = g.nodes[parent].x + neighbor_radii[n_neighbor_radii - 1]; - n.y = g.nodes[parent].y; + if (parent >= 0) orphans_by_anchor[parent].push_back(i); + else orphans_no_anchor.push_back(i); + } + + // ----- Pase 2: place clusters (orphans con anchor) ----- + // Para cada anchor con sus hijos, los repartimos en un anillo + // alrededor del padre. Si hay mas hijos de los que caben en el + // anillo base, abrimos anillos adicionales. Cada hijo sigue + // pasando find_collision_free_slot como fallback si el slot ideal + // estaba ocupado por otro nodo del grafo. + const float two_pi = 6.28318530718f; + for (auto& kv : orphans_by_anchor) { + int parent = kv.first; + std::vector& kids = kv.second; + if (kids.empty()) continue; + // Orden estable por user_data para que rondas sucesivas del + // mismo enricher (mismo set de hijos) coloquen igual. + std::sort(kids.begin(), kids.end(), + [&](int a, int b) { + return g.nodes[a].user_data < g.nodes[b].user_data; + }); + float cx = g.nodes[parent].x; + float cy = g.nodes[parent].y; + // Capacidad por anillo: circunferencia / min_dist. + // Para min_dist=60, ring r=80 -> ~8 slots; r=140 -> ~14. 
+ for (size_t k = 0; k < kids.size(); ++k) { + // Anillo y slot dentro del anillo en funcion del indice. + int ri = 0; size_t accum = 0; size_t cap = 0; + for (; ri < n_neighbor_radii; ++ri) { + float r_here = neighbor_radii[ri]; + cap = (size_t)std::max(6.0f, two_pi * r_here / min_dist); + if (k < accum + cap) break; + accum += cap; } - n.vx = n.vy = 0.0f; + if (ri >= n_neighbor_radii) ri = n_neighbor_radii - 1; + float r_use = neighbor_radii[ri]; + cap = (size_t)std::max(6.0f, two_pi * r_use / min_dist); + size_t slot = k - accum; + // Jitter pequeno por user_data para que rondas distintas no + // queden alineadas si comparten anchor. + uint64_t seed = g.nodes[kids[k]].user_data; + float jitter = ((float)((seed >> 16) & 0xFF) / 255.0f) * (two_pi / cap); + float angle = jitter + (float)slot * (two_pi / cap); + float px = cx + r_use * std::cos(angle); + float py = cy + r_use * std::sin(angle); + // Si el slot ideal colisiona con un nodo ajeno al cluster, + // delegamos en find_collision_free_slot que probara mas + // angulos en radios crecientes. + GraphNode& kid = g.nodes[kids[k]]; + if (layout_no_collision(g, kids[k], px, py, min_dist)) { + kid.x = px; kid.y = py; + } else { + float ox, oy; + if (find_collision_free_slot( + g, kids[k], cx, cy, min_dist, seed, + neighbor_radii, n_neighbor_radii, &ox, &oy)) { + kid.x = ox; kid.y = oy; + } else { + kid.x = px; kid.y = py; // ultimo recurso: solape + } + } + kid.vx = kid.vy = 0.0f; ++placed_neighbor; - continue; } + } + + // ----- Pase 3: place orphans sin anchor (camera o parking lot) ----- + for (int i : orphans_no_anchor) { + GraphNode& n = g.nodes[i]; if (use_camera) { // Sin vecino → colocar dentro de la camara con ring placement. 
@@ -875,10 +941,29 @@ static void render_context_menu() { } else { for (const auto& s : specs) { if (ImGui::MenuItem(s.name.c_str())) { - char job_id[64]; - bool ok = ge::jobs_submit(s.id.c_str(), sql_id, lbl, - "{}", job_id, sizeof(job_id)); - if (ok) g_app.panel_jobs = true; + if (s.params.empty()) { + // Sin params editables: submit directo, comportamiento + // historico — un click y a correr. + char job_id[64]; + bool ok = ge::jobs_submit(s.id.c_str(), sql_id, lbl, + "{}", job_id, sizeof(job_id)); + if (ok) g_app.panel_jobs = true; + } else { + // Abrir ventana de configuracion. Inicializar + // buffers con los defaults del manifest. + g_app.enr_modal_id = s.id; + g_app.enr_modal_node_id = sql_id; + g_app.enr_modal_node_label = lbl ? lbl : ""; + g_app.enr_modal_param_bufs.clear(); + g_app.enr_modal_param_bufs.resize(s.params.size()); + for (size_t i = 0; i < s.params.size(); ++i) { + const std::string& dv = s.params[i].default_value; + auto& buf = g_app.enr_modal_param_bufs[i]; + buf.assign(256, '\0'); + std::snprintf(buf.data(), buf.size(), "%s", dv.c_str()); + } + g_app.enr_window_open = true; + } } if (!s.description.empty() && ImGui::IsItemHovered()) { ImGui::SetTooltip("%s", s.description.c_str()); @@ -891,6 +976,171 @@ static void render_context_menu() { ImGui::EndPopup(); } +// ---------------------------------------------------------------------------- +// Modal: configurar parametros de enricher antes de lanzar el job +// ---------------------------------------------------------------------------- +// Se invoca desde el context menu (Run enricher → click). Si el enricher +// declara `params` en su manifest, en lugar de submitear directamente, +// llenamos el AppState (ver bloque `enr_modal_*`) y aqui renderizamos el +// dialogo. El usuario ajusta valores y al pulsar Run construimos el +// JSON `{ "param": value, ... }` y lo pasamos a `jobs_submit`. 
+ +static std::string json_escape_str(const std::string& s) { + std::string out; + out.reserve(s.size() + 8); + for (char c : s) { + switch (c) { + case '"': out += "\\\""; break; + case '\\': out += "\\\\"; break; + case '\n': out += "\\n"; break; + case '\r': out += "\\r"; break; + case '\t': out += "\\t"; break; + default: + if ((unsigned char)c < 0x20) { + char b[8]; + std::snprintf(b, sizeof(b), "\\u%04x", (unsigned char)c); + out += b; + } else { + out.push_back(c); + } + } + } + return out; +} + +// Renderiza una fila label/input dentro de una BeginTable de 2 columnas. +// El label va a la izquierda alineado al frame del input; el input usa +// todo el ancho disponible de la columna derecha. +static void labeled_row_begin(const char* label) { + ImGui::TableNextRow(); + ImGui::TableNextColumn(); + ImGui::AlignTextToFramePadding(); + ImGui::TextUnformatted(label); + ImGui::TableNextColumn(); + ImGui::SetNextItemWidth(-FLT_MIN); +} + +static void render_enricher_config_window() { + if (!g_app.enr_window_open) return; + + ImGui::SetNextWindowSize(ImVec2(420, 0), ImGuiCond_FirstUseEver); + if (!ImGui::Begin("Run enricher", &g_app.enr_window_open, + ImGuiWindowFlags_NoCollapse)) { + ImGui::End(); + return; + } + + const ge::EnricherSpec* spec = ge::enricher_by_id(g_app.enr_modal_id.c_str()); + if (!spec) { + ImGui::TextDisabled("(enricher no encontrado)"); + ImGui::End(); + return; + } + + ImGui::Text("%s", spec->name.c_str()); + if (!spec->description.empty()) { + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.7f, 0.7f, 0.7f, 1.0f)); + ImGui::TextWrapped("%s", spec->description.c_str()); + ImGui::PopStyleColor(); + } + ImGui::Separator(); + ImGui::TextDisabled("Node: %s", g_app.enr_modal_node_label.c_str()); + ImGui::Spacing(); + + // Asegurar tamaño de buffers — un manifest puede haberse recargado + // con mas params de los que llenamos al abrir la ventana. 
+ if (g_app.enr_modal_param_bufs.size() < spec->params.size()) { + g_app.enr_modal_param_bufs.resize(spec->params.size()); + } + + if (ImGui::BeginTable("##enr_params", 2, + ImGuiTableFlags_SizingStretchProp | + ImGuiTableFlags_NoBordersInBody)) { + ImGui::TableSetupColumn("name", ImGuiTableColumnFlags_WidthFixed, 110.0f); + ImGui::TableSetupColumn("value", ImGuiTableColumnFlags_WidthStretch); + + for (size_t i = 0; i < spec->params.size(); ++i) { + const auto& p = spec->params[i]; + auto& buf = g_app.enr_modal_param_bufs[i]; + if (buf.size() < 256) buf.resize(256, '\0'); + + ImGui::PushID((int)i); + labeled_row_begin(p.name.c_str()); + const std::string& t = p.type; + if (t == "int") { + int v = std::atoi(buf.data()); + if (ImGui::InputInt("##v", &v, 1, 10)) { + std::snprintf(buf.data(), buf.size(), "%d", v); + } + } else if (t == "float" || t == "double" || t == "number") { + float v = (float)std::atof(buf.data()); + if (ImGui::InputFloat("##v", &v)) { + std::snprintf(buf.data(), buf.size(), "%g", v); + } + } else if (t == "bool") { + bool v = (std::strcmp(buf.data(), "true") == 0 || + std::strcmp(buf.data(), "1") == 0); + if (ImGui::Checkbox("##v", &v)) { + std::snprintf(buf.data(), buf.size(), "%s", v ? "true" : "false"); + } + } else { + ImGui::InputText("##v", buf.data(), buf.size()); + } + if (!p.description.empty() && ImGui::IsItemHovered()) { + ImGui::SetTooltip("%s", p.description.c_str()); + } + ImGui::PopID(); + } + ImGui::EndTable(); + } + + ImGui::Separator(); + if (ImGui::Button("Run", ImVec2(100, 0))) { + // Construir JSON `{ "name": value, ... }` segun los tipos. 
+ std::string j = "{"; + for (size_t i = 0; i < spec->params.size(); ++i) { + const auto& p = spec->params[i]; + const auto& buf = g_app.enr_modal_param_bufs[i]; + if (i) j += ","; + j += "\""; + j += json_escape_str(p.name); + j += "\":"; + if (p.type == "int") { + int v = std::atoi(buf.data()); + char b[32]; std::snprintf(b, sizeof(b), "%d", v); + j += b; + } else if (p.type == "float" || p.type == "double" || p.type == "number") { + double v = std::atof(buf.data()); + char b[64]; std::snprintf(b, sizeof(b), "%g", v); + j += b; + } else if (p.type == "bool") { + bool v = (std::strcmp(buf.data(), "true") == 0 || + std::strcmp(buf.data(), "1") == 0); + j += v ? "true" : "false"; + } else { + j += "\""; + j += json_escape_str(buf.data()); + j += "\""; + } + } + j += "}"; + + char job_id[64]; + bool ok = ge::jobs_submit(spec->id.c_str(), + g_app.enr_modal_node_id.c_str(), + g_app.enr_modal_node_label.c_str(), + j.c_str(), job_id, sizeof(job_id)); + if (ok) g_app.panel_jobs = true; + g_app.enr_window_open = false; + } + ImGui::SameLine(); + if (ImGui::Button("Cancel", ImVec2(100, 0))) { + g_app.enr_window_open = false; + } + + ImGui::End(); +} + // ---------------------------------------------------------------------------- // Label callback // ---------------------------------------------------------------------------- @@ -1742,6 +1992,9 @@ static void render() { ImGui::SetNextWindowSize(ImVec2(520.0f, 720.0f), ImGuiCond_FirstUseEver); ge::chat_render(&g_app.panel_chat); + // Enricher config window (abierto desde context menu Run enricher). 
+ render_enricher_config_window(); + g_first_render = false; } diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index df002ae..0000000 Binary files a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc and /dev/null differ diff --git a/tests/__pycache__/test_extract_links.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_links.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index a8b8f5f..0000000 Binary files a/tests/__pycache__/test_extract_links.cpython-312-pytest-9.0.2.pyc and /dev/null differ diff --git a/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index 17eb6a1..0000000 Binary files a/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc and /dev/null differ diff --git a/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index be91d29..0000000 Binary files a/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc and /dev/null differ diff --git a/tests/_runner.py b/tests/_runner.py new file mode 100644 index 0000000..6734d10 --- /dev/null +++ b/tests/_runner.py @@ -0,0 +1,37 @@ +"""Trampoline para invocar enrichers desde tests. + +El Python embebido de Windows (`python-embed`) ignora `PYTHONPATH` por +diseno — el control de sys.path lo lleva el fichero `python312._pth`. +Para inyectar el stub `requests` de tests sin tocar ese fichero, los +tests llaman a este runner en vez de a `run.py` directamente: + + python _runner.py + +El runner anade `$_STUB_PATHS` al frente de `sys.path` y ejecuta el +script objetivo como si hubiese sido invocado directamente. 
+""" +from __future__ import annotations + +import os +import runpy +import sys + + +def main() -> int: + stub_paths = os.environ.get("_STUB_PATHS", "") + if stub_paths: + for p in stub_paths.split(os.pathsep): + if p and p not in sys.path: + sys.path.insert(0, p) + + if len(sys.argv) < 2: + sys.stderr.write("usage: _runner.py