From 7a94160fd2f8c4676df3c013ada7c1971e216bc4 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 3 May 2026 14:41:28 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20catch-up=20de=20decisiones=20previas=20?= =?UTF-8?q?(Webpage=E2=86=92Url,=20anti-bot,=20UI=202-col,=20tests=20cross?= =?UTF-8?q?-platform)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bloque de cambios revisados y validados con el usuario en sesiones previas que no habian aterrizado en commits propios. Lista por tema: * enrichers: web_search ahora usa lite.duckduckgo.com como endpoint primario (mas tolerante con bot detection desde IP residencial), con fallback al endpoint html. Detecta pagina captcha y emite error claro si ambos fallan. Anyade _DDGLiteParser para el formato lite + auto-pick de parser por contenido. * enrichers: tipo Webpage unificado en Url (campos de cuerpo cacheado viven en metadata del Url). Manifests actualizados (applies_to: [Url]). fetch_webpage ya no convierte Url->Webpage. * enrichers/manifest: campo `params` parseado a EnricherSpec.params (name, type, default_value, description). UI puede renderizar dialog de configuracion. * jobs: fix de path conversion para Python embebido nativo Windows (no convertir a /mnt/c/... cuando el subproceso es Windows-native; solo cuando es bash o python via WSL). * main.cpp: ventana ImGui (no modal) "Run enricher" con layout 2-col (label izq, input der). Inserta job con JSON tipado. Layout clustering apretado: hijos del mismo anchor en un solo anillo alrededor del padre, sin desperdigar por anillos crecientes. * views: inspector con layout 2-col via BeginTable (Identity, Schema fields, Extras). Description full-width debajo de su label. * tests: portable conftest (auto-detecta REGISTRY_ROOT, PYTHON_BIN, ENRICHERS_DIR para WSL y Windows portable). _runner.py trampoline inyecta stub via sys.path porque embedded Python ignora PYTHONPATH. Tests bash-only (vendor_script, freeze, dispatcher bash, resolver Linux-binary) skipean en Windows. Tests existentes adaptados a Webpage->Url. Resultado actual: 32 passed WSL, 21 passed + 11 skipped Windows. --- enrichers.cpp | 95 +++++- enrichers.h | 18 +- enrichers/extract_domain/manifest.yaml | 2 +- enrichers/extract_links/manifest.yaml | 4 +- enrichers/extract_text_entities/manifest.yaml | 4 +- enrichers/fetch_webpage/manifest.yaml | 4 +- enrichers/fetch_webpage/run.py | 16 +- .../__pycache__/run.cpython-312.pyc | Bin 17486 -> 0 bytes enrichers/web_search/run.py | 205 ++++++++++-- examples/types.yaml | 17 +- jobs.cpp | 42 ++- main.cpp | 293 ++++++++++++++-- .../conftest.cpython-312-pytest-9.0.2.pyc | Bin 11234 -> 0 bytes ...extract_links.cpython-312-pytest-9.0.2.pyc | Bin 9225 -> 0 bytes ...text_entities.cpython-312-pytest-9.0.2.pyc | Bin 9081 -> 0 bytes ...fetch_webpage.cpython-312-pytest-9.0.2.pyc | Bin 10386 -> 0 bytes tests/_runner.py | 37 ++ tests/conftest.py | 98 +++++- tests/test_dispatcher_lang.py | 9 +- tests/test_extract_links.py | 8 +- tests/test_extract_text_entities.py | 10 +- tests/test_fetch_webpage.py | 4 +- tests/test_python_runtime_resolver.py | 6 + tests/test_vendor_script.py | 11 + views.cpp | 315 +++++++++++------- views.h | 16 + 26 files changed, 973 insertions(+), 241 deletions(-) delete mode 100644 enrichers/web_search/__pycache__/run.cpython-312.pyc delete mode 100644 tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc delete mode 100644 tests/__pycache__/test_extract_links.cpython-312-pytest-9.0.2.pyc delete mode 100644 tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc delete mode 100644 tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/_runner.py diff --git a/enrichers.cpp b/enrichers.cpp index 8b23538..9e440d9 100644 --- a/enrichers.cpp +++ b/enrichers.cpp @@ -59,6 +59,57 @@ std::vector parse_inline_list(const std::string& v) { return out; } +// Split por comas a nivel cero, respetando comillas y nesting de [] / {}. +// El YAML inline `{ name: limit, type: int, default: 10 }` puede contener +// strings con comas entre comillas — un split crudo las rompería. +std::vector split_top_level(const std::string& s) { + std::vector out; + std::string cur; + int depth_b = 0, depth_c = 0; + char quote = 0; + for (char c : s) { + if (quote) { + cur.push_back(c); + if (c == quote) quote = 0; + continue; + } + if (c == '"' || c == '\'') { quote = c; cur.push_back(c); continue; } + if (c == '[') ++depth_b; + if (c == ']') --depth_b; + if (c == '{') ++depth_c; + if (c == '}') --depth_c; + if (c == ',' && depth_b == 0 && depth_c == 0) { + out.push_back(cur); + cur.clear(); + continue; + } + cur.push_back(c); + } + if (!cur.empty()) out.push_back(cur); + return out; +} + +// Parsea un objeto YAML inline `{ name: x, type: int, default: 10 }` a un +// EnricherParam. Retorna true si al menos `name` se resolvio. +bool parse_inline_param(const std::string& v, EnricherParam* out) { + std::string s = strip(v); + if (s.size() < 2 || s.front() != '{' || s.back() != '}') return false; + s = s.substr(1, s.size() - 2); + for (auto& kv : split_top_level(s)) { + size_t colon = kv.find(':'); + if (colon == std::string::npos) continue; + std::string k = strip(kv.substr(0, colon)); + std::string val = strip_quotes(strip(kv.substr(colon + 1))); + if (k == "name") out->name = val; + else if (k == "type") out->type = lower(val); + else if (k == "default") out->default_value = val; + else if (k == "description") out->description = val; + else if (k == "desc") out->description = val; + } + if (out->type.empty()) out->type = "string"; + return !out->name.empty(); +} + // Manifest YAML soportado (subset): // id: fetch_webpage // name: "Fetch web page" @@ -66,16 +117,19 @@ std::vector parse_inline_list(const std::string& v) { // applies_to: [Webpage, Url] // lang: python <- issue 0033: go|python|bash (default python) // exec: run <- basename del binario/script (default "run") -// params: <- v1 ignora bloque -// - { name: timeout_s, ... } +// params: +// - { name: timeout_s, type: int, default: 15 } +// - { name: region, type: string, default: "" } // -// Las claves anidadas bajo `params:` (y otros bloques con valor vacio -// seguido de lineas indentadas) se ignoran. +// Solo el bloque `params:` se parsea con detalle. Otros bloques con valor +// vacio seguido de lineas indentadas (`emits:`, `relations:`, +// `uses_functions:`) se ignoran como antes. bool parse_manifest(const std::string& path, EnricherSpec* out) { std::ifstream f(path); if (!f) return false; std::string line; - bool in_skip_block = false; + bool in_skip_block = false; + bool in_params_block = false; while (std::getline(f, line)) { // Strip CR de Windows. if (!line.empty() && line.back() == '\r') line.pop_back(); @@ -84,10 +138,27 @@ bool parse_manifest(const std::string& path, EnricherSpec* out) { std::string trim = strip(line); if (trim.empty() || trim.front() == '#') continue; - // Si la linea NO empieza con whitespace, salimos del bloque skip. + // Si la linea NO empieza con whitespace, salimos de los bloques + // anidados — el siguiente top-level reinicia el contexto. bool indented = !line.empty() && std::isspace((unsigned char)line.front()); - if (!indented) in_skip_block = false; + if (!indented) { + in_skip_block = false; + in_params_block = false; + } if (in_skip_block) continue; + if (in_params_block) { + // Linea esperada: ` - { name: x, type: int, default: 10 }`. + // Tolera variaciones de indent y comilla. + std::string body = trim; + if (!body.empty() && body.front() == '-') { + body = strip(body.substr(1)); + } + EnricherParam p; + if (parse_inline_param(body, &p)) { + out->params.push_back(std::move(p)); + } + continue; + } size_t colon = trim.find(':'); if (colon == std::string::npos) continue; @@ -101,9 +172,17 @@ bool parse_manifest(const std::string& path, EnricherSpec* out) { else if (key == "applies_to") out->applies_to = parse_inline_list(val); else if (key == "lang") out->lang = lower(strip_quotes(val)); else if (key == "exec") out->exec_basename = strip_quotes(val); - else if (key == "params" && val.empty()) in_skip_block = true; + else if (key == "params") { + // `params: []` — vacio explicito, nada que hacer. + // `params:` — siguiente bloque indentado son items. + std::string vs = strip(val); + if (vs.empty()) in_params_block = true; + // Si fuese inline (`params: [{...}]`) — formato no usado en + // los manifests actuales, lo ignoramos. + } else if (key == "emits" && val.empty()) in_skip_block = true; else if (key == "relations" && val.empty()) in_skip_block = true; + else if (key == "uses_functions" && val.empty()) in_skip_block = true; } // Defaults — preservan retrocompat con manifests existentes que no diff --git a/enrichers.h b/enrichers.h index a082ad5..26bac80 100644 --- a/enrichers.h +++ b/enrichers.h @@ -10,11 +10,22 @@ // `enrichers_for_type(type_ref)` para mostrar el submenu filtrado por tipo // del nodo right-clickado. // -// Para v1 no parseamos `params` con detalle — solo lo necesario para -// presentar el item de menu y submitear el job con `{}`. +// Los parametros declarados en `params:` del manifest se parsean para que +// la UI pueda renderizar un dialog de configuracion antes de lanzar el +// job. Si la lista esta vacia, el job se submitea directamente con `{}`. namespace ge { +// Parametro declarado en `manifest.yaml` -> entrada `{ name, type, default }`. +// La UI de configuracion edita un buffer string por param y lo serializa a +// JSON segun el `type` al pulsar Run. +struct EnricherParam { + std::string name; // ej: "limit" + std::string type; // "int" | "float" | "string" | "bool" + std::string default_value; // valor por defecto en formato texto + std::string description; // opcional, para tooltip +}; + struct EnricherSpec { std::string id; // ej: "fetch_webpage" std::string name; // ej: "Fetch web page" @@ -34,6 +45,9 @@ struct EnricherSpec { // /{.exe} segun la plataforma. Default "run". std::string exec_basename; + // Parametros editables por el usuario antes de lanzar el job. + std::vector params; + // True si lang != "" y no se pudo resolver el ejecutable // correspondiente (ej: enricher Go sin compilar). El loader deja // el spec en el registro pero marcado como deshabilitado para diff --git a/enrichers/extract_domain/manifest.yaml b/enrichers/extract_domain/manifest.yaml index 7a14e75..ca568c9 100644 --- a/enrichers/extract_domain/manifest.yaml +++ b/enrichers/extract_domain/manifest.yaml @@ -1,7 +1,7 @@ id: extract_domain name: "Extract domain" description: "Saca el dominio de la url/email del nodo y crea/conecta una entidad Domain con relacion BELONGS_TO. No descarga nada." -applies_to: [Url, Webpage, Email] +applies_to: [Url, Email] emits: [Domain] relations: [BELONGS_TO] params: [] diff --git a/enrichers/extract_links/manifest.yaml b/enrichers/extract_links/manifest.yaml index e27e065..9a93a2b 100644 --- a/enrichers/extract_links/manifest.yaml +++ b/enrichers/extract_links/manifest.yaml @@ -1,7 +1,7 @@ id: extract_links name: "Extract links" -description: "Lee la markdown cacheada de un Webpage (metadata.markdown_path) y crea nodos Url para cada enlace encontrado, conectados con relacion LINKS_TO. Requiere haber ejecutado fetch_webpage antes." -applies_to: [Webpage] +description: "Lee la markdown cacheada del nodo Url (metadata.markdown_path) y crea nodos Url para cada enlace encontrado, conectados con relacion LINKS_TO. Requiere haber ejecutado fetch_webpage antes." +applies_to: [Url] emits: [Url] relations: [LINKS_TO] uses_functions: diff --git a/enrichers/extract_text_entities/manifest.yaml b/enrichers/extract_text_entities/manifest.yaml index e974411..8f4476b 100644 --- a/enrichers/extract_text_entities/manifest.yaml +++ b/enrichers/extract_text_entities/manifest.yaml @@ -1,7 +1,7 @@ id: extract_text_entities name: "Extract entities from text" -description: "Lee la markdown cacheada de un Webpage y extrae IoCs (IPs, emails, dominios, hashes, crypto wallets, CVEs, MAC, telefonos) creando entidades + relacion EXTRACTED_FROM. Sin coste — solo regex. Modelos ML (GLiNER/GLiREL) en futura iteracion." -applies_to: [Webpage] +description: "Lee la markdown cacheada de un Url y extrae IoCs (IPs, emails, dominios, hashes, crypto wallets, CVEs, MAC, telefonos) creando entidades + relacion EXTRACTED_FROM. Sin coste — solo regex. Modelos ML (GLiNER/GLiREL) en futura iteracion." +applies_to: [Url] emits: [Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone] relations: [EXTRACTED_FROM] uses_functions: diff --git a/enrichers/fetch_webpage/manifest.yaml b/enrichers/fetch_webpage/manifest.yaml index b967f0c..72b24dc 100644 --- a/enrichers/fetch_webpage/manifest.yaml +++ b/enrichers/fetch_webpage/manifest.yaml @@ -1,7 +1,7 @@ id: fetch_webpage name: "Fetch web page" -description: "Descarga HTML de una URL, extrae markdown limpio (readabilipy) y guarda los blobs en cache. Crea/actualiza el nodo Webpage con title/status_code/paths y crea el Domain con relacion BELONGS_TO." -applies_to: [Url, Webpage] +description: "Descarga HTML de una URL, extrae markdown limpio (readabilipy) y guarda los blobs en cache. Actualiza el nodo Url con title/status_code/paths/markdown en metadata y crea el Domain con relacion BELONGS_TO." +applies_to: [Url] emits: [Domain] relations: [BELONGS_TO] uses_functions: diff --git a/enrichers/fetch_webpage/run.py b/enrichers/fetch_webpage/run.py index 6d065e5..ca88f3f 100755 --- a/enrichers/fetch_webpage/run.py +++ b/enrichers/fetch_webpage/run.py @@ -3,7 +3,12 @@ Lee JSON de stdin, descarga la URL del nodo, convierte HTML a markdown, guarda blobs en `//.{html,md}`, actualiza el -nodo a tipo Webpage con metadata enriquecida y crea/conecta el Domain. +nodo (deja type_ref=Url) con metadata enriquecida y crea/conecta el Domain. + +Nota: historicamente fetch_webpage convertia Url -> Webpage, pero esos +dos tipos se han unificado en Url. Los campos de cuerpo cacheado +(html_path, markdown_path, status_code, fetched_at, text_length, ...) +viven en metadata. Wire protocol (issue 0026): - stdin: JSON con node_id, metadata, ops_db_path, app_dir, cache_dir, @@ -289,7 +294,14 @@ def main() -> int: log(f"node {node_id} disappeared") return 6 cur_type, cur_meta = row[0], row[1] or "{}" - new_type = "Webpage" if cur_type.lower() == "url" else cur_type or "Webpage" + # Webpage fue un tipo separado historicamente. Hoy se unifica en + # Url (mismo tipo, los campos de cuerpo cacheado viven en + # metadata): si el nodo entrante es Url o el legacy Webpage, lo + # dejamos como Url; si el nodo no tiene tipo, default Url. + if not cur_type or cur_type.lower() in ("url", "webpage"): + new_type = "Url" + else: + new_type = cur_type patch = { "url": url, diff --git a/enrichers/web_search/__pycache__/run.cpython-312.pyc b/enrichers/web_search/__pycache__/run.cpython-312.pyc deleted file mode 100644 index cef5a499dd332007bc89ee04c4c44f393330302a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17486 zcmc(G3ve6fo!>67cmo7T@U7QU5=B9xNK3LMi`K)GNIfixazx5LGxk9cb}51a0cv+4 zi3k|et?x`maz(|-2|7C``jVdMvuBdav{R?kUK1y6bIoOj6zK|H^Cr$+Iz4x$Q#w@U zJkCs~zyG&b0HUBc)9Fl?#JAsm`~Tkm_xInQ`Fw5;*YW@To7iwW$Nd96n3v7S@XM!p zj=RZ8Tt6rAk|oX$Tly_L<*jjRzm?sEeu3R>{Wf;D_uJXs(eGe)XTOu(UHvZHg}8gz z-R~au^m|x4wzzlL*Y9I_d%R-U-|rs|^aqA3`zujaE7eJkmu>x3QoZDSS?CW+4U!A@ zYN=6j<6a}Jl03NAN~aYA$T%IHlgg$5q$-|K;%#pCvoS>$N0g+Nj3(ovtZI>j6j8*@F;z{;;v)udRWLlmFtJ=G&dlL=}?4#%WcBa?^>%dO(DtVN`V7HJieBWhSW7aoadgRNp@ zWF#!blvXrUyrL0|ofV6Ud_JaX%4k?gCbd>^B%(xy)i$$RSy4K~cq}1DRPpT7r%xR@ z-QC;Uv3DSzjA;ACy%_R&dH-2fOB|48>0BgwL6l_i%&|ff$&}V1rVl8hxZF{MVNWQRDY zX(MV!d;6d^9B-4b=JY$CY>Os`Sy}smi~IIoOv%dVes`CADJ91*$;PJ8nrpFWvQ<=L z3Gu1OrATj7iH&G&;tA}MoQOxFvMOGR$qDSJoS>>2d*N1OEUFxh#xQ}i?d>aijkh02 zNz(a!d*uYpMZyxHe{nD}%I=bk(%3*O8lfpYjU^W&DJ?mSv7~_jz#%|OSn}bB8oPu$ z^^PS=X(=TsZdPTLx*d@f6!2yvbaOU zdMQZ^GltGmG?_g9o-HD9I4X^tb<;qxEr5>81bJvsNe;(S!`P=`pej})B4QQ$lF7I# zKAp$jHnAHc0S;rF?tw@=&JaP34UeFaw5%kz#S#Nbq|N;aQBsK4J&{BrNsFIMs68Qz z?gZ+}T5MR>os_1N2^mmsA|0%cDk~wY?gVnGa`>XEJ5q^@siX$rloDs9Q42MkzZ|%O zM)~D`fy_-#Gt0T5QmB0CP@Z8HfRK^+3@2G$7bI(jf1Q(rZ`q1W;0s!#zE@l(BLM^9 zLbjf?t3XI;Yuk#3^8zVQB<{*ln zKi@u>9G2VTsc1xPA4r5vLT(2~8Iq%#+MWbbYvJ=sWMr@%Y(Q;i*|2 zOwEjAMwzYIu^{Zc599-(b{Q49Xr7Bwo)hq68}u?V|2G?y;jk_Iw=Dn9w#Cw;crg~m zhXjac9ahh)G-6SyMygxmSd9kcTJTf1AX%{zS?g3PdvL0MK@e9?U#SIn6u-iSsTdU?V-Xwx3dhu>vJQZ)_^C}uun?Z$^}UmO z-*Ddc-tx|Sn(unr<~(h)?T6+)U9&>hgY%~@+4dhqrgZ)qtqi-$t%ONO5}(FU@0(lf zA5jxjVg<(Q0*zc*i*`x|Z>PZztI7rxKZ2iHkA%ivaZS5EIyrjvE7`s|f785Y{j9Lw z7%~mhyb3Jk8`vypRbUV?eoV+%6kEoU5hTlAKuT6gkZh9u4eOXKW6jtuaf&m893c*1 z91O-Oxg@vb0gT=<(oJI6dcXW=rgj9@O|P_WpinKKuA%nMjIM&ixb z->owBvN0b@JsBUy7JMhqR3GEUDl!#A#4lD{W?MsbD_-DHs+MZrtu@<-S|lI+na?~I zwLHhs?6z|%e;NAgbI`*m9razYKF3|=FLTB{R2ON&)^y3xzm)UPHX_hU&O^yCBs~-g z(lg@Fks!eF)K$>WAc9FV>V^ttDi&wUs1j$PY@v;dshAA4?Wta9bZBt|T9+74s;mh} zBB)}JoKTi7#LtSGi60HV5n%@c8J4J z85D@aP@8xxF$z{5fj}nlZs=JgprH+{H^J7UMW}i47`*_?&krS`K@3Mo?#s$Bv=sg=EN3Bjon& zJ01~xBLfj7mIqF^Yw{J1>1nMHQgJ9nq!hGYp)J^YWjOxC#eI*o?P=Z8zJ=u;Pglr^ zZD)F0pTxtNVj)Jqw`a$cLtfoU=!6DTT^LEC9f6iY_Z%A@Nh(@5X}r1@O~+&B+L)4~ z5aZG-@IV8j@~jYNj0W>o)YFjI#0Qb^|AMfpAf`Gn7@q@bMHL zj*PIeDefvQH@<2B zEIE1K6H9h3*v02oZT-~e@Y~0aECm9#PW~fjUCvv5{p94yoVPt!S(mF`h5scN=WAT% zY(Dn{zvSftmDev#UYhWIw&&bJ{`zd~ykETQZ<+JA%=<$V!iN<# zv$cDEDCE{Q-?Q@mmI>=GebrOF(=8wPw#-PmVBO6#^ zA6o^a<#nWUOyXp~|3)EohVZoVk_vi|Vu#Agr2#O;C2Ai6UEuqQO|qHLF0LqiGX>!~ z#%K6)@@A-vnk(7g$&XDrf?Cd%qXe%PnAQEb<}u#^j?}Idm)XvkC1V*P99wZo@J^iP zO&K9MNxqI*Ggif;Rhpm~tJz8~mthh_t1{m~J_;m5sJtLQT#-#oewd~i@z_B{9$>qslJ+zcBK=3_h}o;iI25&%Xlth2Ms+&h~WC5Mv=Y&l2)P<8BFTvo}9r|Y4HBI8QU zJ3HIj+BPXEZ6EGt71Ne10m;8IS($t#LUtCjcTo}jY*r!)j9OTTNii8sDG*p>X-5*N z1T0onOp4JIoDE44=5)R>Hok`>5PasuLkwK!(w6r2wA~m%+S(p(PuH5mx!)E}SKR-s z3$#>>8;9Jw8zzmSsh4Bgph6m~ZcoTsJP8dHvX~WvHNuaf5WXq<3Br0F5|)rlcXeMO zR!7!mB(4-DXypuE{~>3LRb=N(Hq*)`i1JZ<0G{%>~u)vou>eE+~=Wka^FKWdn$S$sTutl5;Sc>ai*;+WmtK2uHn0iY&Knp%{@Fbn=dJlQ$J=~4 zr#I*Ej~`#M;o)O_tp}U9_nTY?+pO*DTDbCBCBIDbddM3Cl ze7_eahF^DwflyB~dp-3fP*e96O^Mv6d2Nm)qJv3A+$ySx*a);IXlEddvkA?^*4QmK(m{f_7*>WnfU!T`BFQ25|ZLo7)cWUZb3Hi+<9=ka9&%-J9^6F2c94|!bC6ZVowFf z2&-4HF4$9n|HGvr^z=ck34p_33R)PRCor4=^>D}umqiMb(4FCMED_VfVbX(WdeC^n z;7hs-H#F5!RT}PCLMFm`k9Th&iVysdu~JXgmBH9YoePg*RU41irgx3LRfIt z=bBsa=+Cu4Gx)&SV3>x5MMQ(NOl%uQkh#fa_%7}RIG3LX%$NmY38Y41Pvx(9GghSj zqP?H7mNWZIbLpMDIE}*W>E(nHKY?9gc}pse1%UaT=$y@uNA4kY8;N7W6JT@#%X7jyo=)RC*1Kj#a3&=}}9_+M|34^^{|j zFta{}+=|)Q7)wtFO1GYv{MQL|1CkZFCkUz^6$#wn`9}6WsFt zb$MEQ>%});oL{&5?z(+*>-NpBJ1}4S~$?j;nkj;ukx;M%^lyG zhc;Wk>bCQibGquSx;N`)s_%w6=0Y9wO;6ly>YQupoNqcbU*C0q!(ubh)@s=M6>A@) zE8^yiuD*i|L=Q)dWaXtigd>)tmrTK{PUksjuduVlR#|&xDCLXW8|B{Sd*GEXutVJr zuNZde!QCcT(|x<`$?rCKuhO*-?lqPFE2^{)3GH-Ye@{)Gy6as(=UqSj=m*}-rF(SO zyJpV2=B{_+oOk1N-v{0;SbKAs3ql9Y6_)ug@$k@PezN@Xms5g@hZjmSY6)m|#q|&d zwE~JBO~M5!V#7*V4vs~bZ^uD3h?Z&LMxFa{V$2 z?$fXs+{|R8+VhafqGJg&sQL6_{B6e03b> z*k9ICFzvn!Pc%;BHAT#b1oFZQ${$g$PN=r@?uC3E6Cr8dVXA+3RJzCw$j z`|{RDRF22PVc>~`Qidl_8g|ID1WyUsW(+evdzt&xVz)Ve6L8qJELRD(nq{wGt6KJY zZ5x(Xdu`3js|DNEUwf^#hJSKfZS|jWNZ7KmcJ8zM$W_9a8Uk2AZAF8^-1srx1w*79 zT@`(MP!jlZ49RSR?8Px_(N|}dj|m#k$hgqL?+}zQgua>A@@-@G65pJ?+=-J^@Yr49 zRf<$9s0wsGrc~mLtj6rOc$W%C^DwO*&p0vyRBry-zKl@rv?=2#@&XuNmhwRFEB8CBaM%A-W7bse(<>a15dm<#j;(M*{s6 z$vF2bXVq2jc;}Kuu(jlzuIu(m`^06q1dw%1Irs<~X;H#c3_}@R zML|yPe6WO*hO=I>5cTDW#5yLF9zc{6_r>GtF7IU=6 zBjRvO9Y&-Ty-E~f^V-t8db>|_AL%S0D^M&8;f>5Z-rJWbQmTZ3;7&ZNGbd@ne^X>Y#j*G>X|13?@cdIO)`b8&$N zc*0#ZA9|~Stgf2R)k+ND&6>_;%a58}o!f0cdW1(BEmg`1K^U6u1$vfB%M2u-i8PQ; zfHAIuq;q7zZxP8MWDp;laY&Xo9Ai$c;7}>2`e&Sy6-I8};WOsS*p)gcZv_r+_Q6~= zE|N9_RudW$+1?8$&4V%*(N2(Tlon;L<~Q3ank1%lgM7gXKvLMp+(wKFMV&Nq2_eH9 zxiQZ}M&-%45!pc_a}3x@M&^dI(VOl*L{Y4zm|r4H)D-Z#P3(<{qmVg_A`k>XQ4@&N zWbqOM@CQXBZMwU7C=_xSI8Ej+lN!nm%I!x&2igeXK$zb^G6npLXouhCb+4RIQ;Hmp zsL@y~z5C45U7da1Me^zG?kiB%z5}hIVLVgieFyTi6_fTI&;`&{Sm(pB=|ASVo=<*? z{?e1jdV0H0_ld`P`c4&F+{_|vS&*$sl@OK5loGu!?3M-iYBKdy+G|LBrt`#^?p|>- z27`Y_&$yY+|BhntT?+#sun-&oL}Dz|O8GNd3!9W09#NHVQRxOMutv_ojgBzS!H`$^ zb1Ek3R+O_P>*!>g5lfucEt;zQ1=V&jiXS4jfipM6bxFEF+OSG47=!dn89*hN2|4#m zSilbexP+hjb0p&&xqsJXwdr1*p^qP3bOo+9j34|sSepyh=YowmHD{T1x$4?a{Z6la zyz5>SS6Oq@eZ!r7bVgnXJUV{tLkDE4_qrG10N(DLuQqGR9))naXS2H8kgQZL+2;MTz5|FYXk0h(*i`k@hRG+U zduOU=HcUS^-jx%aubg=4#FEA0IslJ^I{<*u>Gjj+W}TsXE}PAcI00|f_({Va6EH3F zB6h-9$kWI?T+$irqDasP5EIHN;U1<^k`RT-a_114WJ-v9WoK9Y73jp7jEY zJA1lBEqVVl?Hyj7ICk<_pST@15FrW(hvG2E)Uq)Y(H*QMY_<&z#3bcCKq=owq7qkQ zJ8q~%@Z1_hK)fhKo2vXZ-XpHxJQT=t{3Q$TTJy0hFx8x`o^`HWb_gDD|0OrH4T=f8 zx_`QQ*3&#I9T(xjG~t;n=jDUPk^=ikC}3v?>_zrCYKLe1FcZ@h8xs#J-^D1>7fKn5 zxuPmU{3&*yag>ssg&J;dYODBraub86FqY95va?NMJOc*q>FyKZL#NL4^mU&$7zWgX z%ZNQ9!~iJc82=7=<=<1n5aC}VM<#k4sPUiNqbAd%6Fx5FH)Qq2iTiqnzH!|78Oy`XB-J!RK-igor;jC}(_~8PD@~(X! zyZoTf%hR@LWoE~$v+bTk0EGgx0tS7e)528{@&5yqT6z##?&Rfn8NFlpyrhUj<-FZR z1DXZL6ohbLEI3Lv#jt4t;uMfC8^}^HUQNg$QfF3NCT4=0ly7B)X&U38As0eAp&?TU zGrLTDRcgO*mW7dl7RvYffT25vP}Q=tbl$%n7@tEU8-rai>&vYn)D?5IW^Dw&mWTD2 z5Wm+dLbWN|qjyCszTdeQ?k}7{h2(!=1|9?{Su%FjTU6tufY~qn$rjixYuq#tx)&d7#`>g_n!NXY` z={iX)>9N_YJG9Bn74Mk>&5Zdo{-IFujhU00%pT#2svisBR|%(D1uWR(uwbjSttMo~ zpQ)18(=g25Nv@9twQUbV0CEFC;=VC|Z)SJzyPMu2Ed$qBbtb6VG6AWDq}o_OYd6YK z%b%%+y}H?4Z&{(hb0Wpp)65sLSD%>3O*qwmhbSF2XqJd1kqpG zXM}Lkv4t#&X=gq{OU;9Rg0WHH`Y6HHJ$<_K$jMG|AgRc)^NH{Uc~sqZ>hR%mrWsW7 zhZ$^!9vSHE?mT_yC=|KgGbj4Orw*UT0C6qBc$J{jD{tcgt&pj#{3RvNQ!+=%9VCa9 zcaeq~6>`zj*JJpKRM|^!$)HqjQ$qHP;d4EO90B3`M`fBy>HI_?w#_~PV{zQNZA76D z3IB>p$w{hw6N&CnFCyef-l;ptw}cNC3@eoxus$XlDg`Jx3VBdT&bSKz8 zb9(%^*~sJb!HsucG=`vAf=$Id9Lr_v!KO4?O`^wQIrIjjF#Ayc2y> z9Sg$RC7g*>E>^9|cF*j2@0p)&T&OxV;kZ|UxP1WZSrFDNR;=<_Qx%K;weW{L zGxO~1uA?9LkL3dErnk=pHs0_rt>UU0mR55JDfUcyuK7NO(*qvA?8-KEUsFcz5M##+^SG+)s{~^_F8gPAmq+x_pG|d*=?(rDmbt2vwP23 zn0w!XB_;R%#_2CjADZ=UUT}sM{ej;u+3@1G_p0+wRGcR?Ts=x|)W+Z5Yh@J)y{V#z z?>eos-i-^+W*p&OlL+Czvvd05w|9N7JJ-7HhfSYa`NnR3ndAM3`Fn849^rpQ9k|fI zZ*U2^3zisF^S#SM$y$$)Fi&ZCg;B#Oc=YnS0@#`^k z0-Khd{NR&We4Vb40c7y7Kcd_p6NI49SAQ!@E$kfGJUeF4#P&jNgU>KN$V3ifhKv=l zvM29vi^|Z$m&%qhX2ohz`)M~j@g}$Y3wcxg^Qt@(Z98P}P=vJ`@}ZFp zH|mXv5eCMbs4PcOM8-sKP>v0qvCTqr#)RD^?J#cud(T_~EXVu;g*rMu0ZPV0>y>M0 zln&I{H|aP~SHLehcq8fbjHHWhNvgU9Um1Tyy^{b^)>A?<*7(rW!@jXI&K4EYe3dcg$|Q2S0EwWj{4CVD z*9ZyNul!HI@c60!9SMkn=RXxHdEqx3Iex=GaGsCo-^KnbKH}VX`~}zYcij44aF5P$ zkKVIe__}2`OL!iH;pF)Z%gz?QlV9%R8~DJz$B*)S{Z!*Uj_%9PSn7HI@-81ASQ6Lr zRk?NRm#lQ(vdy^f+GE`JcP>-;(m~$AZ^AcHOIEtC-?U8kB@lAuxOd5pX4pHr(>vUk q#ix1Rw=`hkz*5q8tPMHAJAUYuqc0tu?7RBR2Let{IH8d_QvMI^%Yk12 diff --git a/enrichers/web_search/run.py b/enrichers/web_search/run.py index b09b15d..3f4cbca 100755 --- a/enrichers/web_search/run.py +++ b/enrichers/web_search/run.py @@ -8,14 +8,20 @@ Wire protocol estandar (issue 0026): - stdout: una linea JSON al final con resumen. - exit code 0 = ok, !=0 = error. -DDG endpoint usado: https://html.duckduckgo.com/html/?q= -Devuelve HTML estatico, sin JavaScript. Los enlaces vienen envueltos en -redireccion `//duckduckgo.com/l/?uddg=` que hay que decodificar. +DDG endpoints usados: + 1. https://lite.duckduckgo.com/lite/ (POST) — endpoint primario. + HTML minimo (ano 2009-style), tabla con `` y + ``. Es el menos agresivo con bot + detection; suele responder 200 cuando el endpoint `html.` ya + devuelve un challenge "anomaly" desde IPs residenciales/Windows. + 2. https://html.duckduckgo.com/html/ (POST) — fallback. Su parser + usa `result__a` / `result__snippet`. DDG envuelve los enlaces en + `//duckduckgo.com/l/?uddg=` que hay que decodificar. -Para automatizar busquedas masivas en el futuro (sesion persistente, -cookies, JS, captchas) la fase 2 introducira un enricher `web_search_cdp` -que controle un Chromium remoto via DevTools Protocol. Este es el -fallback simple zero-infra. +Si ambos endpoints devuelven la pagina anti-bot ("anomaly", challenge +captcha), el enricher emite un error claro indicando que se necesita +`web_search_cdp` (issue 0029) — el fallback simple zero-infra no puede +resolver el challenge. """ from __future__ import annotations @@ -49,13 +55,33 @@ def now_ms() -> int: return int(time.time() * 1000) -def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str: - """Descarga la pagina HTML de resultados de DuckDuckGo. +def _ddg_post(url: str, params: dict, headers: dict, timeout: int) -> str: + try: + import requests # type: ignore + r = requests.post(url, data=params, headers=headers, timeout=timeout) + return r.text + except ImportError: + from urllib.parse import urlencode + from urllib.request import Request, urlopen + body = urlencode(params).encode() + req = Request(url, data=body, headers=headers) + with urlopen(req, timeout=timeout) as resp: # type: ignore + return resp.read().decode("utf-8", errors="replace") - El endpoint `html.duckduckgo.com` no requiere JS y respeta los - parametros `kl` (region) y `kp` (safe search: 1 strict, -1 off, - -2 moderate). Inyecta cookie para que el "moderate" se aplique sin - pantalla intermedia. + +def is_anomaly_page(htmltxt: str) -> bool: + """Detecta la pagina anti-bot de DDG (challenge captcha).""" + s = htmltxt.lower() + return "anomaly" in s and "challenge" in s + + +def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> tuple[str, str]: + """Descarga la pagina de resultados de DuckDuckGo. + + Intenta primero `lite.duckduckgo.com/lite/` (HTML minimo, ano-2009 + style, mucho menos agresivo con bot detection que `html.`). Si + ese endpoint devuelve la pagina anti-bot, cae al endpoint `html.`. + Devuelve `(html, source)` donde source ∈ {"lite", "html"}. """ params = {"q": query} if region: @@ -66,29 +92,22 @@ def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str: headers = { "User-Agent": ( - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/120 Safari/537.36" ), "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.7", } - try: - import requests # type: ignore - r = requests.post( - "https://html.duckduckgo.com/html/", - data=params, - headers=headers, - timeout=timeout, - ) - return r.text - except ImportError: - from urllib.parse import urlencode - from urllib.request import Request, urlopen - body = urlencode(params).encode() - req = Request("https://html.duckduckgo.com/html/", data=body, - headers=headers) - with urlopen(req, timeout=timeout) as resp: # type: ignore - return resp.read().decode("utf-8", errors="replace") + + htmltxt = _ddg_post("https://lite.duckduckgo.com/lite/", params, + headers, timeout) + if not is_anomaly_page(htmltxt): + return htmltxt, "lite" + + log("lite endpoint devolvio challenge — fallback a html endpoint") + htmltxt = _ddg_post("https://html.duckduckgo.com/html/", params, + headers, timeout) + return htmltxt, "html" def decode_ddg_href(href: str) -> str: @@ -195,7 +214,7 @@ class _DDGParser(HTMLParser): def parse_ddg_html(htmltxt: str) -> list[dict]: - """Parsea el HTML de DDG y devuelve [{url, title, snippet, rank}].""" + """Parsea el HTML del endpoint `html.duckduckgo.com`.""" p = _DDGParser() try: p.feed(htmltxt) @@ -221,6 +240,100 @@ def parse_ddg_html(htmltxt: str) -> list[dict]: return out +class _DDGLiteParser(HTMLParser): + """Parser para `lite.duckduckgo.com/lite/`. + + Estructura tipica: + title + ... + snippet text + Los snippets vienen DESPUES del enlace (no hijo del mismo elemento), + asi que parea por orden: cada `result-link` consume el siguiente + `result-snippet`. + """ + + def __init__(self) -> None: + super().__init__(convert_charrefs=True) + self.results: list[dict] = [] + self._in_link = False + self._in_snippet = False + self._cur_href = "" + self._title_buf: list[str] = [] + self._snippet_buf: list[str] = [] + self._pending_snippet_for: int | None = None + + def _attrs_dict(self, attrs): + return {k: (v or "") for k, v in attrs} + + def handle_starttag(self, tag: str, attrs): + a = self._attrs_dict(attrs) + cls = a.get("class", "") + if tag == "a" and "result-link" in cls: + href = a.get("href", "") + self._in_link = True + self._cur_href = href + self._title_buf = [] + elif tag == "td" and "result-snippet" in cls: + self._in_snippet = True + self._snippet_buf = [] + + def handle_endtag(self, tag: str): + if self._in_link and tag == "a": + title = " ".join("".join(self._title_buf).split()) + self.results.append({ + "href": self._cur_href, + "title": title, + "snippet": "", + }) + self._pending_snippet_for = len(self.results) - 1 + self._in_link = False + elif self._in_snippet and tag == "td": + snippet = " ".join("".join(self._snippet_buf).split()) + if self._pending_snippet_for is not None: + self.results[self._pending_snippet_for]["snippet"] = snippet + self._pending_snippet_for = None + self._in_snippet = False + + def handle_data(self, data: str): + if self._in_link: + self._title_buf.append(data) + elif self._in_snippet: + self._snippet_buf.append(data) + + +def parse_ddg_lite(htmltxt: str) -> list[dict]: + """Parsea el HTML del endpoint `lite.duckduckgo.com/lite/`.""" + p = _DDGLiteParser() + try: + p.feed(htmltxt) + p.close() + except Exception as e: + log(f"DDG lite parser failed: {e}") + + out: list[dict] = [] + seen: set[str] = set() + for r in p.results: + href = r.get("href") or "" + # lite envia URLs absolutas directas; aun asi pasamos por + # decode_ddg_href por si en algun caso DDG envuelve. + url = decode_ddg_href(href) + if not url or not url.startswith(("http://", "https://")): + continue + # Excluir auto-promociones de DDG (paginas de ayuda). + if "duckduckgo.com/duckduckgo-help-pages/" in url: + continue + if url in seen: + continue + seen.add(url) + out.append({ + "url": url, + "title": r.get("title") or "", + "snippet": r.get("snippet") or "", + "rank": len(out) + 1, + }) + return out + + def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None: """Busca un nodo Url existente con la misma url en metadata.""" cur = conn.execute( @@ -384,18 +497,40 @@ def main() -> int: progress(0.10, "fetching") try: - htmltxt = fetch_ddg(query, timeout=timeout_s, region=region, safe=safe) + htmltxt, source = fetch_ddg(query, timeout=timeout_s, + region=region, safe=safe) except Exception as e: log(f"DDG fetch failed: {e}") print(json.dumps({"error": str(e), "query": query, "entities_added": 0, "relations_added": 0})) return 4 + if is_anomaly_page(htmltxt): + log("DDG devolvio challenge captcha en ambos endpoints — " + "usar web_search_cdp (issue 0029) para resolver") + print(json.dumps({ + "error": "DDG bot challenge — captcha required", + "query": query, + "engine": "duckduckgo", + "source": source, + "results": 0, + "entities_added": 0, + "relations_added": 0, + }, ensure_ascii=False)) + return 4 + progress(0.55, "parsing") - results = parse_ddg_html(htmltxt) + # El parser se elige por contenido — si el endpoint y el markup no + # coinciden (tests con stub que sirve cualquier URL, o un cambio + # futuro de DDG), aun extraemos resultados. Probamos ambos y nos + # quedamos con el que devuelva mas. + results_lite = parse_ddg_lite(htmltxt) if "result-link" in htmltxt else [] + results_html = parse_ddg_html(htmltxt) if "result__a" in htmltxt else [] + results = results_lite if len(results_lite) >= len(results_html) else results_html if limit > 0: results = results[:limit] - log(f"DDG returned {len(results)} results") + log(f"DDG ({source}) returned {len(results)} results " + f"(lite_parsed={len(results_lite)} html_parsed={len(results_html)})") progress(0.80, "applying") conn = sqlite3.connect(ops_db_path) diff --git a/examples/types.yaml b/examples/types.yaml index 7e4f882..5b1ca3f 100644 --- a/examples/types.yaml +++ b/examples/types.yaml @@ -101,25 +101,18 @@ entities: - { name: country, type: string } - { name: postcode, type: string } + # Url — unifica el viejo Url (solo metadata) y Webpage (cuerpo + # cacheado). Tras fetch_webpage, los campos `*_path`, `status_code`, + # `fetched_at`, `text_length`, etc. tienen valor; sin haber corrido + # fetch siguen vacios pero el nodo sigue siendo un Url valido. - name: Url color: "#89E0FC" icon: ti-link principal_field: url - fields: - - { name: url, type: url, required: true } - - { name: title, type: string } - - { name: domain, type: string } - - # Documento web descargado. Issue 0027: tipo separado de Url para nodos - # con cuerpo cacheado (HTML+markdown+screenshot). Los enrichers - # fetch_webpage / extract_links / extract_text_entities lo pueblan. - - name: Webpage - color: "#89E0FC" - icon: ti-file-text - principal_field: url fields: - { name: url, type: url, required: true } - { name: title, type: string } + - { name: domain, type: string } - { name: status_code, type: int } - { name: content_type, type: string } - { name: fetched_at, type: date } diff --git a/jobs.cpp b/jobs.cpp index 184a764..ba0d841 100644 --- a/jobs.cpp +++ b/jobs.cpp @@ -378,15 +378,20 @@ std::string read_entity_field(const char* db_path, const char* id, return out; } -// JSON entregado al subprocess. Todos los paths se normalizan a WSL en -// Windows; en POSIX los respeta tal cual. +// JSON entregado al subprocess. En Windows, los paths se normalizan a +// forma WSL solo cuando el subprocess corre dentro de WSL (lang=bash, o +// python con runtime registry_venv). Para subprocesses nativos Windows +// (lang=go, o python embedded/FN_PYTHON/system) se mantienen los paths +// Windows-nativos — pasarlos como /mnt/c/... haria que fallen al abrir. +// En POSIX la conversion es no-op y siempre se respetan los paths. std::string build_stdin_json(const std::string& job_id, const std::string& enricher_id, const std::string& node_id, const std::string& params_json, const std::string& ops_db, const std::string& app_dir, - const std::string& registry_root) + const std::string& registry_root, + const std::string& lang) { std::string node_type, node_name, node_metadata = "{}"; if (!node_id.empty()) { @@ -420,10 +425,25 @@ std::string build_stdin_json(const std::string& job_id, std::string app_dir_abs = absify(app_dir); std::string root_abs = absify(registry_root); - std::string ops_db_wsl = to_wsl_path(ops_db_abs); - std::string app_dir_wsl = to_wsl_path(app_dir_abs); - std::string root_wsl = to_wsl_path(root_abs); - std::string cache_dir = app_dir_wsl + "/cache"; + // Decidir si convertir paths a forma WSL. Solo se hace cuando el + // subprocess vive dentro de WSL — si no, los paths /mnt/c/... no + // existen para el proceso Windows-nativo. + bool use_wsl_paths = false; +#ifdef _WIN32 + if (lang == "bash") { + use_wsl_paths = true; + } else if (lang == "python") { + use_wsl_paths = cached_python_runtime().needs_wsl; + } + // lang == "go": siempre nativo Windows. +#else + (void)lang; +#endif + + std::string ops_db_out = use_wsl_paths ? to_wsl_path(ops_db_abs) : ops_db_abs; + std::string app_dir_out = use_wsl_paths ? to_wsl_path(app_dir_abs) : app_dir_abs; + std::string root_out = use_wsl_paths ? to_wsl_path(root_abs) : root_abs; + std::string cache_dir = app_dir_out + "/cache"; std::ostringstream o; o << '{' @@ -434,10 +454,10 @@ std::string build_stdin_json(const std::string& job_id, << "\"node_name\":\"" << json_escape(node_name) << "\"," << "\"metadata\":" << (node_metadata.empty() ? "{}" : node_metadata) << "," << "\"params\":" << (params_json.empty() ? "{}" : params_json) << "," - << "\"ops_db_path\":\"" << json_escape(ops_db_wsl) << "\"," - << "\"app_dir\":\"" << json_escape(app_dir_wsl) << "\"," + << "\"ops_db_path\":\"" << json_escape(ops_db_out) << "\"," + << "\"app_dir\":\"" << json_escape(app_dir_out) << "\"," << "\"cache_dir\":\"" << json_escape(cache_dir) << "\"," - << "\"registry_root\":\"" << json_escape(root_wsl) << "\"" + << "\"registry_root\":\"" << json_escape(root_out) << "\"" << '}'; return o.str(); } @@ -1030,7 +1050,7 @@ void worker_loop() { } std::string stdin_payload = build_stdin_json( ctx.id, ctx.enricher_id, ctx.node_id, ctx.params_json, - ops_db, g_state->app_dir, g_state->registry_root); + ops_db, g_state->app_dir, g_state->registry_root, lang); ProcResult res = run_subprocess(job_id, run_path, lang, stdin_payload, ctrl); diff --git a/main.cpp b/main.cpp index 6c397a9..43cd088 100644 --- a/main.cpp +++ b/main.cpp @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #ifndef _WIN32 @@ -318,27 +320,91 @@ static void place_orphans_near_neighbors(GraphData& g, float min_dist, int park_n = 0; int placed_neighbor = 0, placed_camera = 0, parked = 0; - for (int i = 0; i < g.node_count; ++i) { - GraphNode& n = g.nodes[i]; - if (n.x != 0.0f || n.y != 0.0f) continue; + // ----- Pase 1: agrupar orphans por su anchor (vecino con posicion) ----- + // Cuando un enricher crea N nodos todos conectados al mismo source + // (caso tipico: web_search → N Urls SEARCH_RESULT_OF source), queremos + // que los N nodos clustereen MUY apretados alrededor del source en + // un solo anillo, no que se desperdiguen por anillos concentricos + // hasta encontrar slot libre. La busqueda anti-colision individual + // los empuja hacia fuera cuando ya hay vecinos preexistentes; aqui + // les damos a los hermanos del mismo anchor angulos repartidos en + // un anillo unico cerca del padre. + std::unordered_map> orphans_by_anchor; + std::vector orphans_no_anchor; + for (int i = 0; i < g.node_count; ++i) { + const GraphNode& n = g.nodes[i]; + if (n.x != 0.0f || n.y != 0.0f) continue; int parent = layout_first_placed_neighbor(g, i); - if (parent >= 0) { - float ox, oy; - if (find_collision_free_slot( - g, i, g.nodes[parent].x, g.nodes[parent].y, - min_dist, n.user_data, - neighbor_radii, n_neighbor_radii, &ox, &oy)) { - n.x = ox; n.y = oy; - } else { - // Acepta solape como ultimo recurso. - n.x = g.nodes[parent].x + neighbor_radii[n_neighbor_radii - 1]; - n.y = g.nodes[parent].y; + if (parent >= 0) orphans_by_anchor[parent].push_back(i); + else orphans_no_anchor.push_back(i); + } + + // ----- Pase 2: place clusters (orphans con anchor) ----- + // Para cada anchor con sus hijos, los repartimos en un anillo + // alrededor del padre. Si hay mas hijos de los que caben en el + // anillo base, abrimos anillos adicionales. Cada hijo sigue + // pasando find_collision_free_slot como fallback si el slot ideal + // estaba ocupado por otro nodo del grafo. + const float two_pi = 6.28318530718f; + for (auto& kv : orphans_by_anchor) { + int parent = kv.first; + std::vector& kids = kv.second; + if (kids.empty()) continue; + // Orden estable por user_data para que rondas sucesivas del + // mismo enricher (mismo set de hijos) coloquen igual. + std::sort(kids.begin(), kids.end(), + [&](int a, int b) { + return g.nodes[a].user_data < g.nodes[b].user_data; + }); + float cx = g.nodes[parent].x; + float cy = g.nodes[parent].y; + // Capacidad por anillo: circunferencia / min_dist. + // Para min_dist=60, ring r=80 -> ~8 slots; r=140 -> ~14. + for (size_t k = 0; k < kids.size(); ++k) { + // Anillo y slot dentro del anillo en funcion del indice. + int ri = 0; size_t accum = 0; size_t cap = 0; + for (; ri < n_neighbor_radii; ++ri) { + float r_here = neighbor_radii[ri]; + cap = (size_t)std::max(6.0f, two_pi * r_here / min_dist); + if (k < accum + cap) break; + accum += cap; } - n.vx = n.vy = 0.0f; + if (ri >= n_neighbor_radii) ri = n_neighbor_radii - 1; + float r_use = neighbor_radii[ri]; + cap = (size_t)std::max(6.0f, two_pi * r_use / min_dist); + size_t slot = k - accum; + // Jitter pequeno por user_data para que rondas distintas no + // queden alineadas si comparten anchor. + uint64_t seed = g.nodes[kids[k]].user_data; + float jitter = ((float)((seed >> 16) & 0xFF) / 255.0f) * (two_pi / cap); + float angle = jitter + (float)slot * (two_pi / cap); + float px = cx + r_use * std::cos(angle); + float py = cy + r_use * std::sin(angle); + // Si el slot ideal colisiona con un nodo ajeno al cluster, + // delegamos en find_collision_free_slot que probara mas + // angulos en radios crecientes. + GraphNode& kid = g.nodes[kids[k]]; + if (layout_no_collision(g, kids[k], px, py, min_dist)) { + kid.x = px; kid.y = py; + } else { + float ox, oy; + if (find_collision_free_slot( + g, kids[k], cx, cy, min_dist, seed, + neighbor_radii, n_neighbor_radii, &ox, &oy)) { + kid.x = ox; kid.y = oy; + } else { + kid.x = px; kid.y = py; // ultimo recurso: solape + } + } + kid.vx = kid.vy = 0.0f; ++placed_neighbor; - continue; } + } + + // ----- Pase 3: place orphans sin anchor (camera o parking lot) ----- + for (int i : orphans_no_anchor) { + GraphNode& n = g.nodes[i]; if (use_camera) { // Sin vecino → colocar dentro de la camara con ring placement. @@ -875,10 +941,29 @@ static void render_context_menu() { } else { for (const auto& s : specs) { if (ImGui::MenuItem(s.name.c_str())) { - char job_id[64]; - bool ok = ge::jobs_submit(s.id.c_str(), sql_id, lbl, - "{}", job_id, sizeof(job_id)); - if (ok) g_app.panel_jobs = true; + if (s.params.empty()) { + // Sin params editables: submit directo, comportamiento + // historico — un click y a correr. + char job_id[64]; + bool ok = ge::jobs_submit(s.id.c_str(), sql_id, lbl, + "{}", job_id, sizeof(job_id)); + if (ok) g_app.panel_jobs = true; + } else { + // Abrir ventana de configuracion. Inicializar + // buffers con los defaults del manifest. + g_app.enr_modal_id = s.id; + g_app.enr_modal_node_id = sql_id; + g_app.enr_modal_node_label = lbl ? lbl : ""; + g_app.enr_modal_param_bufs.clear(); + g_app.enr_modal_param_bufs.resize(s.params.size()); + for (size_t i = 0; i < s.params.size(); ++i) { + const std::string& dv = s.params[i].default_value; + auto& buf = g_app.enr_modal_param_bufs[i]; + buf.assign(256, '\0'); + std::snprintf(buf.data(), buf.size(), "%s", dv.c_str()); + } + g_app.enr_window_open = true; + } } if (!s.description.empty() && ImGui::IsItemHovered()) { ImGui::SetTooltip("%s", s.description.c_str()); @@ -891,6 +976,171 @@ static void render_context_menu() { ImGui::EndPopup(); } +// ---------------------------------------------------------------------------- +// Modal: configurar parametros de enricher antes de lanzar el job +// ---------------------------------------------------------------------------- +// Se invoca desde el context menu (Run enricher → click). Si el enricher +// declara `params` en su manifest, en lugar de submitear directamente, +// llenamos el AppState (ver bloque `enr_modal_*`) y aqui renderizamos el +// dialogo. El usuario ajusta valores y al pulsar Run construimos el +// JSON `{ "param": value, ... }` y lo pasamos a `jobs_submit`. + +static std::string json_escape_str(const std::string& s) { + std::string out; + out.reserve(s.size() + 8); + for (char c : s) { + switch (c) { + case '"': out += "\\\""; break; + case '\\': out += "\\\\"; break; + case '\n': out += "\\n"; break; + case '\r': out += "\\r"; break; + case '\t': out += "\\t"; break; + default: + if ((unsigned char)c < 0x20) { + char b[8]; + std::snprintf(b, sizeof(b), "\\u%04x", (unsigned char)c); + out += b; + } else { + out.push_back(c); + } + } + } + return out; +} + +// Renderiza una fila label/input dentro de una BeginTable de 2 columnas. +// El label va a la izquierda alineado al frame del input; el input usa +// todo el ancho disponible de la columna derecha. +static void labeled_row_begin(const char* label) { + ImGui::TableNextRow(); + ImGui::TableNextColumn(); + ImGui::AlignTextToFramePadding(); + ImGui::TextUnformatted(label); + ImGui::TableNextColumn(); + ImGui::SetNextItemWidth(-FLT_MIN); +} + +static void render_enricher_config_window() { + if (!g_app.enr_window_open) return; + + ImGui::SetNextWindowSize(ImVec2(420, 0), ImGuiCond_FirstUseEver); + if (!ImGui::Begin("Run enricher", &g_app.enr_window_open, + ImGuiWindowFlags_NoCollapse)) { + ImGui::End(); + return; + } + + const ge::EnricherSpec* spec = ge::enricher_by_id(g_app.enr_modal_id.c_str()); + if (!spec) { + ImGui::TextDisabled("(enricher no encontrado)"); + ImGui::End(); + return; + } + + ImGui::Text("%s", spec->name.c_str()); + if (!spec->description.empty()) { + ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.7f, 0.7f, 0.7f, 1.0f)); + ImGui::TextWrapped("%s", spec->description.c_str()); + ImGui::PopStyleColor(); + } + ImGui::Separator(); + ImGui::TextDisabled("Node: %s", g_app.enr_modal_node_label.c_str()); + ImGui::Spacing(); + + // Asegurar tamaño de buffers — un manifest puede haberse recargado + // con mas params de los que llenamos al abrir la ventana. + if (g_app.enr_modal_param_bufs.size() < spec->params.size()) { + g_app.enr_modal_param_bufs.resize(spec->params.size()); + } + + if (ImGui::BeginTable("##enr_params", 2, + ImGuiTableFlags_SizingStretchProp | + ImGuiTableFlags_NoBordersInBody)) { + ImGui::TableSetupColumn("name", ImGuiTableColumnFlags_WidthFixed, 110.0f); + ImGui::TableSetupColumn("value", ImGuiTableColumnFlags_WidthStretch); + + for (size_t i = 0; i < spec->params.size(); ++i) { + const auto& p = spec->params[i]; + auto& buf = g_app.enr_modal_param_bufs[i]; + if (buf.size() < 256) buf.resize(256, '\0'); + + ImGui::PushID((int)i); + labeled_row_begin(p.name.c_str()); + const std::string& t = p.type; + if (t == "int") { + int v = std::atoi(buf.data()); + if (ImGui::InputInt("##v", &v, 1, 10)) { + std::snprintf(buf.data(), buf.size(), "%d", v); + } + } else if (t == "float" || t == "double" || t == "number") { + float v = (float)std::atof(buf.data()); + if (ImGui::InputFloat("##v", &v)) { + std::snprintf(buf.data(), buf.size(), "%g", v); + } + } else if (t == "bool") { + bool v = (std::strcmp(buf.data(), "true") == 0 || + std::strcmp(buf.data(), "1") == 0); + if (ImGui::Checkbox("##v", &v)) { + std::snprintf(buf.data(), buf.size(), "%s", v ? "true" : "false"); + } + } else { + ImGui::InputText("##v", buf.data(), buf.size()); + } + if (!p.description.empty() && ImGui::IsItemHovered()) { + ImGui::SetTooltip("%s", p.description.c_str()); + } + ImGui::PopID(); + } + ImGui::EndTable(); + } + + ImGui::Separator(); + if (ImGui::Button("Run", ImVec2(100, 0))) { + // Construir JSON `{ "name": value, ... }` segun los tipos. + std::string j = "{"; + for (size_t i = 0; i < spec->params.size(); ++i) { + const auto& p = spec->params[i]; + const auto& buf = g_app.enr_modal_param_bufs[i]; + if (i) j += ","; + j += "\""; + j += json_escape_str(p.name); + j += "\":"; + if (p.type == "int") { + int v = std::atoi(buf.data()); + char b[32]; std::snprintf(b, sizeof(b), "%d", v); + j += b; + } else if (p.type == "float" || p.type == "double" || p.type == "number") { + double v = std::atof(buf.data()); + char b[64]; std::snprintf(b, sizeof(b), "%g", v); + j += b; + } else if (p.type == "bool") { + bool v = (std::strcmp(buf.data(), "true") == 0 || + std::strcmp(buf.data(), "1") == 0); + j += v ? "true" : "false"; + } else { + j += "\""; + j += json_escape_str(buf.data()); + j += "\""; + } + } + j += "}"; + + char job_id[64]; + bool ok = ge::jobs_submit(spec->id.c_str(), + g_app.enr_modal_node_id.c_str(), + g_app.enr_modal_node_label.c_str(), + j.c_str(), job_id, sizeof(job_id)); + if (ok) g_app.panel_jobs = true; + g_app.enr_window_open = false; + } + ImGui::SameLine(); + if (ImGui::Button("Cancel", ImVec2(100, 0))) { + g_app.enr_window_open = false; + } + + ImGui::End(); +} + // ---------------------------------------------------------------------------- // Label callback // ---------------------------------------------------------------------------- @@ -1742,6 +1992,9 @@ static void render() { ImGui::SetNextWindowSize(ImVec2(520.0f, 720.0f), ImGuiCond_FirstUseEver); ge::chat_render(&g_app.panel_chat); + // Enricher config window (abierto desde context menu Run enricher). + render_enricher_config_window(); + g_first_render = false; } diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index df002aec8545a8a1736b1907e002ab5eb92a5a33..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11234 zcmdTqYiv~4nfK1U^PaJ1@B=?ET-#uK6k}sTl9uqO9|IU1Lu`{2vuL=Udu{GyX71$N zJI0=jThmen7rMYE5>8=NXO-PLOMeh)S7oLBu}P(ES1VO`L1nn1QoD_`;-3jbin6QK ze&4xsXU49{qgk!A$DZ^2&Uaql>wM?_tKaWoAaworC(0`g4D$ypSb;M$?4w;ahPlE> z%qSzVk}b|AY@;?7^IV)8ZAtAf0iGum&!&r$bs0-A_glbm7is- z^^b0R0!*L;X44a3f+aA5#7XOtMsp;wJmYxRY8h0HG}^0wRPJb_=`mwR<{gZOFt$of`--Hm_=<=0BN+qxXFe#EwcBG~)MCoM2+L1OZ(=J5ysfao$6QW467J>mz$#GaN zow1UfRHI@ZjWDT*FeO3I(qk#2Mr956Jt@Ep5{d?kI;{{{fEHbis&U#srjBVH(Nqd{ zI~=$6Op{^8skj1&;fzj>CnLq}?S$?zIjTo=IgyHsy4+c8gRMU){QK;i9$306#6=uE zo=zkogS(s7#H0{caozb-?+}I+VFh|2DkjDh5vJ@h53pSTT9I)X<483E%#EH>T*-(X zo(ISzA=dDU$)u{A1TY9?{s@r913NT6K&fK53qu-?h^D8&<+Yy`JwQ z*?z`d&yT}0y3>NyT~^M5)Ty|}+L+I3E2bI9mOzvhTdZ6xXyuBj#7lN-bc&%ce%5A< zD><&&hMABv^V_f%CBT13NO0y!qzd5aR2mKh&UPBglW93VNdaIpEnZS4@gN1_C$ulFCGmKj|@kS40I2M z+=k~gQFIv?WJ))9L}=KhbRwl097qYnjeUkAef>j*P1S(pfKoL%WjJBu6av89gsg`+ z!v%sVf{g}`Gc(*6N5+-7tRaQLW_~u*c}h*lo$+*3)H;EUEHPv8sMe`!N>Yzd!P*J3 zOY2-IV>`|Lx0nky-tgo304DP&$VM2d34WRg(JZsf4tB?X&$k-hY>`S>kK1 zRsN*zt-6K2+qIz^`)~0zclcc&Ro7k%y^(x}{511+W~sXC?|OcI@V$dSAAE1{pLX4P zarpL*k)`U9`g(!NViM;L(8rvss@`$*{(81fd94 zHk(vZg5PQu`ojCVj}D9otssMdgj-E?AeMnYQmB4bqAS8%kZJ-5@pCcfA$4&=GvQWE zv552di9D>Prb#rv-Zhq!kabX`zzYyoG#635Hj_sMQWUn?3{QZ~Lq>@V(qD>*`YNpE zo}^PSupo$feM6QD=WoWBr+`!% zgv0cg*YPW({`{cR)>;y7PPS~)4jsr5+=nc zkSUsZD=gl2!PI}I!I{)#0>>#NRY}!4GX51PLJq^Jy&4Cd^|avxL`fh!!!J+C(KHAH zD%lLraPNWe3*C|77Y7Un7|01lH|$X`=Hw7Z>R`@>O&T+J>}{zh0(VQsaG2IQ2|?i~ z{IqU}K#@2Y&$>&8m)y;>J@>uqF7BP%JHP$5cjIhdj(6SVtC#rdH#2wmCMdmr=**$B zFJR{2nS*Bsa$f&zKiLFWR-7cB5sK! z1tKaLw1n{b(F)47)x6uRFmUtN2RI~vOzK1pbr>AL!rtMv0!$2h;uT4eIY@YC8MqPyn~UV3K9RWsY2|nyjku= z99c(f4W^UWESt4uxvaftPR9!7Gz^gsS$>Muz;mH6rAGL8!5{`!bi{COt3nCUIAZ=) zjZiGMPxB#rW=Frtc0eJN@sL<_B?Wg&GN$N6L?Qwkcm*$ZWUloO4u^+E1Q4T#OAH@H zY8TL|0oyfiynr=J^?tix?Qy#x6t!}@P*lMo;ic{Yuu+A!z3oEjztFmU+xBO-ZtH^o zk!{;{(*M!c_9xVb_5pqP#(W9}JkhQ!4&Zs8f0!1WNr1^a`F45dKe~*m2gnh|O?WE>^gR!C3iV z(bWw9g(`&C*lD2a89R)~OE&arks7@~%WGXAEjWw2k}URzS`K?d%I8z+XM-~Jlx6Lb z^O}8_Y5qdBcAA;ujxnd%W=2PmL&Iiw0eZ%2_aPNrlWoTX4H+1-g zB7eMmAUqU?j=T3h;9&%Ur$+F&9$@YP92)bk-LNSV*@Z2;F`~4FR2Om;X_)MU3d1!n z!zEHp%9L~|IUDx4DoUE+4o~5=6YR2(!=z|%E}*RmG)*bj;G$__a6~bHqP$PHMcCh`d80h7`Zq)H#*PFcfaMh zm^x zBj0~M_A$PykD=Os&*cN&vHL&MP|06u?%|mC1Dkqkm=CyhJyqNXHSV6x+y~7p#4CI$ zX#Y?KdgrS%5KAh*91|sjPk<|UHoiWiO;8r;V@`zO1RcO^4wGHJklFHZE11ohwXfx) z0h}i5IZPKflug!iNKVRAqeka{n?7^rPqL6!S))wROB5RX<2)-X8!vPi- zTya1mpt3UM1gay_kF7i~HMnuqv~IA39D*up`sCGA-e~iq;cap_xGcypKwU=2-$V4r zJnpqu-SbxD)>U6=zuXRN9^9SVyZ`1Zz~&W)*tyjmZwR9wu? zWzT=-9A9AmoX0oY`=LL0js@O#SLU`L?~lA$d8O`h-F)}F_*UPYU?cE;4|I1^fid3* z5@GWqyV&?Hf3xvs_djgTZQQiLz9rqTfdr`B@d?AapIP=YW$Qpa`pRbye!5&yA`LuD zpcbS73>B>SVGruk%Dg;yuBV-OzhzS|Yx|&mUC$ow7p%9p-2RI)7UC810F5&$54Qh5 zmj_wCkGTkIn_!~06ICD+?4Te`1!~W8u|3blc9V-iee#!W7uNI8W9BPyEk3$opVOeY z1~Z5Mh>J@6A7CvOTpUzk7M(4o#zY%z6v8>=@!nADlMTzlOJDj)uE1 z$B=y$w~|5Z(gIOl*_!IRz>(w#whuw{M|!r5vSDGp*bA@h1D8^sC^n zy8fl|wr?<3Rg0Ro9yM+KrvPhd+W-342Gtt+?XNu7y_b2vsjjDze}C_~o*?%CU9xbq z;_%hNsVfN?#8H8%aN`S1^=n|Q(i>iu!HaJG`GU{#Yq>UN>^4v$1&b4ITH4>}1sqh{#rXwr4r_GDevQk(`e&Sa;X$ROFJ5_8g{;%+Yo*^~CC3yyW1n&8l@@TU z+R7Ev80z;`!P>DISJp;?Sy!xn6#x{Ak2mn7gr*q{F(jE)L2tAw^d>83&9c~5+|db} zzQuxD>lMD}5|f;g6Yk)ypSiB*sb$73Ic7XrcMSPveD%4QSdM&&&9+pY6V2-(C=p^IARwFXourZdE}hU$rfybz3T-g_;lZJPa8wSN z?`sZo2*dL&c-Mtph={}pM#fbFPlVvUz&R(h3Bw1?k+HNA*Wr!OaKzz3J+2uw5$%+k zj!W=Plp>LdxH=}rBh+Q#j%ebzjMYY2H^l;%!7w2zffvC%cKnJ#OwxA^g8WTtAVU8r zJh3@U$Bf}AKGVZ>F`XnR60DcTNKgM@(TxJ$GkQRV9i9-Cl;I*Wd=Q|?5+H$xdOfbd zw-qI3tcTD;#tbp#Dxr2X(V|#HY%#Gfdj|Kl-184U-Gvt zeq+i1^z6QzH+XUP-0oX^*WB*&dokI2m+$%w-!&gwtX_28+Rz2X(*N6h*STlSijwRd zsNHSW{u^WwN9jM*eN-O2n3>C5{O;U$Z<2-)-nvYUsG#u`(=%L|7QCyn^}m#=ZrFlo}QS{ zqL8Nn%?B_1s9pX;$bdJU%ko)!7F3t%HZ3}8vjwMYNnPO^hhkmvtB4XsLBFH0L}n4T zsKl~PYedO*%}qVK6k@i`22@@yUF(uk&SXH;W3}TD59=0A#eMcXLWL|}74wv+q7cmG~0X2o`NuzB@F9wg5mcdRD z9zYBQ0CFCST@bARLZ{qk9ynYJEh78OgLeFJ9PPvkXgaeWAtD4Le-2T|L#8lK2b#tl zuFMp2bV$?{1_z%+&_0iY2NfSV0jEq6)6Ho$oRNs6Mk5i!6N!wc zsTzt9U=i>YTB?IgV6hg|7%H0%Q;vda=pZuvBE)bPstjkoj>tGeWKdI0pjAdx#KURR z`L9Fb${0O5`pFVtHe4_T_znan18Cq|t4SF@BO&01)Cd|6hPV635&Xg=GCb5vkg+L~ zL%ZP~pRr1Ms+5k)d&mORfq-W;8DbDdEc?*rWBE_&7`FL880IYVeeZ{i`$MMeLninkwk>DzCV-3_!X1zK(g+U^FnEd{pS2rdPlp7n!Rdt#$E7pR|qWhoF^3#@Y~ z&^hbQ1#0IHECn`Wu3^Eo6lhy~ZYi++#?hs~&YQ=U0{dn?j~rgsvAm(FjGy%^A7mMu z>-CLiHa@gD*|J9*V+$-}Zn?5hZ)g4addPwBx8PY8Zh{mZa?2j3^{JaJ4>=zj%+)k6 z({u;CKwvrC{|M7Z8-r}eF?)aoeVP-qwnR z@-0jK=cVtdR-vp?gPc{1hM|^pdW)C~W#1^TgUP^`*!`U7DSEkV+PYmZ%hq*}r8jhY zHW%{}Q@UkndHc4PDi$nTGs9-86Y!bt3rSoM_WPQ@lih}T@ z`i4GJDC^L0$&}?Q@68v~S$$Fc_Iw#SU)1x)yQgMtyJ9^%IB49~OO>KAkT*+%I<*V= z`J$Oq3uW7&hNZ(pU6f6mn(DX7e5F{(>%`p8_`P{UEt}Bzl%eL$vaM5@n3np!kvFUX zS-z>4rV2(`orey-`N!AP3Ji+q&#Jsf(3(C_@77=i;`01^mxtuVpdNabH$p11{5JSf zjgnp{s@V4R*IC>C^uRR1&`kABT6~uECU3t`HhBYbK8jOPzgpo5viR^b1y`31v1FGc z-mCLOtU?`$Aa$cobS`XP*CtUCy)WJ4YgKTfB znLEU}#`9jCeP6J;LjL<28k}?DJOVVVd4$(Do(;{i2<ajlS#WA|NlhPodoIag$Z)gq|ePbX(w~*9gt*4wmt}r zrP;ba43QVf5wvN{K**-q_|rV9`{-r39ywZHooykDW~(!p1@eTyI{Tfr`s&Occy(rZ ztNdACooQI1vYWA=<|`R?D)s&`zIRLrf6<n@Ro zT_X4Yk#v9m{T3o=pA4<$)A!Ht)$DfKxuB+#uK`@RI4 z(1U@{IGgm^3BJ_(=ymZpkikzR|GS(e=gE+ht?z?92i^yJxF`Os?}IYzgWYZy8U9dk zIwu5vhH(}944(@};WqF!$yk3yM#zN&$2UBGP2y{x=B@BH_MP$Huz+}9*e8y`2=>uy ze4{RY67}QVcHjuN@e%N6a1P2W!goZ3Kev5B$cp4s_BFn3dv>QnE2#0^o0O?)PixkRy zxY!!tY(89Ro))1^dJcU(3FJ|F_i(F6x$maX(o0-><@&2Pw41L(hp;cu&iT4?`A_PM z%0>r~K7}RFg`Dj2G5ez`8f8xcfNtd^il_HRf9N2H)o`btVPSYC9{^0yz_5-W_&f{D ztAYps)DhkVNa~6X9Q5%}L_`Eue=UAr6orq|lCT*0QFKAfMLlJjnkAMkSfp*BdHt>c z(-0Xr?}^mH6{D(;-Wi-VOU7VvKCfGY(`8tvY!wbxs5xikZEMhk{DrN}P`xrc2#eJk z^jXcgT`8K>po2E%Kzs^iq1?wnWzkDMH)E7Rla5!%P;a+}mZyde#TrbOpSG|Do-3Mp zy=aXO1coqr!8#1&--Q2M5^CwupLVWYS$=Qj=z6CA&V{Ya$)9%q+Cq%@L3^iIJsX;3 zSaocwfY~!u!~RrxQN38qMg5tsKGc{rf%;K+c`3qaiul}&~q*=4(e3AI7+Dy@$(jhXqizGEmpN8~H1-(t1w4~b_8*9m$@zPiPcNsEA zS1C2A*TGeAmh`fpAEKC0porS(D3TX|LKMFZ6oc%wl!(6_#o>wjnHPnDdQr$e zQ5BW&B;!R5N<9Tfz`8IT7KWI>`gkEMjE2?DhlP=_+QqOi=7|QyH9*I((xf z?_68$T$7frZA#~M(w!@}*VFw=-`J6qZK-=*>i*v1+E`8M-jqgmd)x{jqx+GwzwQqM{@&{=sgU>Wm;)U(swSCjgeM^?!4^V?EC+*?d!LO|EC zlt~*6c2nwKA%V^+_(u=H{e{%Oqoiw@{&hvINvH2z0a8=cJ6G0~r?;ikaBtNMn6%+b zjQRt|krD zl-%0QnliL4;k{KaVA6&!F+QMRvvxD6=WI|744CaGr)$!1O*y?rY6^rN@2z?PlQw*b z@c{*!H3%7MLp^7MYF7L}>0Lc>r@E;OFHQWTl3unyx>S>TS4AMpmsZ8^j&Do7aBne@ z2?1S~dOwD#LWPY68&rP|Ms`rm*-doCN&4AcV#eEkBq?2YtjO&a}p1orO@1=)3F zbX#iNTUf~oHX7`vGztc+_;cWQ+dw(!{`yxbq4g!v!rkp>qpfcpmUd-tachQmS9~ad z!mjMC{~G~`76i!3HnO1j!SyePWkKP01jyUQ?xB0Q+^AjGg%59w6Xh7s&vr^~%xxiY zH||JI{1)t2Zo)}GwzHlo6@)qL&7O;Z4xqXu+JsP@+{oh2VFr57#gT%a?QHl@?@S^3*Kq89(Z6Xc3Bf_OV4h$g=Mn^*|G% zo(=W7*$g#_I%$9Gk?e4l8OUBcv1WO8H$^(#bVP7cPLO*HL^h-eV-W!#=MVS1IzZ*a|i8C>l_}j_FPUoU(2EZ z5j;Z20!`%i(|GfnWkgy@AK%Fef{(mALl({6pTqFpV6GbG zx$*vlY?`g!AP?Yl23hDX@&xI3y6Pk8KJZS^&7;Pj9PdL$B@TaW@Yf1|Dfq)|bH>ds z3OV(<4}>WGVnuP2qL?ktwd^?s`{v%x_OSrrBE&uxKx}qC#@}tK{!fjx zpsUkx?ANLP3-GQi0euQRV)mP!KynHR92tB!FIW;47CZRF}0+t(kfX4q-mhv7relwW))m+=u&102zBpx>8~yn^J9 zki3fI8j{zLTu1Uc5HAY7_gVmY8fD_8IkK5D&7xV!_3WLx8)&B@@#A?Dxi^t~6Ukdh zzJ=s%BpQ$~rr3p0o?#%34Qc2+okV3Hx7CYAn8c9I(C{wl;x1`)msEGm?1s-A_?0#9RbSVNJW%L2NNrA&fj8E4@L7t*YCHjaEu^$Tg#M-zP+nvG=9hW+l3!4)THq>BtKkOTi7@WaBRHh)Af^M zpALiMlbUiID-<{=@4DB~3oF}bu$#axuu^Ddt-*rlzyX*-6HN4*32OE*;0>B*g<*b= z8i)^JUWHtz7uPgm<~7ZeHEnv{o~MSUQ9QoFKaAA#3jW*W^=d2=F~F8BBnbwn;b%3x zWKiVA>t+7CiCIy5wpf^AsgMNx$Hp`QX7&o>S(qgf@XO$*JLm**2$%h}NB;mhJmxX9 zop`QflKG-BPX7$ba96P~)$+@zD2l%jUi~NG_`eFmhr*U{=s%>082MTN@=IAxNlVFJ Q^&J++9zK^8pZER!52SyIegFUf diff --git a/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index 17eb6a1c21cf819ccc3117c67b7da537ed4ebb30..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9081 zcmeHNU2Ggja-P|poHl&q1gztscAbMmqjNs}I2UJ?U|--MH$-1#9$ z)%5htklut0pdc?9&DKcgbrW$Cd{3BYCLy4S1%I-B?saJFrlM-9Oq9H=6lpFyqflA7tIXSKU#G@nylw`R-|sgh?lW_=-ZTS*s7L@^uaToKx5y{i1h%ko=F zxkNQ39KKd0xok!&QT3IuET5If3YlE~6?rm4UQ_Pm@;%DkOre}tYb|p1z(`&(M%ndincfSPYM|slL?U+5XhPAgg&Jqs~Cp;2;?pIoF?> zQZgf%k*R@mL>Wmb{i)%R%;aQWYAT%dV<7CS0iGa>&wmraS=B^dp^J}fo`@BwBR-^t zU-!>??QM19C;rFMBfdri-A@9RzNAaD0q)a%YIU3^b&rD{J88C_*SJr&FGfIvUa(6S zNQl&vFo|eUr!2sGof987JhEFB1+CH15Oh)Z?O_>x6lD5FofhYoB&OH#p8gPGU<8ej zQBN9;upZRIw@*SzM31Py)DqlcwuN)1V7vJ}ai}P0b z(_V2gSao5eQ9H(0r!H`fE5aW)XMHn5)^`hb8KYTmCWkSeBz${V5NLqQf9m@vE(u1$ zXeLLD7{n}bhedg>9phyy%16=Se;#FW?1ioI1RwF9YuwRi^48*G}-Z;Cp@Yagu*QjTSBG)U@!G1+&y`wCZ8K)!KvF^fvY1Ja2vN zJ*W-#pi^k$z8&QBzE^#x$A69QsRrM5cJV$^uSJa=?%Njd7Mvz$_T3}zce_T9bSGMS zzK4z`^x%wVoHcr_J74%c^jdiw$k~r2>w6w2J)~EU+k0*MzW3U8?ukF`y*3PcZJW_b z`rZ@tmMa23`xp`JG@lDS!foJ-)u{fA^pn)Snd(QZ`EFj*gJ>uw$ zU=Ka_J2>#M-->tBz9ZbkN5G%X`KfUMzE>jrjOKYEIXG?xvKc^DrY}owCVghGoS^`# zR5PU9MJ!ahU8y64^b|-XFsDtO9Rb`FQi|CU$rYzf-+aGWM=j)I2H#O8%b97#jB?_eaFhjJtXm8S5;jH)VBvowZ41B!>hMO{vn0X~A8HtvR%IIjGqIqjR+F6$Tv zsp$+M3i%Q-6n5yM)1iLo&_&jv!K4av01fsd8Qj&GbF4GxC?W6xg#W;1WY#Z; z*-0z`|H-hydu#P4uM|xQAhMd2D4yiqw&+<9E8cjFumOx@9k2-+7?1%3qGy2lJwXI8 zstdOOo*JSKhkoloB_hcBx%dlF6h4Yc!h-KF{qthdZ`NximIPR!O<+C#P=N6HdiqR} zsyOqq{0-@yDHW97{9HDp_D&UH-LX~GTc)L1C9A2uB{f&n($h3kp6P|fr}kQGTDe=! zm#9K}HOxX-6v}F3A9~6QX8rkTr3e=EV&!*G-_gvKvT8a@l?;@oaXN^TwdgsYFJ&`% z^+z)+LacBBh;WvvyH8%KATRpt=qvh@o{_sJ|X5+Dk+HX|^j_)?Nh?O&*l?Drk z%^5Je<}78+nd#5u^GUxoMYy(76vLftqv*~I6mH`y7oMj%O|fz-X8qfjZoGMYEPdmO z8DSZaJ5W=ikR5_m1n(t9j7<@=o8i~W3)ZMJ)QqQLOnC@%nPy7`+)c~|s7_DLpt zsG4W*I0#c{+&XB9LSU#kE8+ytEsLSyv#|P7{PDZdIlq25R~t5kzHF zI*v+yg;FzuBVb)fd4+x^us#lYg>zoDKCdv~RU7gO!=|WETmp0mTlO?+tFa#JXlGQ# z-4F6YH7|M|9i80oilk{!$E`s9D|jmxgb4Yq|N8yw zD=n+i;`I&b>~^eW`R-cm(BkXcQe;bNTa(&;y0AK2mD)C>f$ea7D|~P*d~juYJ$!ue z@)uHcX=;V6c5JuHtKa?Gw?9q%%@3-luYT+`qo~ohAR$+uQNxs?@%eT8@7(yd|~6 zy~#u-ICKq5nY8Y*8&dmn+|gOS?68daOlse5>#9m!O9RVf>D4W%3+_!OGQpv1Sjwbz zm)(%MmWiX|Frd60iB%g9twrRjbn3w+Ak~Qc;L=*;*p_q(?oGRZN$Zxx_@}7PSfCG7 zhjkcm_QlcuC&xFAjy@N}=$W;nqbon!Zi+9T`jhB`&ZkYMp817X<5K0rTh*hZi(}7F z=UGT-NK~aGOT)`bHPXE$9f5n3$xK*Bfw2w+gJs33<}4JfNk>+`=cqB}GwH~7q^T;& zOGC?JOBc5!8SYIcGGQGB#yS-Am&crH&O*VOB(LmN{!Eg;h;*(Tubvp*h>R>=`Bfyg zqqnRxY3Ta{^>>-K;5ol&wo=SyRaOL!LMA2;IYF z_%&M>vK@80pJO{KJ1Q9gqmBfPpf2e_%!C*rJp_3=JF_YXv)G$m7XckWbMpP1X>s=Q|2v&B~^&i`%2BI$UlQ3W>R})VSyDnD~&hp_sbe)_liP*W0deT6mdVP*% zJj3o8-{YwUnsrea@27oY4;jH}j9>t=odJ51#Eb@zqk3Q#H|kv%Wbzt~MxTIrIqt`3 z(wmI9-lR&T33F7yH|TN5Kqd49W>A+!_3ucWG?T;&i?x%B-!s;ZCd4{n$E-yU&El7O z*X5#LQP2a+B0Wx8{_p$jnbjI=>7JYO{BPIXJ!iJ|@aNz<(rRPy)_c*K(Q?o_M5{?X zP1;EN9=@CTim~URO|R{_jqAZ*SKLw6z|(q^bnu-aCs+v9+lN}aK;hf)8t%p2 zUBJ^yjTq|9*g(4srmp(;fCmI{%9m)yXaQ97%{cf2*=b(nVe(hx9JHh3vToPm z*22pa(>?Sc5=@&rVEC|uJrN8acEB(@_Tt}Y=m{i8fmB)xITaAnwDb2=c2*9+xfo90 zLM4d65I_cf6;1HWT)AvvMZnGslk@{;gOYwgkai(rRo3!oQtg>mw6MW2CZKWvnMny@ z%BU#;>g>1ZfJKKYA+`BeyX3@rx5$2 zJWy$5$7%~Trx4R(@wsHof;ukBp<~$2RV1$?c>~EcB)^N~I+7beOh1gptOF#e6p0z* z=wz}~%9qN?1H0$x_t8#9f}x{tA$c3gO(fq#@_i)lAh`v^iwi~=wqgV5@7p1zc1Sj8 zV4A}_s)oHDZQQ~T%>dJ5LjSoP(x_KT*-bOel-;x~4ern!+93_AxI}AW7rT%_nB=!1 z%zRD*{HUd!Y?HMhRk{cc5%j5l4hsV?jEyEE;wx92fFUNTuKJSlzE?r$3s7hBq7^o&rJ(aGms=u6n zI1dF+kUX4!GQZyYDsWYcuJyk9X$mBZfB{w{u5L*??hSOw%GO;L3c&qO> zoCW*Uc~%M*Ph1xIz>0yc!OcahLr%>u2E0K>S)rHTrB+h}{pT=KLbAmSrqiU9O{dLp zIz2U~%~2(trg#d4|D9oH4y@PC>`b#zyJqXqQp@(4gNz4#BXsn&>EU_mNn5I&{8JB6E4=BK--J;r65A z_VdgyisI+Ojeiu5{EHyGCu|BG|1SAN-`4_=XAQxqv{?V^mTqzA`T3yux@bB4Po1w` AhyVZp diff --git a/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index be91d29573a9dcc646930fdf7b6ebcc45c869e63..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10386 zcmd@)TWlOhax=R_?qj&Se7|`dy}3S%ONtWpT2hoHoz5p&l4Ci((7sHD+e32YeYs~S zikn%0ar{9c7l1$>x4`KD-k$_1I3fA*{pB-2es;N}$s61O2jqane|F@{B?1DZYIz5f~t1v$9B{NLXuCq<6?Hze%K!w~-2T|j=s5pJ3z zJn^LXv}fAGBkoOkrv>ZTkZw#sS=u-4uhs;n15hKRf@yJDw8|P%P3h2dC>@>-rz6vm zbn|p`x@Ed0-8$WxZkuicp2q2R;`@Y~?jU}EolD+mAb<6?s_B|U)Rd%VXfiRcQfXGz z6Z7(-I+Ii8R4I|oNK_?KI-B@VRfrmGgkhGFaVMDN_t|avAIVqddrW8uh5|TYH|(U@PLbX9NA5etF?tjJz(>ZkkM!~ zJPAJr?eO{lo&{gQUkH#$A!zuF;DRvD%X~8{#ZaTCmNm=W%uP zq*agDJa?LHtoD3dMelN2hf6+KY4>IL9#^eK-JI>a`M@R)XNHIVYg*Ui4cbyk+XGL& z5ysl&YSHEi z-ROAoy4|wS2{B9OU5qz_+NMkAdT+DiR+KZ0Eq?zf=MR47+HIS-tWa-;JNC)fY4efT zg9dAcyT}L`HM**CTlc=_yW5szKizQ~#BS~I>nL>UN9?(V@7IYZW6l?>hn)O`GkPv_ z&T10%V-Bz5vDO;*0zx2R*?7hAxZ8{cC_oDuB7_YJK% zJ!+20NK#H5=`)yEuz&YU_@i zdug-unysa+-de~SZmLBv%?7*oOju=j&B*xZhsHHKHCl? z{y?x+ZVQ>}Pf5265FE`j)upg&U@4Vbl~bp=5gBr76?9KvVgi zd0o$G=VGzN#l>NFvMKgRFnyR*las_0G77Zv>q|Km@@ccCKdtJJs8xCMF*7`PL&txuub)CQVF8h>k@y(?b)FJf;`Q5sc-1nhpxliRaD1Nk!9C zs#~n5fEDUbMfvBANlk(zW|Yr&+VWV5Bh#B*&`psIQBHs8AeV>eKtlv?Td-8eM z)!8`fY}{~87zQ_ zL^%fu*!)FT=b%{d&v9UYNX$+r>>@y_MPNn9PQvtP!NO29ScrNlzNBz0C+HYp`35%Q zyy;gnV1s044)VR8WCrZ^eH4$jNMdZ51Cbi`vJT^3%miDG;0^a<2&~6`5*B-_rtKjEqVUIyU2qv+Mr3kdH347@ra9h zOwaI$$rBA-Q<9#45S!1Y)mUmFp=hz$3@kI|^|2hy-cb{}7RzeMj4sbnB{v^~#i7M4 zR95fhQdz3fn2srS3(7mzz;JHK48-Tu3~fA0Gq4fC=ex3=^7 z*InkW<5J*1LpX2XQ7~_&SII1y!8fj4zj@>8JM!(DFdb+?@Rbqdf~jZT07%UNEdel- zAi!SH*2+n#8B>5Ip+yDDDP2Bs3WVm}OYIC>YGB)>{HZuE}-Lf{HT}Tm& zRh~;_XOxu8USP7O%&I5>q|Ih&TG3@@w6r#7hA&z#A2=wvLTQ$oZ8i%|yOObvKq%ys zIf^DX8`)ws1I`Men2x7!BJd-)1|T{>`w_=H1pO8Oc(tc<);j=}1Bj)Eu+R$!F%;uB zI&6{fJ~Txf9aHawn;2yT)BmKK7Mw9y@jQSCz^--y!u3Gdk;h zbYus00zOfeS8Me8E`d9`JgcEKq_4Z^RTS|0@3Ver7MgUyPx~iupk?kiLTF{SNFH`> zclAGPD8;^4I(F^1UcTWi{)Yw_m!6UxGFJA)oc4;m0wC&SF z8*P(+aeeuX74OPB+byjn;n2#7)t39Gw}eA*Z!(e*4lH9SqbfCYLpZeB;_$4_6%)mG zS8qB64EanrwB6ZT5(ZbsR>{hREnyJuO-3@pfn_XZRHcS)2!pG{;VE8nP=cz8>|d$U@=sES1~d>#22>W09A5*P2sOTy9Oc>pW%;`y?8 zbW6Z{lhKT*Ai>aeWM-%v0tZMdMFz{FR1%J?T?9}PrL~J?@yM2d_hz+#Q5B0~_&V}4 z)D3~f5-F9%1EAg7+W<=9fwi~YI$8yel0Z3HuD$Kn&tdrr{H<+uym+$WmzFPXi%l!j z_a{oiQ1MLwD-*>xEro}+grQAF0|s{m35KpCGeZHcedM4H0+OnNi(QuTBWv%Mgb~#B z{jxZ+CE&eTEnrl|q8Pp|06#@O2B0J!dpK4Whqna0H>(AVs#p}m z*O8y0ZU`)3l9rCu_x>id#;?8h)6l~Y%hI`W^SR|Wx5f4?v9B!ll?3RquY$e@$iS9R zyElb_m9wm%QbRX{fs)wAiq{dZHE;^I#mwxcU1LaWoOR=YTbj{ch zPcC2nC9FsNug;c)o+1xmc~Z?jXSN4Bp*b{X_Ov2n_do$j*+|A9$=|bqpx2YuB+lh8V1zd*N7JbRDbX7u~u@ zlXa{fs&lL!-gT`0v)W<&9gJdjT_j>1?KbZ^R&UWmHshVJjn$rZn#Vd;$GFM5>K(w> z4$EPqUE{2`C3hdIx9q{|Y_v(MbF5yIUT{C zaJ1P`J;Dj>I>Jf0q}HDSJle%HsrzWRHkx|2svYfqJzp&$)#V)Rc9S00(e8f7zQ@rn zBpP~eP;?s8JEIVa|H`2Fhm3riooL8p?f32~s{({WA{xTTAk9VFMQRP_#Zp zds5tR&;tlC0!h$^T&cz*nx2r6JgetkH!!iK`@HoBm#{4qaiEA-xbxmjQ@WT=C3ir?6M=wt}UT!laYYA z?=qIMf=UhD5V|nJta7eG?6;%QXF}IYj{P=-!!!qmiguYk8Fs%3SvG^RJiDMTP*s*G zzNz6)Y1Pe~nD1#a2v~d7m zczK@x1^3pkxP$-Bai4I18Tb{~{X4b*EdFQv5x)1?Ek4X2vE=?A DH3m8s diff --git a/tests/_runner.py b/tests/_runner.py new file mode 100644 index 0000000..6734d10 --- /dev/null +++ b/tests/_runner.py @@ -0,0 +1,37 @@ +"""Trampoline para invocar enrichers desde tests. + +El Python embebido de Windows (`python-embed`) ignora `PYTHONPATH` por +diseno — el control de sys.path lo lleva el fichero `python312._pth`. +Para inyectar el stub `requests` de tests sin tocar ese fichero, los +tests llaman a este runner en vez de a `run.py` directamente: + + python _runner.py + +El runner anade `$_STUB_PATHS` al frente de `sys.path` y ejecuta el +script objetivo como si hubiese sido invocado directamente. +""" +from __future__ import annotations + +import os +import runpy +import sys + + +def main() -> int: + stub_paths = os.environ.get("_STUB_PATHS", "") + if stub_paths: + for p in stub_paths.split(os.pathsep): + if p and p not in sys.path: + sys.path.insert(0, p) + + if len(sys.argv) < 2: + sys.stderr.write("usage: _runner.py