From 6919ebfe9c3555a2d6b187b863fed84504c9a72c Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sat, 2 May 2026 16:10:13 +0200 Subject: [PATCH] feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../__pycache__/run.cpython-312.pyc | Bin 0 -> 17486 bytes enrichers/web_search/manifest.yaml | 11 + enrichers/web_search/run.py | 436 ++++++++++++++++++ .../conftest.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 11234 bytes ...xtract_domain.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 9773 bytes ...extract_links.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 9225 bytes ...text_entities.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 9081 bytes ...fetch_webpage.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 10386 bytes ...est_manifests.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 7958 bytes ...st_web_search.cpython-312-pytest-9.0.2.pyc | Bin 0 -> 13629 bytes .../__pycache__/requests.cpython-312.pyc | Bin 0 -> 4371 bytes tests/_stubs/requests.py | 89 ++++ tests/conftest.py | 237 ++++++++++ tests/fixtures/ddg_results.html | 22 + tests/test_extract_domain.py | 60 +++ tests/test_extract_links.py | 63 +++ tests/test_extract_text_entities.py | 59 +++ tests/test_fetch_webpage.py | 77 ++++ tests/test_manifests.py | 72 +++ tests/test_web_search.py | 97 ++++ 20 files changed, 1223 insertions(+) create mode 100644 enrichers/web_search/__pycache__/run.cpython-312.pyc create mode 100644 enrichers/web_search/manifest.yaml create mode 100755 enrichers/web_search/run.py create mode 100644 tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_extract_domain.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_extract_links.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_manifests.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc create mode 100644 tests/_stubs/__pycache__/requests.cpython-312.pyc create mode 100644 tests/_stubs/requests.py create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/ddg_results.html create mode 100644 tests/test_extract_domain.py create mode 100644 tests/test_extract_links.py create mode 100644 tests/test_extract_text_entities.py create mode 100644 tests/test_fetch_webpage.py create mode 100644 tests/test_manifests.py create mode 100644 tests/test_web_search.py diff --git a/enrichers/web_search/__pycache__/run.cpython-312.pyc b/enrichers/web_search/__pycache__/run.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cef5a499dd332007bc89ee04c4c44f393330302a GIT binary patch literal 17486 zcmc(G3ve6fo!>67cmo7T@U7QU5=B9xNK3LMi`K)GNIfixazx5LGxk9cb}51a0cv+4 zi3k|et?x`maz(|-2|7C``jVdMvuBdav{R?kUK1y6bIoOj6zK|H^Cr$+Iz4x$Q#w@U zJkCs~zyG&b0HUBc)9Fl?#JAsm`~Tkm_xInQ`Fw5;*YW@To7iwW$Nd96n3v7S@XM!p zj=RZ8Tt6rAk|oX$Tly_L<*jjRzm?sEeu3R>{Wf;D_uJXs(eGe)XTOu(UHvZHg}8gz z-R~au^m|x4wzzlL*Y9I_d%R-U-|rs|^aqA3`zujaE7eJkmu>x3QoZDSS?CW+4U!A@ zYN=6j<6a}Jl03NAN~aYA$T%IHlgg$5q$-|K;%#pCvoS>$N0g+Nj3(ovtZI>j6j8*@F;z{;;v)udRWLlmFtJ=G&dlL=}?4#%WcBa?^>%dO(DtVN`V7HJieBWhSW7aoadgRNp@ zWF#!blvXrUyrL0|ofV6Ud_JaX%4k?gCbd>^B%(xy)i$$RSy4K~cq}1DRPpT7r%xR@ z-QC;Uv3DSzjA;ACy%_R&dH-2fOB|48>0BgwL6l_i%&|ff$&}V1rVl8hxZF{MVNWQRDY zX(MV!d;6d^9B-4b=JY$CY>Os`Sy}smi~IIoOv%dVes`CADJ91*$;PJ8nrpFWvQ<=L z3Gu1OrATj7iH&G&;tA}MoQOxFvMOGR$qDSJoS>>2d*N1OEUFxh#xQ}i?d>aijkh02 zNz(a!d*uYpMZyxHe{nD}%I=bk(%3*O8lfpYjU^W&DJ?mSv7~_jz#%|OSn}bB8oPu$ z^^PS=X(=TsZdPTLx*d@f6!2yvbaOU zdMQZ^GltGmG?_g9o-HD9I4X^tb<;qxEr5>81bJvsNe;(S!`P=`pej})B4QQ$lF7I# zKAp$jHnAHc0S;rF?tw@=&JaP34UeFaw5%kz#S#Nbq|N;aQBsK4J&{BrNsFIMs68Qz z?gZ+}T5MR>os_1N2^mmsA|0%cDk~wY?gVnGa`>XEJ5q^@siX$rloDs9Q42MkzZ|%O zM)~D`fy_-#Gt0T5QmB0CP@Z8HfRK^+3@2G$7bI(jf1Q(rZ`q1W;0s!#zE@l(BLM^9 zLbjf?t3XI;Yuk#3^8zVQB<{*ln zKi@u>9G2VTsc1xPA4r5vLT(2~8Iq%#+MWbbYvJ=sWMr@%Y(Q;i*|2 zOwEjAMwzYIu^{Zc599-(b{Q49Xr7Bwo)hq68}u?V|2G?y;jk_Iw=Dn9w#Cw;crg~m zhXjac9ahh)G-6SyMygxmSd9kcTJTf1AX%{zS?g3PdvL0MK@e9?U#SIn6u-iSsTdU?V-Xwx3dhu>vJQZ)_^C}uun?Z$^}UmO z-*Ddc-tx|Sn(unr<~(h)?T6+)U9&>hgY%~@+4dhqrgZ)qtqi-$t%ONO5}(FU@0(lf zA5jxjVg<(Q0*zc*i*`x|Z>PZztI7rxKZ2iHkA%ivaZS5EIyrjvE7`s|f785Y{j9Lw z7%~mhyb3Jk8`vypRbUV?eoV+%6kEoU5hTlAKuT6gkZh9u4eOXKW6jtuaf&m893c*1 z91O-Oxg@vb0gT=<(oJI6dcXW=rgj9@O|P_WpinKKuA%nMjIM&ixb z->owBvN0b@JsBUy7JMhqR3GEUDl!#A#4lD{W?MsbD_-DHs+MZrtu@<-S|lI+na?~I zwLHhs?6z|%e;NAgbI`*m9razYKF3|=FLTB{R2ON&)^y3xzm)UPHX_hU&O^yCBs~-g z(lg@Fks!eF)K$>WAc9FV>V^ttDi&wUs1j$PY@v;dshAA4?Wta9bZBt|T9+74s;mh} zBB)}JoKTi7#LtSGi60HV5n%@c8J4J z85D@aP@8xxF$z{5fj}nlZs=JgprH+{H^J7UMW}i47`*_?&krS`K@3Mo?#s$Bv=sg=EN3Bjon& zJ01~xBLfj7mIqF^Yw{J1>1nMHQgJ9nq!hGYp)J^YWjOxC#eI*o?P=Z8zJ=u;Pglr^ zZD)F0pTxtNVj)Jqw`a$cLtfoU=!6DTT^LEC9f6iY_Z%A@Nh(@5X}r1@O~+&B+L)4~ z5aZG-@IV8j@~jYNj0W>o)YFjI#0Qb^|AMfpAf`Gn7@q@bMHL zj*PIeDefvQH@<2B zEIE1K6H9h3*v02oZT-~e@Y~0aECm9#PW~fjUCvv5{p94yoVPt!S(mF`h5scN=WAT% zY(Dn{zvSftmDev#UYhWIw&&bJ{`zd~ykETQZ<+JA%=<$V!iN<# zv$cDEDCE{Q-?Q@mmI>=GebrOF(=8wPw#-PmVBO6#^ zA6o^a<#nWUOyXp~|3)EohVZoVk_vi|Vu#Agr2#O;C2Ai6UEuqQO|qHLF0LqiGX>!~ z#%K6)@@A-vnk(7g$&XDrf?Cd%qXe%PnAQEb<}u#^j?}Idm)XvkC1V*P99wZo@J^iP zO&K9MNxqI*Ggif;Rhpm~tJz8~mthh_t1{m~J_;m5sJtLQT#-#oewd~i@z_B{9$>qslJ+zcBK=3_h}o;iI25&%Xlth2Ms+&h~WC5Mv=Y&l2)P<8BFTvo}9r|Y4HBI8QU zJ3HIj+BPXEZ6EGt71Ne10m;8IS($t#LUtCjcTo}jY*r!)j9OTTNii8sDG*p>X-5*N z1T0onOp4JIoDE44=5)R>Hok`>5PasuLkwK!(w6r2wA~m%+S(p(PuH5mx!)E}SKR-s z3$#>>8;9Jw8zzmSsh4Bgph6m~ZcoTsJP8dHvX~WvHNuaf5WXq<3Br0F5|)rlcXeMO zR!7!mB(4-DXypuE{~>3LRb=N(Hq*)`i1JZ<0G{%>~u)vou>eE+~=Wka^FKWdn$S$sTutl5;Sc>ai*;+WmtK2uHn0iY&Knp%{@Fbn=dJlQ$J=~4 zr#I*Ej~`#M;o)O_tp}U9_nTY?+pO*DTDbCBCBIDbddM3Cl ze7_eahF^DwflyB~dp-3fP*e96O^Mv6d2Nm)qJv3A+$ySx*a);IXlEddvkA?^*4QmK(m{f_7*>WnfU!T`BFQ25|ZLo7)cWUZb3Hi+<9=ka9&%-J9^6F2c94|!bC6ZVowFf z2&-4HF4$9n|HGvr^z=ck34p_33R)PRCor4=^>D}umqiMb(4FCMED_VfVbX(WdeC^n z;7hs-H#F5!RT}PCLMFm`k9Th&iVysdu~JXgmBH9YoePg*RU41irgx3LRfIt z=bBsa=+Cu4Gx)&SV3>x5MMQ(NOl%uQkh#fa_%7}RIG3LX%$NmY38Y41Pvx(9GghSj zqP?H7mNWZIbLpMDIE}*W>E(nHKY?9gc}pse1%UaT=$y@uNA4kY8;N7W6JT@#%X7jyo=)RC*1Kj#a3&=}}9_+M|34^^{|j zFta{}+=|)Q7)wtFO1GYv{MQL|1CkZFCkUz^6$#wn`9}6WsFt zb$MEQ>%});oL{&5?z(+*>-NpBJ1}4S~$?j;nkj;ukx;M%^lyG zhc;Wk>bCQibGquSx;N`)s_%w6=0Y9wO;6ly>YQupoNqcbU*C0q!(ubh)@s=M6>A@) zE8^yiuD*i|L=Q)dWaXtigd>)tmrTK{PUksjuduVlR#|&xDCLXW8|B{Sd*GEXutVJr zuNZde!QCcT(|x<`$?rCKuhO*-?lqPFE2^{)3GH-Ye@{)Gy6as(=UqSj=m*}-rF(SO zyJpV2=B{_+oOk1N-v{0;SbKAs3ql9Y6_)ug@$k@PezN@Xms5g@hZjmSY6)m|#q|&d zwE~JBO~M5!V#7*V4vs~bZ^uD3h?Z&LMxFa{V$2 z?$fXs+{|R8+VhafqGJg&sQL6_{B6e03b> z*k9ICFzvn!Pc%;BHAT#b1oFZQ${$g$PN=r@?uC3E6Cr8dVXA+3RJzCw$j z`|{RDRF22PVc>~`Qidl_8g|ID1WyUsW(+evdzt&xVz)Ve6L8qJELRD(nq{wGt6KJY zZ5x(Xdu`3js|DNEUwf^#hJSKfZS|jWNZ7KmcJ8zM$W_9a8Uk2AZAF8^-1srx1w*79 zT@`(MP!jlZ49RSR?8Px_(N|}dj|m#k$hgqL?+}zQgua>A@@-@G65pJ?+=-J^@Yr49 zRf<$9s0wsGrc~mLtj6rOc$W%C^DwO*&p0vyRBry-zKl@rv?=2#@&XuNmhwRFEB8CBaM%A-W7bse(<>a15dm<#j;(M*{s6 z$vF2bXVq2jc;}Kuu(jlzuIu(m`^06q1dw%1Irs<~X;H#c3_}@R zML|yPe6WO*hO=I>5cTDW#5yLF9zc{6_r>GtF7IU=6 zBjRvO9Y&-Ty-E~f^V-t8db>|_AL%S0D^M&8;f>5Z-rJWbQmTZ3;7&ZNGbd@ne^X>Y#j*G>X|13?@cdIO)`b8&$N zc*0#ZA9|~Stgf2R)k+ND&6>_;%a58}o!f0cdW1(BEmg`1K^U6u1$vfB%M2u-i8PQ; zfHAIuq;q7zZxP8MWDp;laY&Xo9Ai$c;7}>2`e&Sy6-I8};WOsS*p)gcZv_r+_Q6~= zE|N9_RudW$+1?8$&4V%*(N2(Tlon;L<~Q3ank1%lgM7gXKvLMp+(wKFMV&Nq2_eH9 zxiQZ}M&-%45!pc_a}3x@M&^dI(VOl*L{Y4zm|r4H)D-Z#P3(<{qmVg_A`k>XQ4@&N zWbqOM@CQXBZMwU7C=_xSI8Ej+lN!nm%I!x&2igeXK$zb^G6npLXouhCb+4RIQ;Hmp zsL@y~z5C45U7da1Me^zG?kiB%z5}hIVLVgieFyTi6_fTI&;`&{Sm(pB=|ASVo=<*? z{?e1jdV0H0_ld`P`c4&F+{_|vS&*$sl@OK5loGu!?3M-iYBKdy+G|LBrt`#^?p|>- z27`Y_&$yY+|BhntT?+#sun-&oL}Dz|O8GNd3!9W09#NHVQRxOMutv_ojgBzS!H`$^ zb1Ek3R+O_P>*!>g5lfucEt;zQ1=V&jiXS4jfipM6bxFEF+OSG47=!dn89*hN2|4#m zSilbexP+hjb0p&&xqsJXwdr1*p^qP3bOo+9j34|sSepyh=YowmHD{T1x$4?a{Z6la zyz5>SS6Oq@eZ!r7bVgnXJUV{tLkDE4_qrG10N(DLuQqGR9))naXS2H8kgQZL+2;MTz5|FYXk0h(*i`k@hRG+U zduOU=HcUS^-jx%aubg=4#FEA0IslJ^I{<*u>Gjj+W}TsXE}PAcI00|f_({Va6EH3F zB6h-9$kWI?T+$irqDasP5EIHN;U1<^k`RT-a_114WJ-v9WoK9Y73jp7jEY zJA1lBEqVVl?Hyj7ICk<_pST@15FrW(hvG2E)Uq)Y(H*QMY_<&z#3bcCKq=owq7qkQ zJ8q~%@Z1_hK)fhKo2vXZ-XpHxJQT=t{3Q$TTJy0hFx8x`o^`HWb_gDD|0OrH4T=f8 zx_`QQ*3&#I9T(xjG~t;n=jDUPk^=ikC}3v?>_zrCYKLe1FcZ@h8xs#J-^D1>7fKn5 zxuPmU{3&*yag>ssg&J;dYODBraub86FqY95va?NMJOc*q>FyKZL#NL4^mU&$7zWgX z%ZNQ9!~iJc82=7=<=<1n5aC}VM<#k4sPUiNqbAd%6Fx5FH)Qq2iTiqnzH!|78Oy`XB-J!RK-igor;jC}(_~8PD@~(X! zyZoTf%hR@LWoE~$v+bTk0EGgx0tS7e)528{@&5yqT6z##?&Rfn8NFlpyrhUj<-FZR z1DXZL6ohbLEI3Lv#jt4t;uMfC8^}^HUQNg$QfF3NCT4=0ly7B)X&U38As0eAp&?TU zGrLTDRcgO*mW7dl7RvYffT25vP}Q=tbl$%n7@tEU8-rai>&vYn)D?5IW^Dw&mWTD2 z5Wm+dLbWN|qjyCszTdeQ?k}7{h2(!=1|9?{Su%FjTU6tufY~qn$rjixYuq#tx)&d7#`>g_n!NXY` z={iX)>9N_YJG9Bn74Mk>&5Zdo{-IFujhU00%pT#2svisBR|%(D1uWR(uwbjSttMo~ zpQ)18(=g25Nv@9twQUbV0CEFC;=VC|Z)SJzyPMu2Ed$qBbtb6VG6AWDq}o_OYd6YK z%b%%+y}H?4Z&{(hb0Wpp)65sLSD%>3O*qwmhbSF2XqJd1kqpG zXM}Lkv4t#&X=gq{OU;9Rg0WHH`Y6HHJ$<_K$jMG|AgRc)^NH{Uc~sqZ>hR%mrWsW7 zhZ$^!9vSHE?mT_yC=|KgGbj4Orw*UT0C6qBc$J{jD{tcgt&pj#{3RvNQ!+=%9VCa9 zcaeq~6>`zj*JJpKRM|^!$)HqjQ$qHP;d4EO90B3`M`fBy>HI_?w#_~PV{zQNZA76D z3IB>p$w{hw6N&CnFCyef-l;ptw}cNC3@eoxus$XlDg`Jx3VBdT&bSKz8 zb9(%^*~sJb!HsucG=`vAf=$Id9Lr_v!KO4?O`^wQIrIjjF#Ayc2y> z9Sg$RC7g*>E>^9|cF*j2@0p)&T&OxV;kZ|UxP1WZSrFDNR;=<_Qx%K;weW{L zGxO~1uA?9LkL3dErnk=pHs0_rt>UU0mR55JDfUcyuK7NO(*qvA?8-KEUsFcz5M##+^SG+)s{~^_F8gPAmq+x_pG|d*=?(rDmbt2vwP23 zn0w!XB_;R%#_2CjADZ=UUT}sM{ej;u+3@1G_p0+wRGcR?Ts=x|)W+Z5Yh@J)y{V#z z?>eos-i-^+W*p&OlL+Czvvd05w|9N7JJ-7HhfSYa`NnR3ndAM3`Fn849^rpQ9k|fI zZ*U2^3zisF^S#SM$y$$)Fi&ZCg;B#Oc=YnS0@#`^k z0-Khd{NR&We4Vb40c7y7Kcd_p6NI49SAQ!@E$kfGJUeF4#P&jNgU>KN$V3ifhKv=l zvM29vi^|Z$m&%qhX2ohz`)M~j@g}$Y3wcxg^Qt@(Z98P}P=vJ`@}ZFp zH|mXv5eCMbs4PcOM8-sKP>v0qvCTqr#)RD^?J#cud(T_~EXVu;g*rMu0ZPV0>y>M0 zln&I{H|aP~SHLehcq8fbjHHWhNvgU9Um1Tyy^{b^)>A?<*7(rW!@jXI&K4EYe3dcg$|Q2S0EwWj{4CVD z*9ZyNul!HI@c60!9SMkn=RXxHdEqx3Iex=GaGsCo-^KnbKH}VX`~}zYcij44aF5P$ zkKVIe__}2`OL!iH;pF)Z%gz?QlV9%R8~DJz$B*)S{Z!*Uj_%9PSn7HI@-81ASQ6Lr zRk?NRm#lQ(vdy^f+GE`JcP>-;(m~$AZ^AcHOIEtC-?U8kB@lAuxOd5pX4pHr(>vUk q#ix1Rw=`hkz*5q8tPMHAJAUYuqc0tu?7RBR2Let{IH8d_QvMI^%Yk12 literal 0 HcmV?d00001 diff --git a/enrichers/web_search/manifest.yaml b/enrichers/web_search/manifest.yaml new file mode 100644 index 0000000..20d719f --- /dev/null +++ b/enrichers/web_search/manifest.yaml @@ -0,0 +1,11 @@ +id: web_search +name: "Web search (DuckDuckGo)" +description: "Busca el nombre del nodo en DuckDuckGo (HTML) y crea N nodos Url con los resultados, conectados al origen con relacion SEARCH_RESULT_OF. Pensado para nodos text/Concept/Topic — el siguiente paso es correr fetch_webpage sobre cada Url resultante." +applies_to: [text, Text, Concept, Topic, Query] +emits: [Url] +relations: [SEARCH_RESULT_OF] +params: + - { name: limit, type: int, default: 10 } + - { name: region, type: string, default: "" } + - { name: safe, type: string, default: "moderate" } + - { name: timeout_s, type: int, default: 15 } diff --git a/enrichers/web_search/run.py b/enrichers/web_search/run.py new file mode 100755 index 0000000..b09b15d --- /dev/null +++ b/enrichers/web_search/run.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +"""Enricher web_search — busca en DuckDuckGo HTML y crea nodos Url. + +Wire protocol estandar (issue 0026): + - stdin: JSON con node_id, node_name, metadata, ops_db_path, app_dir, + cache_dir, registry_root, params. + - stderr: lineas `PROGRESS: ` para feedback de UI. + - stdout: una linea JSON al final con resumen. + - exit code 0 = ok, !=0 = error. + +DDG endpoint usado: https://html.duckduckgo.com/html/?q= +Devuelve HTML estatico, sin JavaScript. Los enlaces vienen envueltos en +redireccion `//duckduckgo.com/l/?uddg=` que hay que decodificar. + +Para automatizar busquedas masivas en el futuro (sesion persistente, +cookies, JS, captchas) la fase 2 introducira un enricher `web_search_cdp` +que controle un Chromium remoto via DevTools Protocol. Este es el +fallback simple zero-infra. +""" +from __future__ import annotations + +import html +import json +import os +import re +import sqlite3 +import sys +import time +from datetime import datetime, timezone +from html.parser import HTMLParser +from urllib.parse import parse_qs, unquote, urlparse + + +def progress(p: float, stage: str = "") -> None: + sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n") + sys.stderr.flush() + + +def log(msg: str) -> None: + sys.stderr.write(f"{msg}\n") + sys.stderr.flush() + + +def now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def now_ms() -> int: + return int(time.time() * 1000) + + +def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str: + """Descarga la pagina HTML de resultados de DuckDuckGo. + + El endpoint `html.duckduckgo.com` no requiere JS y respeta los + parametros `kl` (region) y `kp` (safe search: 1 strict, -1 off, + -2 moderate). Inyecta cookie para que el "moderate" se aplique sin + pantalla intermedia. + """ + params = {"q": query} + if region: + params["kl"] = region + safe_map = {"strict": "1", "moderate": "-1", "off": "-2"} + if safe in safe_map: + params["kp"] = safe_map[safe] + + headers = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.7", + } + try: + import requests # type: ignore + r = requests.post( + "https://html.duckduckgo.com/html/", + data=params, + headers=headers, + timeout=timeout, + ) + return r.text + except ImportError: + from urllib.parse import urlencode + from urllib.request import Request, urlopen + body = urlencode(params).encode() + req = Request("https://html.duckduckgo.com/html/", data=body, + headers=headers) + with urlopen(req, timeout=timeout) as resp: # type: ignore + return resp.read().decode("utf-8", errors="replace") + + +def decode_ddg_href(href: str) -> str: + """Decodifica el href de DDG, que envuelve la URL real en `uddg=`. + + Formatos posibles: + //duckduckgo.com/l/?uddg=https%3A...&rut=... + /l/?uddg=https%3A... + https://example.com/... (raro, pero ocurre con anuncios o cuando DDG + no envuelve) + """ + if not href: + return "" + if href.startswith("//"): + href = "https:" + href + elif href.startswith("/l/"): + href = "https://duckduckgo.com" + href + + try: + u = urlparse(href) + if u.netloc.endswith("duckduckgo.com") and u.path == "/l/": + qs = parse_qs(u.query) + target = qs.get("uddg", [""])[0] + if target: + return unquote(target) + except Exception: + pass + return href + + +class _DDGParser(HTMLParser): + """Extrae resultados (anchor + snippet + rank) del HTML de DDG. + + No intenta ser completo — solo busca `` para el + titulo/url y `` (o el div equivalente) + para el texto. Es robusto a cambios menores: si DDG renombra clases, + el enricher devolvera 0 resultados pero no peta. + """ + + def __init__(self) -> None: + super().__init__(convert_charrefs=True) + self.results: list[dict] = [] + self._cur: dict | None = None + self._in_title = False + self._in_snippet = False + self._title_buf: list[str] = [] + self._snippet_buf: list[str] = [] + + def _classes(self, attrs: list[tuple[str, str | None]]) -> set[str]: + for k, v in attrs: + if k == "class" and v: + return set(v.split()) + return set() + + def _href(self, attrs: list[tuple[str, str | None]]) -> str: + for k, v in attrs: + if k == "href" and v: + return v + return "" + + def handle_starttag(self, tag: str, attrs): + if tag != "a": + return + cls = self._classes(attrs) + if "result__a" in cls: + if self._cur: + self._flush() + self._cur = {"href": self._href(attrs), "title": "", "snippet": ""} + self._in_title = True + self._title_buf = [] + elif "result__snippet" in cls and self._cur is not None: + self._in_snippet = True + self._snippet_buf = [] + + def handle_endtag(self, tag: str): + if tag != "a": + return + if self._in_title: + self._cur and self._cur.update( + title=" ".join("".join(self._title_buf).split()) + ) + self._in_title = False + elif self._in_snippet: + self._cur and self._cur.update( + snippet=" ".join("".join(self._snippet_buf).split()) + ) + self._in_snippet = False + + def handle_data(self, data: str): + if self._in_title: + self._title_buf.append(data) + elif self._in_snippet: + self._snippet_buf.append(data) + + def _flush(self): + if self._cur and self._cur.get("href"): + self.results.append(self._cur) + self._cur = None + + def close(self) -> None: + if self._cur: + self._flush() + super().close() + + +def parse_ddg_html(htmltxt: str) -> list[dict]: + """Parsea el HTML de DDG y devuelve [{url, title, snippet, rank}].""" + p = _DDGParser() + try: + p.feed(htmltxt) + p.close() + except Exception as e: + log(f"DDG parser failed: {e}") + + out: list[dict] = [] + seen: set[str] = set() + for i, r in enumerate(p.results): + url = decode_ddg_href(r.get("href") or "") + if not url or not url.startswith(("http://", "https://")): + continue + if url in seen: + continue + seen.add(url) + out.append({ + "url": url, + "title": r.get("title") or "", + "snippet": r.get("snippet") or "", + "rank": len(out) + 1, + }) + return out + + +def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None: + """Busca un nodo Url existente con la misma url en metadata.""" + cur = conn.execute( + "SELECT id, metadata FROM entities WHERE type_ref='Url'" + ) + for row in cur: + meta_raw = row[1] or "{}" + try: + meta = json.loads(meta_raw) + except Exception: + continue + if isinstance(meta, dict) and meta.get("url") == url: + return row[0] + return None + + +def insert_url_entity(conn: sqlite3.Connection, url: str, title: str, + snippet: str, rank: int, query: str) -> str: + """Crea un nodo Url y devuelve su id. Si ya existe, lo reusa y refresca.""" + existing = find_url_entity(conn, url) + ts = now_iso() + meta = { + "url": url, + "title": title, + "snippet": snippet, + "rank": rank, + "query": query, + "engine": "duckduckgo", + "found_at": ts, + } + meta_json = json.dumps(meta, ensure_ascii=False) + if existing: + conn.execute( + "UPDATE entities SET metadata=?, updated_at=? WHERE id=?", + (meta_json, ts, existing), + ) + return existing + + new_id = f"Url_{now_ms()}_{rank}_{abs(hash(url)) % 100000}" + name = title[:200] if title else url[:200] + conn.execute( + "INSERT INTO entities (id, name, type_ref, source, metadata, " + " created_at, updated_at) " + "VALUES (?, ?, 'Url', 'enricher:web_search', ?, ?, ?)", + (new_id, name, meta_json, ts, ts), + ) + return new_id + + +def relation_exists(conn: sqlite3.Connection, from_id: str, to_id: str, + name: str) -> bool: + cur = conn.execute( + "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? " + "AND name=? LIMIT 1", + (from_id, to_id, name), + ) + return cur.fetchone() is not None + + +_REL_COUNTER = 0 + + +def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str, + name: str) -> bool: + global _REL_COUNTER + if relation_exists(conn, from_id, to_id, name): + return False + ts = now_iso() + _REL_COUNTER += 1 + rel_id = f"rel_{now_ms()}_{_REL_COUNTER}_{name.lower()}" + conn.execute( + "INSERT INTO relations (id, name, from_entity, to_entity, " + " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)", + (rel_id, name, from_id, to_id, ts, ts), + ) + return True + + +def main() -> int: + raw = sys.stdin.read() + try: + ctx = json.loads(raw) + except Exception as e: + log(f"stdin not valid JSON: {e}") + return 2 + + node_id = ctx.get("node_id") or "" + node_name = (ctx.get("node_name") or "").strip() + metadata = ctx.get("metadata") or {} + if isinstance(metadata, str): + try: + metadata = json.loads(metadata) + except Exception: + metadata = {} + ops_db_path = ctx.get("ops_db_path") or "" + params = ctx.get("params") or {} + limit = int(params.get("limit", 10)) + region = (params.get("region") or "").strip() + safe = (params.get("safe") or "moderate").strip() + timeout_s = int(params.get("timeout_s", 15)) + + if not node_id or not ops_db_path: + log("missing node_id / ops_db_path") + return 2 + + # Normalizar backslashes a forward slashes — el path puede llegar + # con separadores mezclados desde el lado C++ si fs::path se + # construyo en otro contexto (build cross-platform, copy entre + # Windows y WSL, etc.). + ops_db_path = ops_db_path.replace("\\", "/") + app_dir_raw = (ctx.get("app_dir") or "").replace("\\", "/") + + # Resolver a absoluto si llega relativo, usando app_dir como + # ancla y cwd como fallback. Sin esto sqlite3 crea un fichero + # vacio si el cwd del subprocess no coincide con el del padre. + if not os.path.isabs(ops_db_path): + if app_dir_raw and os.path.isdir(app_dir_raw): + cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path)) + if os.path.exists(cand): + ops_db_path = cand + if not os.path.isabs(ops_db_path): + ops_db_path = os.path.abspath(ops_db_path) + + if not os.path.exists(ops_db_path): + log(f"ops_db_path no existe: {ops_db_path} (cwd={os.getcwd()})") + print(json.dumps({"error": "ops_db not found", + "ops_db_path": ops_db_path, + "cwd": os.getcwd(), + "entities_added": 0, "relations_added": 0})) + return 7 + + # Schema check — si no hay tabla entities, el path es incorrecto + # o la operations.db esta sin bootstrappear. + try: + _c = sqlite3.connect(ops_db_path) + try: + row = _c.execute( + "SELECT name FROM sqlite_master " + "WHERE type='table' AND name='entities'" + ).fetchone() + finally: + _c.close() + if not row: + log(f"sin tabla 'entities' en {ops_db_path}") + print(json.dumps({ + "error": "operations.db sin tabla 'entities' — " + "verifica que graph_explorer haya cargado un " + "proyecto valido antes de lanzar el enricher", + "ops_db_path": ops_db_path, + "entities_added": 0, "relations_added": 0})) + return 8 + except sqlite3.Error as e: + log(f"sqlite open failed: {e}") + return 9 + + # Query: prioridad metadata.query > metadata.text > node_name. + query = (metadata.get("query") or metadata.get("text") or node_name).strip() + if not query: + log("nodo sin query (metadata.query / metadata.text / name)") + return 2 + + progress(0.10, "fetching") + try: + htmltxt = fetch_ddg(query, timeout=timeout_s, region=region, safe=safe) + except Exception as e: + log(f"DDG fetch failed: {e}") + print(json.dumps({"error": str(e), "query": query, + "entities_added": 0, "relations_added": 0})) + return 4 + + progress(0.55, "parsing") + results = parse_ddg_html(htmltxt) + if limit > 0: + results = results[:limit] + log(f"DDG returned {len(results)} results") + + progress(0.80, "applying") + conn = sqlite3.connect(ops_db_path) + conn.execute("PRAGMA foreign_keys=OFF") + entities_added = 0 + relations_added = 0 + try: + for r in results: + existed = find_url_entity(conn, r["url"]) is not None + url_id = insert_url_entity( + conn, + url=r["url"], + title=r["title"], + snippet=r["snippet"], + rank=r["rank"], + query=query, + ) + if not existed: + entities_added += 1 + if insert_relation(conn, url_id, node_id, "SEARCH_RESULT_OF"): + relations_added += 1 + conn.commit() + finally: + conn.close() + + progress(1.0, "done") + print(json.dumps({ + "query": query, + "engine": "duckduckgo", + "results": len(results), + "entities_added": entities_added, + "relations_added": relations_added, + }, ensure_ascii=False)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df002aec8545a8a1736b1907e002ab5eb92a5a33 GIT binary patch literal 11234 zcmdTqYiv~4nfK1U^PaJ1@B=?ET-#uK6k}sTl9uqO9|IU1Lu`{2vuL=Udu{GyX71$N zJI0=jThmen7rMYE5>8=NXO-PLOMeh)S7oLBu}P(ES1VO`L1nn1QoD_`;-3jbin6QK ze&4xsXU49{qgk!A$DZ^2&Uaql>wM?_tKaWoAaworC(0`g4D$ypSb;M$?4w;ahPlE> z%qSzVk}b|AY@;?7^IV)8ZAtAf0iGum&!&r$bs0-A_glbm7is- z^^b0R0!*L;X44a3f+aA5#7XOtMsp;wJmYxRY8h0HG}^0wRPJb_=`mwR<{gZOFt$of`--Hm_=<=0BN+qxXFe#EwcBG~)MCoM2+L1OZ(=J5ysfao$6QW467J>mz$#GaN zow1UfRHI@ZjWDT*FeO3I(qk#2Mr956Jt@Ep5{d?kI;{{{fEHbis&U#srjBVH(Nqd{ zI~=$6Op{^8skj1&;fzj>CnLq}?S$?zIjTo=IgyHsy4+c8gRMU){QK;i9$306#6=uE zo=zkogS(s7#H0{caozb-?+}I+VFh|2DkjDh5vJ@h53pSTT9I)X<483E%#EH>T*-(X zo(ISzA=dDU$)u{A1TY9?{s@r913NT6K&fK53qu-?h^D8&<+Yy`JwQ z*?z`d&yT}0y3>NyT~^M5)Ty|}+L+I3E2bI9mOzvhTdZ6xXyuBj#7lN-bc&%ce%5A< zD><&&hMABv^V_f%CBT13NO0y!qzd5aR2mKh&UPBglW93VNdaIpEnZS4@gN1_C$ulFCGmKj|@kS40I2M z+=k~gQFIv?WJ))9L}=KhbRwl097qYnjeUkAef>j*P1S(pfKoL%WjJBu6av89gsg`+ z!v%sVf{g}`Gc(*6N5+-7tRaQLW_~u*c}h*lo$+*3)H;EUEHPv8sMe`!N>Yzd!P*J3 zOY2-IV>`|Lx0nky-tgo304DP&$VM2d34WRg(JZsf4tB?X&$k-hY>`S>kK1 zRsN*zt-6K2+qIz^`)~0zclcc&Ro7k%y^(x}{511+W~sXC?|OcI@V$dSAAE1{pLX4P zarpL*k)`U9`g(!NViM;L(8rvss@`$*{(81fd94 zHk(vZg5PQu`ojCVj}D9otssMdgj-E?AeMnYQmB4bqAS8%kZJ-5@pCcfA$4&=GvQWE zv552di9D>Prb#rv-Zhq!kabX`zzYyoG#635Hj_sMQWUn?3{QZ~Lq>@V(qD>*`YNpE zo}^PSupo$feM6QD=WoWBr+`!% zgv0cg*YPW({`{cR)>;y7PPS~)4jsr5+=nc zkSUsZD=gl2!PI}I!I{)#0>>#NRY}!4GX51PLJq^Jy&4Cd^|avxL`fh!!!J+C(KHAH zD%lLraPNWe3*C|77Y7Un7|01lH|$X`=Hw7Z>R`@>O&T+J>}{zh0(VQsaG2IQ2|?i~ z{IqU}K#@2Y&$>&8m)y;>J@>uqF7BP%JHP$5cjIhdj(6SVtC#rdH#2wmCMdmr=**$B zFJR{2nS*Bsa$f&zKiLFWR-7cB5sK! z1tKaLw1n{b(F)47)x6uRFmUtN2RI~vOzK1pbr>AL!rtMv0!$2h;uT4eIY@YC8MqPyn~UV3K9RWsY2|nyjku= z99c(f4W^UWESt4uxvaftPR9!7Gz^gsS$>Muz;mH6rAGL8!5{`!bi{COt3nCUIAZ=) zjZiGMPxB#rW=Frtc0eJN@sL<_B?Wg&GN$N6L?Qwkcm*$ZWUloO4u^+E1Q4T#OAH@H zY8TL|0oyfiynr=J^?tix?Qy#x6t!}@P*lMo;ic{Yuu+A!z3oEjztFmU+xBO-ZtH^o zk!{;{(*M!c_9xVb_5pqP#(W9}JkhQ!4&Zs8f0!1WNr1^a`F45dKe~*m2gnh|O?WE>^gR!C3iV z(bWw9g(`&C*lD2a89R)~OE&arks7@~%WGXAEjWw2k}URzS`K?d%I8z+XM-~Jlx6Lb z^O}8_Y5qdBcAA;ujxnd%W=2PmL&Iiw0eZ%2_aPNrlWoTX4H+1-g zB7eMmAUqU?j=T3h;9&%Ur$+F&9$@YP92)bk-LNSV*@Z2;F`~4FR2Om;X_)MU3d1!n z!zEHp%9L~|IUDx4DoUE+4o~5=6YR2(!=z|%E}*RmG)*bj;G$__a6~bHqP$PHMcCh`d80h7`Zq)H#*PFcfaMh zm^x zBj0~M_A$PykD=Os&*cN&vHL&MP|06u?%|mC1Dkqkm=CyhJyqNXHSV6x+y~7p#4CI$ zX#Y?KdgrS%5KAh*91|sjPk<|UHoiWiO;8r;V@`zO1RcO^4wGHJklFHZE11ohwXfx) z0h}i5IZPKflug!iNKVRAqeka{n?7^rPqL6!S))wROB5RX<2)-X8!vPi- zTya1mpt3UM1gay_kF7i~HMnuqv~IA39D*up`sCGA-e~iq;cap_xGcypKwU=2-$V4r zJnpqu-SbxD)>U6=zuXRN9^9SVyZ`1Zz~&W)*tyjmZwR9wu? zWzT=-9A9AmoX0oY`=LL0js@O#SLU`L?~lA$d8O`h-F)}F_*UPYU?cE;4|I1^fid3* z5@GWqyV&?Hf3xvs_djgTZQQiLz9rqTfdr`B@d?AapIP=YW$Qpa`pRbye!5&yA`LuD zpcbS73>B>SVGruk%Dg;yuBV-OzhzS|Yx|&mUC$ow7p%9p-2RI)7UC810F5&$54Qh5 zmj_wCkGTkIn_!~06ICD+?4Te`1!~W8u|3blc9V-iee#!W7uNI8W9BPyEk3$opVOeY z1~Z5Mh>J@6A7CvOTpUzk7M(4o#zY%z6v8>=@!nADlMTzlOJDj)uE1 z$B=y$w~|5Z(gIOl*_!IRz>(w#whuw{M|!r5vSDGp*bA@h1D8^sC^n zy8fl|wr?<3Rg0Ro9yM+KrvPhd+W-342Gtt+?XNu7y_b2vsjjDze}C_~o*?%CU9xbq z;_%hNsVfN?#8H8%aN`S1^=n|Q(i>iu!HaJG`GU{#Yq>UN>^4v$1&b4ITH4>}1sqh{#rXwr4r_GDevQk(`e&Sa;X$ROFJ5_8g{;%+Yo*^~CC3yyW1n&8l@@TU z+R7Ev80z;`!P>DISJp;?Sy!xn6#x{Ak2mn7gr*q{F(jE)L2tAw^d>83&9c~5+|db} zzQuxD>lMD}5|f;g6Yk)ypSiB*sb$73Ic7XrcMSPveD%4QSdM&&&9+pY6V2-(C=p^IARwFXourZdE}hU$rfybz3T-g_;lZJPa8wSN z?`sZo2*dL&c-Mtph={}pM#fbFPlVvUz&R(h3Bw1?k+HNA*Wr!OaKzz3J+2uw5$%+k zj!W=Plp>LdxH=}rBh+Q#j%ebzjMYY2H^l;%!7w2zffvC%cKnJ#OwxA^g8WTtAVU8r zJh3@U$Bf}AKGVZ>F`XnR60DcTNKgM@(TxJ$GkQRV9i9-Cl;I*Wd=Q|?5+H$xdOfbd zw-qI3tcTD;#tbp#Dxr2X(V|#HY%#Gfdj|Kl-184U-Gvt zeq+i1^z6QzH+XUP-0oX^*WB*&dokI2m+$%w-!&gwtX_28+Rz2X(*N6h*STlSijwRd zsNHSW{u^WwN9jM*eN-O2n3>C5{O;U$Z<2-)-nvYUsG#u`(=%L|7QCyn^}m#=ZrFlo}QS{ zqL8Nn%?B_1s9pX;$bdJU%ko)!7F3t%HZ3}8vjwMYNnPO^hhkmvtB4XsLBFH0L}n4T zsKl~PYedO*%}qVK6k@i`22@@yUF(uk&SXH;W3}TD59=0A#eMcXLWL|}74wv+q7cmG~0X2o`NuzB@F9wg5mcdRD z9zYBQ0CFCST@bARLZ{qk9ynYJEh78OgLeFJ9PPvkXgaeWAtD4Le-2T|L#8lK2b#tl zuFMp2bV$?{1_z%+&_0iY2NfSV0jEq6)6Ho$oRNs6Mk5i!6N!wc zsTzt9U=i>YTB?IgV6hg|7%H0%Q;vda=pZuvBE)bPstjkoj>tGeWKdI0pjAdx#KURR z`L9Fb${0O5`pFVtHe4_T_znan18Cq|t4SF@BO&01)Cd|6hPV635&Xg=GCb5vkg+L~ zL%ZP~pRr1Ms+5k)d&mORfq-W;8DbDdEc?*rWBE_&7`FL880IYVeeZ{i`$MMeLninkwk>DzCV-3_!X1zK(g+U^FnEd{pS2rdPlp7n!Rdt#$E7pR|qWhoF^3#@Y~ z&^hbQ1#0IHECn`Wu3^Eo6lhy~ZYi++#?hs~&YQ=U0{dn?j~rgsvAm(FjGy%^A7mMu z>-CLiHa@gD*|J9*V+$-}Zn?5hZ)g4addPwBx8PY8Zh{mZa?2j3^{JaJ4>=zj%+)k6 z({u;CKwvrC{|M7Z8-r}eCk2r1~ZIh-A6w-!<0G@WlMtAKf)r-4p z{9rXToetyB&eY5>)F1gMf6gQf>0dwl!5=?z>@@C9`GL&P4nNSeK>NXP&b_;Ulx!<)4!XL2UegYg3?|mPTw*(@L3q&M=yjTc~ z2Smh!`QW(3o}ofZ7U~M&@d#^?@~wsFc(fpo%Z1o@tPmfM7ZT%%Lfd#7w1-|3#@k5? zzzz}y*f|?iBU2wiH|KT3G!&xe6}?Dv*-JWA^cjZoe4FN#Y^kUebWDJD=k zpqxDYjpxoj^FrqQbByCO^ixl=pXdhBmAs}vho%nWqk8s|#>V-C%9virDKKb9y;7F$o?VwUDVS!AQ^DwmN$GEntXvx=5&>y>2L@FiCDqwj^4Zo+!d)3 zsSO4yRD@XPz5CSpa5Uz-lEAY6{PsrM7N7h?jl#t&>AEv}>ZfdX+iTAnWl@CwYol&N8+N6V_|?E%3(U3E$I)4n>S9)l z$B~$WO44Sv=0wIBHMh7_J{eoxy4uruRELC)9Q4kiUQ33L*E)U(zh+Ibb!8XeWmS{cAvie-{S_zq>LlqVJyz1$yRNU;yDpaxZlKzU>#Oaqi&+uU z>gsDwU_JQHbE6aI=@8k*PF%0G#Tm`kO`o`}?)coNbK+z?b?}S9FAl#1{Mz6L=g8dY zjaeXlH@3T@uB6D0*90r|l;GZ8m58~^qm8&dRJyHhV^%{Tmjl0n;6SaEbmU$R?e7DS=2ewNO zyQ6f+*mRe^!~wgshwR<-i2SFmf!?f+#=bZTMo7S4`?uxokM{B}xx_dzjwvpBzSbXvb!ir2pIIptR%`_uyg^j##q>VKr z44QEOd1{*B<7u=CvOAV0bhXoR(D6!o$fKm+>Ty<~Z_}&L=l1A6efM6t(mgAv1842P zRN4;cGj#}!0}RKpI??6rU$gn9%+U$jS0XojIv zlTnNS1yqlKT=dYvDbHwx=;UMcE9b%}WRQ~#+QHJ4sX|vLj;rF_1#C(~Tl7aRrtnS= zyLEs#Rl};$4^cq0vL9)WsK!Coc^ZbQMk~q*&X|V5K;!_!8Afmz!4X@6ZVoXx$l%Ca zgmpLy!@{=FxsJLS4T4Z?7(8UR@F?b*NYTn@AN>KmM-gA(-B>saC!;019CYFf^mjl=CI3(*YBsNCu&< zE_y7_+O5YX^&*hc6LX_Tm;oitU_b~3C&0SgX06SWXQdN;HCaBXP+*43jE)kFo64P8$9K8n3vV(cOXiG3R zLzP$;s4i)WJ_mgSz+A$o=Ad%)0Cp}<&{DyPMxk4FU_iEeU|=e0fIf*baIAGQpi?GT zO;Wu2C@u?yR0GWDiSjJl0FBxm8Cc>xta+Kj)c{d8gtE|_xj2=}o4KN4xA0A{yEDe6 z(o`O|r%W@G`O-x#pW#P7V`vi)n3qx7M2QwOGsEXnFedH9lkD_DoPCVkcaRi=|S&RHhKn3G^5ObV}*t0Ms26!!(L<8BGDOgAlsY9a!AQ ziaMpXj1%D_8T1#@yugP#>@OVllO6IGM*M|Qe_@O-81E;Yt}6}Km4-MQ?{|nd@k07= zUFk?&$?1So4yNrePskYlLDY#Jp-#%SRNr!_I`GZvzO$bN#nAKO-Dv01-sOGoZC~%*xjg*t$olq& zst=#6_MQ4P6p&Ag^QS(I2wh55IdpUPYVz=pzcGJuA-M3udPiqfN-Yd7c3c^~BcO-e0xcr=TrJd`8vOQ{c}v4x?kGUs9BN_(ujCp!ly1h7;cyzgjCgir3YJCdN8^nJt#l0 zu;t%N56Vp(aW8BkvHMOB##un0c(Ca~6WulTfq=K|M!OgA%A^C55^`Nq!b`H%!gL6H zVoj40UYJ5U*&LFa-X(vEp+RtIZgWTPh9gbG-7j%Ku$gSKYnx3<)Pl_}Cn*u%bYSHr z=-iojNr@N)Ra+pa($jN^DTqFv(5C2Q4x=YK#9mWCKtu@D4THA9a;PmE{1FxxQB3?& z+-!;~z+xYY`$@MWVBQMmr|3fnb|Xk57zFTOz5iBk_-E-bb}<6Lar>Edr(;ktcaWKQ z9bdz2aLgO{;^?0mI4{Q+AA2XY{3Q3qm!7@dw(p(P=kzgXrbiJl|Lr(nYGQ*sN>5?? zQwUBYcpAYo2+kn*8iKDQcou=@41%0`7mz-O?VbxrpU2`k1m8mN0)q1hGz1qBWC0kc z;`Qzi_jh`}axrKTSa7?Hoe-mWPEJ)FTTks+R;weYtLoF}<(#SWa;gW8zBm18P?Vns z@8)iD=l|Nb`LNEg!*;DEz{Z|L!#=huBf27kK|J}sbn;#BHBS1!t4PNyeCV2V5_oy_ zEx;N31~z&L7OKwHc`w9fAS7vI>vF;k$!F(ajrj*|%MfWHVFKxAu!Vk_V2y^oF!R$z z8@@1;8(PHJeOkmgvxtce7O`#9cWtvo-oIxN+nU_1HH+AO-xjfB4wjm9{ueD`@J6y` z5z!jH8OsJHg>2v@i2u;`kuJPbe-?P7RT3(3sEJ?3duYR_5+MQhdMyGk*Bl*!mup*` zmuve$7Q9?D{^VOL^ujiLNZewtsPKhcEk#ID4{V1$8`Lya>rsOBO!-EQ;ns#v%~xnb z=dDZ=y8D0i)O|I?Qoo%($&Fr~)clklrdivp&K!Rgo7}V=PrCc-K3zLr|1xM6bV!6> zB{nUny8&=&ynCo+!E?_do}Q8}rTLv^J=1NWoep7Rp7 z>N~hq$3Ka{|5x0!W6prI7WjFD&H#5m{ERo4X}{~_FSftey`Jhpo8IeIb-f!}@7`YB zdAOPy#fGtl8I7aWzGIltH~|Tbmj6RSqb{0P_N~VD|1!LkUO2b#!`Hv}c6k1DHMW0U zZh!OWm7`T@Xt4s|%F)Hjs=NoV#~`Q9nEAW zrcC%_QYJ&u34lNHIGIc4P1-#fp6l%A`8h~K_EUYD=1kpg^{VW!Q^$N!I|6@Onm|3_ zIaB(4<^6#o)v o5v9)(g1GySg7BK~Hz6nn82BXCEF?)aoeVP-qwnR z@-0jK=cVtdR-vp?gPc{1hM|^pdW)C~W#1^TgUP^`*!`U7DSEkV+PYmZ%hq*}r8jhY zHW%{}Q@UkndHc4PDi$nTGs9-86Y!bt3rSoM_WPQ@lih}T@ z`i4GJDC^L0$&}?Q@68v~S$$Fc_Iw#SU)1x)yQgMtyJ9^%IB49~OO>KAkT*+%I<*V= z`J$Oq3uW7&hNZ(pU6f6mn(DX7e5F{(>%`p8_`P{UEt}Bzl%eL$vaM5@n3np!kvFUX zS-z>4rV2(`orey-`N!AP3Ji+q&#Jsf(3(C_@77=i;`01^mxtuVpdNabH$p11{5JSf zjgnp{s@V4R*IC>C^uRR1&`kABT6~uECU3t`HhBYbK8jOPzgpo5viR^b1y`31v1FGc z-mCLOtU?`$Aa$cobS`XP*CtUCy)WJ4YgKTfB znLEU}#`9jCeP6J;LjL<28k}?DJOVVVd4$(Do(;{i2<ajlS#WA|NlhPodoIag$Z)gq|ePbX(w~*9gt*4wmt}r zrP;ba43QVf5wvN{K**-q_|rV9`{-r39ywZHooykDW~(!p1@eTyI{Tfr`s&Occy(rZ ztNdACooQI1vYWA=<|`R?D)s&`zIRLrf6<n@Ro zT_X4Yk#v9m{T3o=pA4<$)A!Ht)$DfKxuB+#uK`@RI4 z(1U@{IGgm^3BJ_(=ymZpkikzR|GS(e=gE+ht?z?92i^yJxF`Os?}IYzgWYZy8U9dk zIwu5vhH(}944(@};WqF!$yk3yM#zN&$2UBGP2y{x=B@BH_MP$Huz+}9*e8y`2=>uy ze4{RY67}QVcHjuN@e%N6a1P2W!goZ3Kev5B$cp4s_BFn3dv>QnE2#0^o0O?)PixkRy zxY!!tY(89Ro))1^dJcU(3FJ|F_i(F6x$maX(o0-><@&2Pw41L(hp;cu&iT4?`A_PM z%0>r~K7}RFg`Dj2G5ez`8f8xcfNtd^il_HRf9N2H)o`btVPSYC9{^0yz_5-W_&f{D ztAYps)DhkVNa~6X9Q5%}L_`Eue=UAr6orq|lCT*0QFKAfMLlJjnkAMkSfp*BdHt>c z(-0Xr?}^mH6{D(;-Wi-VOU7VvKCfGY(`8tvY!wbxs5xikZEMhk{DrN}P`xrc2#eJk z^jXcgT`8K>po2E%Kzs^iq1?wnWzkDMH)E7Rla5!%P;a+}mZyde#TrbOpSG|Do-3Mp zy=aXO1coqr!8#1&--Q2M5^CwupLVWYS$=Qj=z6CA&V{Ya$)9%q+Cq%@L3^iIJsX;3 zSaocwfY~!u!~RrxQN38qMg5tsKGc{rf%;K+c`3qaiul}&~q*=4(e3AI7+Dy@$(jhXqizGEmpN8~H1-(t1w4~b_8*9m$@zPiPcNsEA zS1C2A*TGeAmh`fpAEKC0porS(D3TX|LKMFZ6oc%wl!(6_#o>wjnHPnDdQr$e zQ5BW&B;!R5N<9Tfz`8IT7KWI>`gkEMjE2?DhlP=_+QqOi=7|QyH9*I((xf z?_68$T$7frZA#~M(w!@}*VFw=-`J6qZK-=*>i*v1+E`8M-jqgmd)x{jqx+GwzwQqM{@&{=sgU>Wm;)U(swSCjgeM^?!4^V?EC+*?d!LO|EC zlt~*6c2nwKA%V^+_(u=H{e{%Oqoiw@{&hvINvH2z0a8=cJ6G0~r?;ikaBtNMn6%+b zjQRt|krD zl-%0QnliL4;k{KaVA6&!F+QMRvvxD6=WI|744CaGr)$!1O*y?rY6^rN@2z?PlQw*b z@c{*!H3%7MLp^7MYF7L}>0Lc>r@E;OFHQWTl3unyx>S>TS4AMpmsZ8^j&Do7aBne@ z2?1S~dOwD#LWPY68&rP|Ms`rm*-doCN&4AcV#eEkBq?2YtjO&a}p1orO@1=)3F zbX#iNTUf~oHX7`vGztc+_;cWQ+dw(!{`yxbq4g!v!rkp>qpfcpmUd-tachQmS9~ad z!mjMC{~G~`76i!3HnO1j!SyePWkKP01jyUQ?xB0Q+^AjGg%59w6Xh7s&vr^~%xxiY zH||JI{1)t2Zo)}GwzHlo6@)qL&7O;Z4xqXu+JsP@+{oh2VFr57#gT%a?QHl@?@S^3*Kq89(Z6Xc3Bf_OV4h$g=Mn^*|G% zo(=W7*$g#_I%$9Gk?e4l8OUBcv1WO8H$^(#bVP7cPLO*HL^h-eV-W!#=MVS1IzZ*a|i8C>l_}j_FPUoU(2EZ z5j;Z20!`%i(|GfnWkgy@AK%Fef{(mALl({6pTqFpV6GbG zx$*vlY?`g!AP?Yl23hDX@&xI3y6Pk8KJZS^&7;Pj9PdL$B@TaW@Yf1|Dfq)|bH>ds z3OV(<4}>WGVnuP2qL?ktwd^?s`{v%x_OSrrBE&uxKx}qC#@}tK{!fjx zpsUkx?ANLP3-GQi0euQRV)mP!KynHR92tB!FIW;47CZRF}0+t(kfX4q-mhv7relwW))m+=u&102zBpx>8~yn^J9 zki3fI8j{zLTu1Uc5HAY7_gVmY8fD_8IkK5D&7xV!_3WLx8)&B@@#A?Dxi^t~6Ukdh zzJ=s%BpQ$~rr3p0o?#%34Qc2+okV3Hx7CYAn8c9I(C{wl;x1`)msEGm?1s-A_?0#9RbSVNJW%L2NNrA&fj8E4@L7t*YCHjaEu^$Tg#M-zP+nvG=9hW+l3!4)THq>BtKkOTi7@WaBRHh)Af^M zpALiMlbUiID-<{=@4DB~3oF}bu$#axuu^Ddt-*rlzyX*-6HN4*32OE*;0>B*g<*b= z8i)^JUWHtz7uPgm<~7ZeHEnv{o~MSUQ9QoFKaAA#3jW*W^=d2=F~F8BBnbwn;b%3x zWKiVA>t+7CiCIy5wpf^AsgMNx$Hp`QX7&o>S(qgf@XO$*JLm**2$%h}NB;mhJmxX9 zop`QflKG-BPX7$ba96P~)$+@zD2l%jUi~NG_`eFmhr*U{=s%>082MTN@=IAxNlVFJ Q^&J++9zK^8pZER!52SyIegFUf literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_extract_text_entities.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17eb6a1c21cf819ccc3117c67b7da537ed4ebb30 GIT binary patch literal 9081 zcmeHNU2Ggja-P|poHl&q1gztscAbMmqjNs}I2UJ?U|--MH$-1#9$ z)%5htklut0pdc?9&DKcgbrW$Cd{3BYCLy4S1%I-B?saJFrlM-9Oq9H=6lpFyqflA7tIXSKU#G@nylw`R-|sgh?lW_=-ZTS*s7L@^uaToKx5y{i1h%ko=F zxkNQ39KKd0xok!&QT3IuET5If3YlE~6?rm4UQ_Pm@;%DkOre}tYb|p1z(`&(M%ndincfSPYM|slL?U+5XhPAgg&Jqs~Cp;2;?pIoF?> zQZgf%k*R@mL>Wmb{i)%R%;aQWYAT%dV<7CS0iGa>&wmraS=B^dp^J}fo`@BwBR-^t zU-!>??QM19C;rFMBfdri-A@9RzNAaD0q)a%YIU3^b&rD{J88C_*SJr&FGfIvUa(6S zNQl&vFo|eUr!2sGof987JhEFB1+CH15Oh)Z?O_>x6lD5FofhYoB&OH#p8gPGU<8ej zQBN9;upZRIw@*SzM31Py)DqlcwuN)1V7vJ}ai}P0b z(_V2gSao5eQ9H(0r!H`fE5aW)XMHn5)^`hb8KYTmCWkSeBz${V5NLqQf9m@vE(u1$ zXeLLD7{n}bhedg>9phyy%16=Se;#FW?1ioI1RwF9YuwRi^48*G}-Z;Cp@Yagu*QjTSBG)U@!G1+&y`wCZ8K)!KvF^fvY1Ja2vN zJ*W-#pi^k$z8&QBzE^#x$A69QsRrM5cJV$^uSJa=?%Njd7Mvz$_T3}zce_T9bSGMS zzK4z`^x%wVoHcr_J74%c^jdiw$k~r2>w6w2J)~EU+k0*MzW3U8?ukF`y*3PcZJW_b z`rZ@tmMa23`xp`JG@lDS!foJ-)u{fA^pn)Snd(QZ`EFj*gJ>uw$ zU=Ka_J2>#M-->tBz9ZbkN5G%X`KfUMzE>jrjOKYEIXG?xvKc^DrY}owCVghGoS^`# zR5PU9MJ!ahU8y64^b|-XFsDtO9Rb`FQi|CU$rYzf-+aGWM=j)I2H#O8%b97#jB?_eaFhjJtXm8S5;jH)VBvowZ41B!>hMO{vn0X~A8HtvR%IIjGqIqjR+F6$Tv zsp$+M3i%Q-6n5yM)1iLo&_&jv!K4av01fsd8Qj&GbF4GxC?W6xg#W;1WY#Z; z*-0z`|H-hydu#P4uM|xQAhMd2D4yiqw&+<9E8cjFumOx@9k2-+7?1%3qGy2lJwXI8 zstdOOo*JSKhkoloB_hcBx%dlF6h4Yc!h-KF{qthdZ`NximIPR!O<+C#P=N6HdiqR} zsyOqq{0-@yDHW97{9HDp_D&UH-LX~GTc)L1C9A2uB{f&n($h3kp6P|fr}kQGTDe=! zm#9K}HOxX-6v}F3A9~6QX8rkTr3e=EV&!*G-_gvKvT8a@l?;@oaXN^TwdgsYFJ&`% z^+z)+LacBBh;WvvyH8%KATRpt=qvh@o{_sJ|X5+Dk+HX|^j_)?Nh?O&*l?Drk z%^5Je<}78+nd#5u^GUxoMYy(76vLftqv*~I6mH`y7oMj%O|fz-X8qfjZoGMYEPdmO z8DSZaJ5W=ikR5_m1n(t9j7<@=o8i~W3)ZMJ)QqQLOnC@%nPy7`+)c~|s7_DLpt zsG4W*I0#c{+&XB9LSU#kE8+ytEsLSyv#|P7{PDZdIlq25R~t5kzHF zI*v+yg;FzuBVb)fd4+x^us#lYg>zoDKCdv~RU7gO!=|WETmp0mTlO?+tFa#JXlGQ# z-4F6YH7|M|9i80oilk{!$E`s9D|jmxgb4Yq|N8yw zD=n+i;`I&b>~^eW`R-cm(BkXcQe;bNTa(&;y0AK2mD)C>f$ea7D|~P*d~juYJ$!ue z@)uHcX=;V6c5JuHtKa?Gw?9q%%@3-luYT+`qo~ohAR$+uQNxs?@%eT8@7(yd|~6 zy~#u-ICKq5nY8Y*8&dmn+|gOS?68daOlse5>#9m!O9RVf>D4W%3+_!OGQpv1Sjwbz zm)(%MmWiX|Frd60iB%g9twrRjbn3w+Ak~Qc;L=*;*p_q(?oGRZN$Zxx_@}7PSfCG7 zhjkcm_QlcuC&xFAjy@N}=$W;nqbon!Zi+9T`jhB`&ZkYMp817X<5K0rTh*hZi(}7F z=UGT-NK~aGOT)`bHPXE$9f5n3$xK*Bfw2w+gJs33<}4JfNk>+`=cqB}GwH~7q^T;& zOGC?JOBc5!8SYIcGGQGB#yS-Am&crH&O*VOB(LmN{!Eg;h;*(Tubvp*h>R>=`Bfyg zqqnRxY3Ta{^>>-K;5ol&wo=SyRaOL!LMA2;IYF z_%&M>vK@80pJO{KJ1Q9gqmBfPpf2e_%!C*rJp_3=JF_YXv)G$m7XckWbMpP1X>s=Q|2v&B~^&i`%2BI$UlQ3W>R})VSyDnD~&hp_sbe)_liP*W0deT6mdVP*% zJj3o8-{YwUnsrea@27oY4;jH}j9>t=odJ51#Eb@zqk3Q#H|kv%Wbzt~MxTIrIqt`3 z(wmI9-lR&T33F7yH|TN5Kqd49W>A+!_3ucWG?T;&i?x%B-!s;ZCd4{n$E-yU&El7O z*X5#LQP2a+B0Wx8{_p$jnbjI=>7JYO{BPIXJ!iJ|@aNz<(rRPy)_c*K(Q?o_M5{?X zP1;EN9=@CTim~URO|R{_jqAZ*SKLw6z|(q^bnu-aCs+v9+lN}aK;hf)8t%p2 zUBJ^yjTq|9*g(4srmp(;fCmI{%9m)yXaQ97%{cf2*=b(nVe(hx9JHh3vToPm z*22pa(>?Sc5=@&rVEC|uJrN8acEB(@_Tt}Y=m{i8fmB)xITaAnwDb2=c2*9+xfo90 zLM4d65I_cf6;1HWT)AvvMZnGslk@{;gOYwgkai(rRo3!oQtg>mw6MW2CZKWvnMny@ z%BU#;>g>1ZfJKKYA+`BeyX3@rx5$2 zJWy$5$7%~Trx4R(@wsHof;ukBp<~$2RV1$?c>~EcB)^N~I+7beOh1gptOF#e6p0z* z=wz}~%9qN?1H0$x_t8#9f}x{tA$c3gO(fq#@_i)lAh`v^iwi~=wqgV5@7p1zc1Sj8 zV4A}_s)oHDZQQ~T%>dJ5LjSoP(x_KT*-bOel-;x~4ern!+93_AxI}AW7rT%_nB=!1 z%zRD*{HUd!Y?HMhRk{cc5%j5l4hsV?jEyEE;wx92fFUNTuKJSlzE?r$3s7hBq7^o&rJ(aGms=u6n zI1dF+kUX4!GQZyYDsWYcuJyk9X$mBZfB{w{u5L*??hSOw%GO;L3c&qO> zoCW*Uc~%M*Ph1xIz>0yc!OcahLr%>u2E0K>S)rHTrB+h}{pT=KLbAmSrqiU9O{dLp zIz2U~%~2(trg#d4|D9oH4y@PC>`b#zyJqXqQp@(4gNz4#BXsn&>EU_mNn5I&{8JB6E4=BK--J;r65A z_VdgyisI+Ojeiu5{EHyGCu|BG|1SAN-`4_=XAQxqv{?V^mTqzA`T3yux@bB4Po1w` AhyVZp literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_fetch_webpage.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be91d29573a9dcc646930fdf7b6ebcc45c869e63 GIT binary patch literal 10386 zcmd@)TWlOhax=R_?qj&Se7|`dy}3S%ONtWpT2hoHoz5p&l4Ci((7sHD+e32YeYs~S zikn%0ar{9c7l1$>x4`KD-k$_1I3fA*{pB-2es;N}$s61O2jqane|F@{B?1DZYIz5f~t1v$9B{NLXuCq<6?Hze%K!w~-2T|j=s5pJ3z zJn^LXv}fAGBkoOkrv>ZTkZw#sS=u-4uhs;n15hKRf@yJDw8|P%P3h2dC>@>-rz6vm zbn|p`x@Ed0-8$WxZkuicp2q2R;`@Y~?jU}EolD+mAb<6?s_B|U)Rd%VXfiRcQfXGz z6Z7(-I+Ii8R4I|oNK_?KI-B@VRfrmGgkhGFaVMDN_t|avAIVqddrW8uh5|TYH|(U@PLbX9NA5etF?tjJz(>ZkkM!~ zJPAJr?eO{lo&{gQUkH#$A!zuF;DRvD%X~8{#ZaTCmNm=W%uP zq*agDJa?LHtoD3dMelN2hf6+KY4>IL9#^eK-JI>a`M@R)XNHIVYg*Ui4cbyk+XGL& z5ysl&YSHEi z-ROAoy4|wS2{B9OU5qz_+NMkAdT+DiR+KZ0Eq?zf=MR47+HIS-tWa-;JNC)fY4efT zg9dAcyT}L`HM**CTlc=_yW5szKizQ~#BS~I>nL>UN9?(V@7IYZW6l?>hn)O`GkPv_ z&T10%V-Bz5vDO;*0zx2R*?7hAxZ8{cC_oDuB7_YJK% zJ!+20NK#H5=`)yEuz&YU_@i zdug-unysa+-de~SZmLBv%?7*oOju=j&B*xZhsHHKHCl? z{y?x+ZVQ>}Pf5265FE`j)upg&U@4Vbl~bp=5gBr76?9KvVgi zd0o$G=VGzN#l>NFvMKgRFnyR*las_0G77Zv>q|Km@@ccCKdtJJs8xCMF*7`PL&txuub)CQVF8h>k@y(?b)FJf;`Q5sc-1nhpxliRaD1Nk!9C zs#~n5fEDUbMfvBANlk(zW|Yr&+VWV5Bh#B*&`psIQBHs8AeV>eKtlv?Td-8eM z)!8`fY}{~87zQ_ zL^%fu*!)FT=b%{d&v9UYNX$+r>>@y_MPNn9PQvtP!NO29ScrNlzNBz0C+HYp`35%Q zyy;gnV1s044)VR8WCrZ^eH4$jNMdZ51Cbi`vJT^3%miDG;0^a<2&~6`5*B-_rtKjEqVUIyU2qv+Mr3kdH347@ra9h zOwaI$$rBA-Q<9#45S!1Y)mUmFp=hz$3@kI|^|2hy-cb{}7RzeMj4sbnB{v^~#i7M4 zR95fhQdz3fn2srS3(7mzz;JHK48-Tu3~fA0Gq4fC=ex3=^7 z*InkW<5J*1LpX2XQ7~_&SII1y!8fj4zj@>8JM!(DFdb+?@Rbqdf~jZT07%UNEdel- zAi!SH*2+n#8B>5Ip+yDDDP2Bs3WVm}OYIC>YGB)>{HZuE}-Lf{HT}Tm& zRh~;_XOxu8USP7O%&I5>q|Ih&TG3@@w6r#7hA&z#A2=wvLTQ$oZ8i%|yOObvKq%ys zIf^DX8`)ws1I`Men2x7!BJd-)1|T{>`w_=H1pO8Oc(tc<);j=}1Bj)Eu+R$!F%;uB zI&6{fJ~Txf9aHawn;2yT)BmKK7Mw9y@jQSCz^--y!u3Gdk;h zbYus00zOfeS8Me8E`d9`JgcEKq_4Z^RTS|0@3Ver7MgUyPx~iupk?kiLTF{SNFH`> zclAGPD8;^4I(F^1UcTWi{)Yw_m!6UxGFJA)oc4;m0wC&SF z8*P(+aeeuX74OPB+byjn;n2#7)t39Gw}eA*Z!(e*4lH9SqbfCYLpZeB;_$4_6%)mG zS8qB64EanrwB6ZT5(ZbsR>{hREnyJuO-3@pfn_XZRHcS)2!pG{;VE8nP=cz8>|d$U@=sES1~d>#22>W09A5*P2sOTy9Oc>pW%;`y?8 zbW6Z{lhKT*Ai>aeWM-%v0tZMdMFz{FR1%J?T?9}PrL~J?@yM2d_hz+#Q5B0~_&V}4 z)D3~f5-F9%1EAg7+W<=9fwi~YI$8yel0Z3HuD$Kn&tdrr{H<+uym+$WmzFPXi%l!j z_a{oiQ1MLwD-*>xEro}+grQAF0|s{m35KpCGeZHcedM4H0+OnNi(QuTBWv%Mgb~#B z{jxZ+CE&eTEnrl|q8Pp|06#@O2B0J!dpK4Whqna0H>(AVs#p}m z*O8y0ZU`)3l9rCu_x>id#;?8h)6l~Y%hI`W^SR|Wx5f4?v9B!ll?3RquY$e@$iS9R zyElb_m9wm%QbRX{fs)wAiq{dZHE;^I#mwxcU1LaWoOR=YTbj{ch zPcC2nC9FsNug;c)o+1xmc~Z?jXSN4Bp*b{X_Ov2n_do$j*+|A9$=|bqpx2YuB+lh8V1zd*N7JbRDbX7u~u@ zlXa{fs&lL!-gT`0v)W<&9gJdjT_j>1?KbZ^R&UWmHshVJjn$rZn#Vd;$GFM5>K(w> z4$EPqUE{2`C3hdIx9q{|Y_v(MbF5yIUT{C zaJ1P`J;Dj>I>Jf0q}HDSJle%HsrzWRHkx|2svYfqJzp&$)#V)Rc9S00(e8f7zQ@rn zBpP~eP;?s8JEIVa|H`2Fhm3riooL8p?f32~s{({WA{xTTAk9VFMQRP_#Zp zds5tR&;tlC0!h$^T&cz*nx2r6JgetkH!!iK`@HoBm#{4qaiEA-xbxmjQ@WT=C3ir?6M=wt}UT!laYYA z?=qIMf=UhD5V|nJta7eG?6;%QXF}IYj{P=-!!!qmiguYk8Fs%3SvG^RJiDMTP*s*G zzNz6)Y1Pe~nD1#a2v~d7m zczK@x1^3pkxP$-Bai4I18Tb{~{X4b*EdFQv5x)1?Ek4X2vE=?A DH3m8s literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_manifests.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_manifests.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5ee6dd2ad880125ed7e811531d23e4977a4cc09 GIT binary patch literal 7958 zcmeGhTWlN0@s>Q2$Cs$L9VLz#`AJunELo8qN3xvvGJ3D*3GqW@IC%@lA!S%;~e<98@6!jOl;A=0jAJ;+?-hV}9PtyI!Zq z>UidLYAnFJp{(aUP&Q24L%z(PPjCrQo(_hiLURNh&av#&!vT+5DQ9z9OS}snu`U)=`&boCW|T3O-c!vBTS|> z>M+N1<~SZ%6cRdhn2Qr~!tmAVP~lYS)nGyh3z8^v!4!ujfy2SABHt0@QmL3INUWR; zb$UJ}UonJKFPBIpWlk283F$Dj>7$$+h1x%Q`1pb07ecLk)ahSSRMxK8$D!}ai?_X3>6@h$dYsH*#bxlx zIur+PP#oi~6$H@a?FJW&F7H<8DTJF4rO-V67`z>Kf4zsv)u}ion>Gs@^gYGHD3b`F zAHqo9ulM5}#z?QgY_hH_lXWYu2g>bI+z)*3R@``v;?`y5os$%apBo~v2;R%}8moIlklX0OdmJV}LS0uq=;j9QU zOn0Sla$E?@Qdd$E6Eb@ibE#+-sF>8Hfvhlw30QgMQ3VSZDcj=WF;}4 ziV5sA7mwjhFnB9G(qEyNp+2c^oEa|HHO=fV2b$(C&R)C~2*iT+&YnA#x0mZ0=EPa?(gciJ1#d&2 zo%wTD`sT-qOZWKmX_BT6qa$;8n|k$(1txS>K`EF zISK5890d0g+_W_{y(1*4cBoWW95hx;oDKOgStQU;S-`xARi`Miq^1*GTu^O>w1dodY0DpmVCp-b-nZGCi;;1fccFlcWmy|rBj7xfbnG$|J8zJJ_xQ2r65nZm0r z!#?^~bx_==rNl~VTG|27RqA22G*6%Y*F))NivkC^4Cq-F0c6u4n}TZTY}Rh06uY54 z#M4U)vgwt_iKHto-VWMdLv!!bT3XXu$G6bg>X@~378yDOI$NczrL!g_=&WNETJvnO z3Z1FRyz?!)W(6D;D^QiEmK88hCB_Vh6_`R{6n6yie$p#qdym%N@O6)Db)ClSR@FC?5>~g)B6Y2u z@U2&O6@D&5N@2{R9@A}M2VY4N%2(9 zCf)W~vKgNb+2$0Kow*rsy$H08~&AW^@Wu5Ti?;?ADMc zt1`xDAv*^5M3Eq*hZMb;fIgx={+?h^bpQyGIMQ-t`)c~cRQ`o-P3qv#p|c7w9i-@yY!$`4-31y?`z%1beH4@re?~C0D9=SSqmKn z4ic-z9%iGQ1oEFtix>jQh$zH(3BL^8cflk56H+7)J^I_0{q046`?7yy4wVD-b5pZZ zmor*Es5yVJ*t9vvl$)CKQ&;+Pj&fiPNh}qZf!T|tKoII;Ah^s7++qd_8+#Xyy?^S) zsf8B`?E`?b-X&%rH>D#i;1Xaj>exkKpmDPH0%SSRJg3Ym1*U&ix%3*TUR!4RZ!!JX zjw}qlzwgGrh24eren46864Rec>j(?D1XxAKE&@pSSZ~1H@jW>|wYDhFFIgmdI#cVo%^3nwW=DD{cZ$<#fU6_v)0~?o_190Ec+j_$+i-0|+ zw@OuT2|$pPbXz3cO)z&s2hCC&PH#c~c}@qR|9qrMv-P=;8vfAw+t!bm!e+vA#SPSC zeYL3!=3gl=gD{m>N`ZEOih=fJX7Cm>SlHaRaPIxd8}vNM=EB7H|o$ zFbjfRWPq^=7Br@MQ!jzk6k#ZTy1)#92&YSd4FDAb83o`PgXf%0u2O4PDr`a)xgNo zr$*vJF;fPs(3yekDs;8NoLb{n(+aN2q0b4w5!{Nia-4TngBr_-HIko-dGs*8bL9w# z;27_K#|9519N+-qIgCl>7L)%GxQ!GCp#W!X6BFQ^M~I{C27d~Ug@E8tbtelz#mClg zX>hjaj3v(tSU)s`XyReJHHWKtExaGb$OI}WO>GG9+4u!AvRiXDy4QLU5UNAtr2DHQ zfLP(1Ld@5^$PRpz&>bhmNy5UCUhJ_tpl9$)@LFP^FdT;NE7ohiqWXi5t-5&40peGo z3vqSQJ5Yd=VA}4uTj!rBx;I@96y3X)ndghl^Cf2Fo*lXOp_!piybT4qBY!NfTww{o zF1Ktewrm3s%2G=Q0J**b)ZUI|n%o~(5PGXbS~|F>_10(rEVOJh(a>FoSCqh8(`7h8 zkOfxl@W%yy0SR&%Csdjj!*Zxmb+Ig;47052VcAF;UI{FVLD?ylF%rSr89{xF)lOcE0+181rT`M~-jZwH*d=6eW4+^_a| z-=#FE^#x)2<4Hap6ZYabyeEG#q&_GMb&=!)1L2)Y2GSWgBEzg{D=iEE@o_p_{*VVsNRfPn&qW|-&#NSm5!oMM5UmlK#4}Jv5&jlik z3PdEHq?q!IdPKy%N$;q{?iDFt9LiGu(ExiQC4;HZXo%qzNjX(HTA8XEtxAPQ!>Q`g z>Qv2WO{#XZHdQxTCkkpE@y*JDK>P?t>q+1}VYGn+0XC8lz$PLCY@YE(D`$JptD3Gw zh?5LxJ6PdI& z0<`p}F?}-XHv?laO;zIhb+am&&~!yj>j^!fYG#;`R88_aW-t}ItSadYQO!y^od!n! zdb3K?r^ghkUYW+OnZffX&z(P~48Q)C85mDo*Qcqf&DIby0W8{dQrEgC^;9x0aq%*U zli(Eu@xh&16tyne&kQDV;!Oy*rCSxe6ogu#TazOQm#k8|2HuJ9w*QvZ1__YhP49$A zLPRE&H$9~4rY9@uew*L$8qzJRwQPkRbjk%oG(0%r#7n}YIw9V!xnw9p|`m=#-kkn^GhTjNX?t&88khOo-t2htzI&5R5J83ZFagj7)opH}C zOFpDY2bu8eQk{*J?zb%Ts-QKx+HWXv*qGtSgt|GK?B}I>i8C$2`%Aqp#~X4a-F(wC z>w~dYx@xqPTD39KVg!F7{H@2S*E$Q2$Tp)gA+q{a`gWVw)zWizS?O+6*=^YOcSRyx z7qfCUL#FOzG|C~ z?6lTs4SAk)7&X?4t9|ShS8KOq->nrFhE*4W5eG=8HU3?j#{aIXkN*e8@9IsNJ!&|s zknCo=^P7>?p0APBv$9sYp5^iD?e>=(w0!jTUPlYp8Fj;?+QQze} zX|z`WNPbtg$p{%uOoKKX&Dwu$v5FgvX3(Gqu}*0flS7ZaI$K@sm$4u9PP>>6j8UK@@@y9Hm9EeCSsro=4QTJjP( zYSda*MayGb6)kp4_T927LSR)iXPe02dxFt8EZDYwR@V2}b79DK9k4g5wZ9=lp^cmd5sMvhJP@i^Z10tCkc_Jd08w!=MpuNt|SoV4jX;YizX= zxUF&O|F@qr1D2Txf7$wLLiF^A>CyX4pPos@bk&q}^}4<+nSKlvDG6dq=~zlN1NzLA z3L&6zGmujC7>VhzM_~eyD*T(sbjLHPM-|ih_`bu-q8U)r@eE0%C+>X=n=|S)1My5+ zk0sI?t$_m5r$HoZS~G>jPa)2;>|v!8Lj+NeSpnUMPnv$Bj>jP0WL6{-sf2zH14YYz z)0dgj6f$P|V^dQKNl>$jsuK|SqB9E3WORx#7Bk5D&ic(!?7ZoliqTj~Gef0-8}!wy zrms?rAT5iJJVQgv;%wcnuBn;cs1_L-iXhq_)l3hKFMCWclq2Y$^=mp&DII!L6FL^t zG?nU%^#HKK^8-=wkuRY|AYvC4XPfNyc!|@dH#4o9J~cg&NULV3Bul#seWfl8L^b~U zK5TyAQP}m)gZ!NbO}`bOyN9u?*^M1Trm!6wRAPjv#H@CLgI0m}0!>CmimHS*q8z=` zH2J7=QrD-nmwI|st@~Qya$-s)iCA}rPV^wvbDk|55TqxOcK1Ayb_VL#V{}5*VF+{( z+wKRjVF)0$*&+Ki*REab9)nP4IuVa0_hr&a5DzY|o)|0y7(zzt;Y)Sj6wOQ}R1zZ) z?yNa?^7vaPUQ^yWdG5m5^U9m2U?BDI9*}>pHHo1=w~z>*SD4~1ECD%3Ls@&anK+Yb z+LS;%T2!L@vCB@s>B~UQy5)bE%l~lP33~zuP5}&4eDES5ZwMlUMGfKYN^pim1A<)m z*&!mDhCdU3E{ekYVM&R0rSH}?h_f%c)-|koK4XAvnz8+C z#!PQ4nT&eb1mU*WPluqL4by-PvuPLs#F>UR_cUB`PaAOIMMT?nARe3~DjA|RQSC5M zUN%t@+avTO@Y7QOFe>kbWghj=^FW%SYRwk?HnLm<@N^`K+ehmGz$AKK5Nj@q9}FzS z*6SMFK=DgW zF%oZvUYVLgZ~LOcQ}x_^uT&Sh9}?QympTgd-CtDsD}8gr4{HRuB^Oxi zEQa>Xo%lRdm)pIx=cBgu?d?m4KfJoW{rSb?xfdT+c;v3R6AuGILu9dXDSk&=sXzM8 z*|}Hez4PbRYwPk-$Nc_<+8c+~qz?G4a*`7cEMO_8iY0VK>R70Ccorsd@!W-lH=P2G zd?IzMH*L#H?eqN$Wd7wfsU3c+oaBT93s}mjVhLT5+82n!lRM#{9Qj0QUzfx4*KZ8v zrLNpr0P{n+vjw?pO~T(Qr#VqXf}?klnWJt99N5~-jbk8}+_z9T$8z5)$UFF8P&zDX%R3jLySvvU{HDr>&5l;Q(+2UvMgoE!e7T)hw~$lG(L3vy>(>Rk!|$jhBe zfr8w-CgE?@D&SO+Q5?UE{2X;dU|F}DJyno9U~?{B0g#tF7O%LsC@XOGC{WHOUA*Gn zo({{p+?1F4bDg=_!nVCjhYH*Jft){@hr0}-!QZM?z^Niq9K9>`10ToTa7F6>&6nGS z>O&eT!hFT{c)e6Ul|Kj;Y`IiEkw^t3(|xXVzMllPNazP|J3%=j-}Fp~3Gs=O`24+< zWs~?-EQud3lf$V83m84s5rxIK>$|VR( ziR1>BAZ&a}>H8+0zK2ArWCfWjo-oK%!;(48-*AAX#YrT$LdxaoB$9FLRw8+(DY|`j zzz&pjGjkgb{fw!R39v+IWQry-Gy$G#M9Yj(6#@kiYe5^go4I_Z48~?$oy;UL$bcae z`ZfZzuPB};p_qK582zTr2ylwQyT*eTj6Le#WXQ6(2*t271%HOS_jE5}FCaj-lI}x* zeg(H}`w$yIZ~(vveF@Q{2#z5@KX|#CxdU_z3*rbiIslYlnTlW>!32Uy1PKI}5L`x( z1h8DUxzA~QltF-IH=RP@nBDXWk`#f4Ku0i*;3|S^2(BZT0ia=6=&Lp_H+J`JHg=~q z@Nw;Y7ZDpxA!u3$NJxnh0)N)9VN;&wyt7Jk;zMxu3a>+@X(7t+#b9PUfHM zK`XZptX$vMTDi@(mD{`~HLpsH1kC*_U@0#sme3Wc8LeE4a{=tn4V6BTn$cpsc4H7N z#=~ea4(1Mnvg4*7T8uz*0xd?cknj;&j8-Wx;1Epe2OvJ7yxut|mDf9PfDE4Cg$Kxp zZwrm%zg;>45&RuB4jefHP4`EQ^IMZ_%%j?6)FyowS#}~&5Og4T63s|o!5U?>BYhPg`~bmw0Dh|u z+Ot^)v80Eh!%5vZ3%OdceL4>DR!)K8Zh`3k>KcE379$RhQRT(;}R>u~I&#ECm~(^cYxV+i3bcuO8lwPoAOqy~<6gATWjHDgMPU9OhO>9$I{fA-xz&H;zQ z>#|iwl@WksanK0D!L(BU%So~xBq+kcaHvs|q^nN3F*}E%iQAX_{Uq+#LR(7Q;RJ~{ zlBRD+l2zFte6O3W?=_@jxYxp6n+THI5bOXD^=2N_IKe=`%eI+IDFlm@sH@LbmlYWX&kvjKRQ?Fem(#EnPTXT zxf2_Fj{P4tJ@kt5aS;OU_3h;X>T@I9)A;eok4JvUaK!k(cYsFh2iE<)k%e0O8Q$L+ zVS%l-`J=gxyxg`X;cr!9G$)EkaMXc$sVxUS1@gjU8+9H7vLdyyPja-y^eYO<#1+L1 zDa!aX{tp&Kq4-^d|1D!>o0vapwkkY7)y>l-kUHw7>NT2xV-P{72=nT!j4cGykJ+GP zVvM3Y!VWf!lU<=2u|^!!=?@CAbVl|pq(Zf_yn331;- f_{g06rLRjI`10FgmDu@Zq*C1dU`IgQ!Cv*h^=)f? literal 0 HcmV?d00001 diff --git a/tests/_stubs/__pycache__/requests.cpython-312.pyc b/tests/_stubs/__pycache__/requests.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..998598e4e5622555e1b01d95eeef2b3230cb75bb GIT binary patch literal 4371 zcmbtXT}&Lw5$@iZS@w@V0|pMC8P*287YzQ<`3L))U>kG3khKqRB$J%hlbrz;@6PP? z%wm3)h(uNdI!A`+B-V-|b5F75lX8!V@(|ryq`YAE5^p9voN`b15^=^SN0uU`s)t!N zX#JF4O?Az5b#-<1SJkzDsjLhTXutcMmqKbEkDn0QyGe;%0O7-3qN4;vbS^<59>bpxueX1X*pUVe8KBxtr z2s<-^{A&olRu;8{W`syhrV*45TbZB8sf3p7=FcIDaFDWj`h8PxqvK( zRc%axrNh1QT-eg4kqMuluu{nj-QC^idpH;FlEbh?D`UdwiDSpQU}9+tPiTq??%_%< z8M>wE*3prfw1&!1DOYn&j!!7e)U0O8Ub2*gj>^Jn zCVm^AaicqKq|gl4L@;izRa~76+W?YXnb( z+CfFv4cK15F5cl)fQ3ZIH@lz@EErWq2EqO+mWgPm=9-LcvgOoQNvYv4ey#+boO z*s6%)cF+M)EN-ZpU9kfiTgb4a?E|bDYC<1}Nyx!Co*Dot;OO(NG7DuWt5C7%8nMnTacA&zK%#+IYgW*i28F8IxMvGJ6a&p46 zDQPIG$({R{vKkOKQBG#HKv@Hj*~;R6aYh(L@cRPUtggF1yEL0S{4nw$lCM6z?mNut zVB~)+M@zWnL%^&OunqdXO{KSg49hiHMaZfE^m3BFCcp8xJNY>UXbRE9GFUt-y*n~; zU5$mVo`qQ@sDO&LSf% zEZ|TKs}yzuzP|;J`92T`T3_IPU@36FYN=|yu`BO8y3qIcVCdtEs{<yfh(Uy5?u!1Yb+j))N@OHt)%LCU<=Ad~v8s$BTZwOg>l!NdZ!VFhy8<7Pw{* zD~ApLNODJSp9a*^dtF(f?An5hH@ID~G8N~=tZ34Em*G~WQ*gfbu38nYwXe}3 zsl~h0DkAbVvR<{q#gM!5Yb1Tear&wM38q$Paj z_8pM7rA#^v)d*sOwE~HF+1tn?dLh6dPHYK^4K`zYbxl)~noZS`!h$DD$hAx}jpUTZ zF2FR5EAwAKAf|#1ADv&gT&QhWZCYu{b*`QGTW!apR0!5BXPyQRz%?=bN$az=Q%~DY z<=f8u{>q;Q|1kK+=tkRng;0H=cK>SgN^`C^U)!-!+w~~C=(U5j%VTT8uTqCK1!2mZB4zNxEF-}u5W)mJ?y zQf1XvknHQ++E4tI_kByg<>tJvWy9B&V+(zo!MZ|a)%_bwH4)!Ld0;uJj%Lz}8AK$ycn6LzQb=`7*?qojLx-Pc>MvayAYs7mj3wP5j_s~}C3h(wl-EVsSsMn6ZKuAX+X$-Ne+5wMd+eGa z=f7qDbX-g0KMJV8SQzBkAtZ;9v?Ia#$=Ur?u@>E3 zT!Vkq@QCo>pAL^5&|C}5C|1E)1 z`i-ZWir>nF9{P#||3UmOy*2dQHw4IbheQu-*VIw(^Dau|=RQi?UW6$1FVDOn(7p5t LwDUOu!ngik|8lO} literal 0 HcmV?d00001 diff --git a/tests/_stubs/requests.py b/tests/_stubs/requests.py new file mode 100644 index 0000000..a1d4d25 --- /dev/null +++ b/tests/_stubs/requests.py @@ -0,0 +1,89 @@ +"""Stub minimo de `requests` para tests de enrichers. + +Lee el plan de respuesta de `_STUB_REQUESTS_PLAN` (env var con path a un +JSON). Soporta multiples respuestas indexadas por metodo o por sufijo de +URL — la primera coincidencia gana. + +Formato del plan: +{ + "default": {"text": "...", "status": 200, + "headers": {"Content-Type": "text/html; charset=utf-8"}}, + "match": [ + {"contains": "duckduckgo.com", "text": "...", "status": 200}, + {"method": "GET", "contains": "example.com", "text": "..."} + ] +} +""" +from __future__ import annotations + +import json +import os + + +class Response: + def __init__(self, text: str = "", status_code: int = 200, + headers: dict | None = None, url: str = "", + encoding: str = "utf-8") -> None: + self.text = text + self.status_code = status_code + self.headers = headers or {"Content-Type": "text/html; charset=utf-8"} + self.url = url + self.encoding = encoding + self.content = text.encode(encoding, errors="replace") + + def json(self): + return json.loads(self.text) + + def raise_for_status(self): + if self.status_code >= 400: + raise RuntimeError(f"HTTP {self.status_code}") + + +def _load_plan() -> dict: + p = os.environ.get("_STUB_REQUESTS_PLAN") + if not p or not os.path.exists(p): + return {} + with open(p, "r", encoding="utf-8") as f: + return json.load(f) + + +def _resolve(method: str, url: str) -> Response: + plan = _load_plan() + for entry in plan.get("match", []): + if "method" in entry and entry["method"].upper() != method.upper(): + continue + needle = entry.get("contains") or "" + if needle and needle in url: + return Response( + text=entry.get("text", ""), + status_code=int(entry.get("status", 200)), + headers=entry.get("headers"), + url=url, + ) + d = plan.get("default") or {} + return Response( + text=d.get("text", ""), + status_code=int(d.get("status", 200)), + headers=d.get("headers"), + url=url, + ) + + +def get(url, *args, **kwargs): + return _resolve("GET", url) + + +def post(url, *args, **kwargs): + return _resolve("POST", url) + + +# Compatibilidad con `requests.exceptions.RequestException` si algun +# enricher lo importa en el futuro. +class RequestException(Exception): + pass + + +class exceptions: # noqa: N801 + RequestException = RequestException + Timeout = RequestException + ConnectionError = RequestException diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f7e8850 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,237 @@ +"""Fixtures comunes para tests de enrichers de graph_explorer. + +Cada test recibe: + - `ops_db`: path a una operations.db con schema minimo en tmp dir + - `app_dir`: tmp dir que actua como app_dir (cache_dir = /cache) + - `registry_root`: ruta absoluta del registry (para imports en run.py) + - `run_enricher(enricher, ctx_overrides)`: helper que invoca run.py via + subprocess con el mismo wire protocol que jobs.cpp. + +El schema se replica de `fn_operations/project_template/operations.db` — +solo las columnas que usan los enrichers. Si fn_operations cambia el +schema, este conftest se actualiza. +""" +from __future__ import annotations + +import json +import os +import sqlite3 +import subprocess +import sys +from pathlib import Path + +import pytest + + +REGISTRY_ROOT = Path(__file__).resolve().parents[5] +APP_DIR_SRC = Path(__file__).resolve().parents[1] # graph_explorer/ +ENRICHERS_DIR = APP_DIR_SRC / "enrichers" +TESTS_DIR = Path(__file__).resolve().parent +STUBS_DIR = TESTS_DIR / "_stubs" +PYTHON_BIN = REGISTRY_ROOT / "python" / ".venv" / "bin" / "python3" + + +def stub_requests(tmp_path: Path, plan: dict) -> dict: + """Escribe el plan de respuestas y devuelve el env que activa el stub. + + El stub vive en tests/_stubs/requests.py y se activa via PYTHONPATH. + Plan acepta `default` y/o `match` (lista de {contains, status, text}). + """ + plan_file = tmp_path / "_stub_plan.json" + plan_file.write_text(json.dumps(plan), encoding="utf-8") + return { + "PYTHONPATH": str(STUBS_DIR) + os.pathsep + os.environ.get("PYTHONPATH", ""), + "_STUB_REQUESTS_PLAN": str(plan_file), + } + + +SCHEMA_SQL = """ +CREATE TABLE entities ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + type_ref TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + description TEXT NOT NULL DEFAULT '', + domain TEXT NOT NULL DEFAULT '', + tags TEXT NOT NULL DEFAULT '[]', + source TEXT NOT NULL, + metadata TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); +CREATE TABLE relations ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + from_entity TEXT NOT NULL DEFAULT '', + to_entity TEXT NOT NULL, + via TEXT NOT NULL DEFAULT '', + description TEXT NOT NULL DEFAULT '', + purity TEXT NOT NULL DEFAULT '', + direction TEXT NOT NULL DEFAULT 'unidirectional', + weight REAL, + status TEXT NOT NULL DEFAULT 'designed', + started_at TEXT, + ended_at TEXT, + "order" INTEGER, + tags TEXT NOT NULL DEFAULT '[]', + notes TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL +); +""" + + +@pytest.fixture +def ops_db(tmp_path): + """operations.db vacia con schema minimo, lista para insertar nodos.""" + db = tmp_path / "operations.db" + conn = sqlite3.connect(db) + conn.executescript(SCHEMA_SQL) + conn.commit() + conn.close() + return db + + +@pytest.fixture +def app_dir(tmp_path): + """Directorio raiz de una 'app' para los enrichers (cache va dentro).""" + d = tmp_path / "app" + d.mkdir() + (d / "cache").mkdir() + return d + + +@pytest.fixture +def registry_root(): + return REGISTRY_ROOT + + +def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str, + metadata: dict | None = None, source: str = "test") -> None: + """Inserta un nodo de tipo arbitrario en operations.db.""" + conn = sqlite3.connect(ops_db) + conn.execute( + "INSERT INTO entities (id, name, type_ref, source, metadata, " + " created_at, updated_at) VALUES (?, ?, ?, ?, ?, " + " '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')", + (node_id, name, type_ref, source, + json.dumps(metadata or {}, ensure_ascii=False)), + ) + conn.commit() + conn.close() + + +def get_entity(ops_db: Path, entity_id: str) -> dict | None: + conn = sqlite3.connect(ops_db) + try: + cur = conn.execute( + "SELECT id, name, type_ref, source, metadata " + "FROM entities WHERE id=?", (entity_id,)) + row = cur.fetchone() + finally: + conn.close() + if not row: + return None + md = {} + try: + md = json.loads(row[4]) if row[4] else {} + except Exception: + pass + return {"id": row[0], "name": row[1], "type_ref": row[2], + "source": row[3], "metadata": md} + + +def list_entities(ops_db: Path, type_ref: str | None = None) -> list[dict]: + conn = sqlite3.connect(ops_db) + try: + if type_ref: + cur = conn.execute( + "SELECT id, name, type_ref, source, metadata " + "FROM entities WHERE type_ref=? ORDER BY id", (type_ref,)) + else: + cur = conn.execute( + "SELECT id, name, type_ref, source, metadata " + "FROM entities ORDER BY id") + rows = cur.fetchall() + finally: + conn.close() + out = [] + for r in rows: + try: + md = json.loads(r[4]) if r[4] else {} + except Exception: + md = {} + out.append({"id": r[0], "name": r[1], "type_ref": r[2], + "source": r[3], "metadata": md}) + return out + + +def list_relations(ops_db: Path, name: str | None = None) -> list[dict]: + conn = sqlite3.connect(ops_db) + try: + if name: + cur = conn.execute( + "SELECT id, name, from_entity, to_entity FROM relations " + "WHERE name=? ORDER BY id", (name,)) + else: + cur = conn.execute( + "SELECT id, name, from_entity, to_entity FROM relations " + "ORDER BY id") + rows = cur.fetchall() + finally: + conn.close() + return [{"id": r[0], "name": r[1], "from_entity": r[2], "to_entity": r[3]} + for r in rows] + + +def run_enricher(enricher_id: str, ctx: dict, *, env: dict | None = None, + timeout: int = 30) -> tuple[int, dict | None, str]: + """Lanza enrichers//run.py con el wire protocol estandar. + + Returns: (exit_code, stdout_json_or_None, stderr_text) + """ + run_py = ENRICHERS_DIR / enricher_id / "run.py" + assert run_py.exists(), f"no existe {run_py}" + + full_env = os.environ.copy() + if env: + full_env.update(env) + + proc = subprocess.run( + [str(PYTHON_BIN), str(run_py)], + input=json.dumps(ctx), + capture_output=True, + text=True, + timeout=timeout, + env=full_env, + ) + parsed: dict | None = None + if proc.stdout.strip(): + # Ultima linea no vacia es el JSON resumen. + for line in reversed(proc.stdout.strip().splitlines()): + line = line.strip() + if not line: + continue + try: + parsed = json.loads(line) + except Exception: + pass + break + return proc.returncode, parsed, proc.stderr + + +def base_ctx(*, ops_db, app_dir, registry_root, node_id, node_name, + node_type, metadata=None, params=None) -> dict: + """Construye el ctx tipico que jobs.cpp pasa por stdin.""" + return { + "node_id": node_id, + "node_name": node_name, + "node_type": node_type, + "metadata": metadata or {}, + "ops_db_path": str(ops_db), + "app_dir": str(app_dir), + "cache_dir": str(Path(app_dir) / "cache"), + "registry_root": str(registry_root), + "params": params or {}, + } diff --git a/tests/fixtures/ddg_results.html b/tests/fixtures/ddg_results.html new file mode 100644 index 0000000..434863c --- /dev/null +++ b/tests/fixtures/ddg_results.html @@ -0,0 +1,22 @@ + +tomate at DuckDuckGo + + + diff --git a/tests/test_extract_domain.py b/tests/test_extract_domain.py new file mode 100644 index 0000000..523d70a --- /dev/null +++ b/tests/test_extract_domain.py @@ -0,0 +1,60 @@ +"""Tests del enricher extract_domain. + +Pure regex/parsing — sin red. Verifica: + - Url con metadata.url crea Domain + BELONGS_TO + - Email crea Domain (desde la parte derecha del @) + - Si el Domain ya existe se reusa, no se duplica +""" +from __future__ import annotations + +from conftest import ( + base_ctx, get_entity, list_entities, list_relations, + make_node, run_enricher, +) + + +def test_url_creates_domain_and_relation(ops_db, app_dir, registry_root): + make_node(ops_db, node_id="u1", name="ex", + type_ref="Url", metadata={"url": "https://www.example.com/path"}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="u1", node_name="ex", node_type="Url", + metadata={"url": "https://www.example.com/path"}) + + rc, out, err = run_enricher("extract_domain", ctx) + assert rc == 0, err + assert out and out.get("entities_added", 0) >= 1, out + + domains = list_entities(ops_db, type_ref="Domain") + assert any(d["name"] == "www.example.com" for d in domains), domains + + rels = list_relations(ops_db, name="BELONGS_TO") + assert len(rels) == 1 + assert rels[0]["from_entity"] == "u1" + + +def test_email_creates_domain(ops_db, app_dir, registry_root): + make_node(ops_db, node_id="e1", name="user@aurgi.com", + type_ref="Email", metadata={"address": "user@aurgi.com"}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="e1", node_name="user@aurgi.com", node_type="Email") + rc, out, err = run_enricher("extract_domain", ctx) + assert rc == 0, err + domains = list_entities(ops_db, type_ref="Domain") + assert any(d["name"] == "aurgi.com" for d in domains), domains + + +def test_existing_domain_is_reused(ops_db, app_dir, registry_root): + # Pre-crear un Domain con el mismo nombre. + make_node(ops_db, node_id="d1", name="example.com", type_ref="Domain", + metadata={}) + make_node(ops_db, node_id="u1", name="ex", type_ref="Url", + metadata={"url": "https://example.com/x"}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="u1", node_name="ex", node_type="Url", + metadata={"url": "https://example.com/x"}) + rc, out, err = run_enricher("extract_domain", ctx) + assert rc == 0, err + + domains = list_entities(ops_db, type_ref="Domain") + names = [d["name"] for d in domains] + assert names.count("example.com") == 1, domains diff --git a/tests/test_extract_links.py b/tests/test_extract_links.py new file mode 100644 index 0000000..14303d0 --- /dev/null +++ b/tests/test_extract_links.py @@ -0,0 +1,63 @@ +"""Tests del enricher extract_links — sin red, lee markdown del cache.""" +from __future__ import annotations + +from pathlib import Path + +from conftest import ( + base_ctx, list_entities, list_relations, make_node, run_enricher, +) + + +SAMPLE_MD = """# Pagina demo + +Aqui hay [un enlace](https://example.com/articulo) interesante y +otro [duplicado](https://example.com/articulo) que no debe contar +dos veces. + +Tambien una URL pelada: https://otra.example/path?q=1 +y https://tercera.example/ + +Y un email que NO debe extraer como Url: contact@no.example +""" + + +def test_extract_links_creates_url_nodes(ops_db, app_dir, registry_root): + # 1) Crear el cache con el markdown. + md_dir = Path(app_dir) / "cache" / "ab" + md_dir.mkdir(parents=True, exist_ok=True) + md_path = md_dir / "abc.md" + md_path.write_text(SAMPLE_MD, encoding="utf-8") + rel = md_path.relative_to(app_dir) + + # 2) Crear Webpage con metadata.markdown_path apuntando al cache. + make_node(ops_db, node_id="w1", name="demo", + type_ref="Webpage", metadata={"markdown_path": str(rel)}) + + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="w1", node_name="demo", node_type="Webpage", + metadata={"markdown_path": str(rel)}) + + rc, out, err = run_enricher("extract_links", ctx) + assert rc == 0, err + assert out is not None, err + assert out["entities_added"] >= 3, out + + urls = [e["name"] for e in list_entities(ops_db, type_ref="Url")] + assert "https://example.com/articulo" in urls + assert "https://otra.example/path?q=1" in urls + + rels = list_relations(ops_db, name="LINKS_TO") + assert len(rels) >= 3 + assert all(r["from_entity"] == "w1" for r in rels) + + +def test_extract_links_without_markdown_path_errors(ops_db, app_dir, + registry_root): + make_node(ops_db, node_id="w1", name="demo", + type_ref="Webpage", metadata={}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="w1", node_name="demo", node_type="Webpage") + rc, out, err = run_enricher("extract_links", ctx) + assert rc != 0, "deberia fallar sin markdown_path" + assert out is not None + assert "missing markdown_path" in (out.get("error") or "") diff --git a/tests/test_extract_text_entities.py b/tests/test_extract_text_entities.py new file mode 100644 index 0000000..b8d1301 --- /dev/null +++ b/tests/test_extract_text_entities.py @@ -0,0 +1,59 @@ +"""Tests del enricher extract_text_entities — regex IoCs sobre markdown.""" +from __future__ import annotations + +from pathlib import Path + +from conftest import ( + base_ctx, list_entities, list_relations, make_node, run_enricher, +) + + +# Texto con varios IoCs detectables por extract_iocs (regex puro). +SAMPLE_MD = """# Reporte + +Indicators: + - Email: bad@evil.example y otra@victim.example + - IP: 192.0.2.55 + - CVE: CVE-2024-12345 + - Hash: 44d88612fea8a8f36de82e1278abb02f +""" + + +def test_extract_iocs_creates_typed_entities(ops_db, app_dir, registry_root): + md_dir = Path(app_dir) / "cache" / "cd" + md_dir.mkdir(parents=True, exist_ok=True) + md_path = md_dir / "ddd.md" + md_path.write_text(SAMPLE_MD, encoding="utf-8") + rel = md_path.relative_to(app_dir) + + make_node(ops_db, node_id="w1", name="report", + type_ref="Webpage", metadata={"markdown_path": str(rel)}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="w1", node_name="report", node_type="Webpage", + metadata={"markdown_path": str(rel)}) + + rc, out, err = run_enricher("extract_text_entities", ctx) + assert rc == 0, err + assert out is not None + assert out["entities_added"] >= 3, out + + types = {e["type_ref"] for e in list_entities(ops_db) + if e["type_ref"] != "Webpage"} + # No exigimos todos los tipos — depende de que extract_iocs cubra cada + # patron — pero al menos Email y CVE deberian estar. + assert "Email" in types, types + assert "CVE" in types, types + + rels = list_relations(ops_db, name="EXTRACTED_FROM") + assert len(rels) >= 3 + assert all(r["to_entity"] == "w1" for r in rels) + + +def test_extract_iocs_without_markdown_errors(ops_db, app_dir, registry_root): + make_node(ops_db, node_id="w1", name="empty", + type_ref="Webpage", metadata={}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="w1", node_name="empty", node_type="Webpage") + rc, out, err = run_enricher("extract_text_entities", ctx) + assert rc != 0 + assert out and "missing markdown_path" in (out.get("error") or "") diff --git a/tests/test_fetch_webpage.py b/tests/test_fetch_webpage.py new file mode 100644 index 0000000..29564ff --- /dev/null +++ b/tests/test_fetch_webpage.py @@ -0,0 +1,77 @@ +"""Tests del enricher fetch_webpage con red mockeada via stub de requests.""" +from __future__ import annotations + +import os +from pathlib import Path + +from conftest import ( + base_ctx, get_entity, list_entities, list_relations, + make_node, run_enricher, stub_requests, +) + + +SAMPLE_HTML = """ +Acme Demo + +

Hola

+

Esta es la pagina de prueba con un enlace.

+

Email de contacto: ops@acme.example

+ +""" + + +def test_fetch_webpage_creates_domain_and_caches(ops_db, app_dir, registry_root, + tmp_path): + make_node(ops_db, node_id="u1", name="acme", + type_ref="Url", metadata={"url": "https://www.acme.example/"}) + plan = { + "default": {"text": SAMPLE_HTML, "status": 200, + "headers": {"Content-Type": "text/html; charset=utf-8"}}, + } + env = stub_requests(tmp_path, plan) + + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="u1", node_name="acme", node_type="Url", + metadata={"url": "https://www.acme.example/"}) + + rc, out, err = run_enricher("fetch_webpage", ctx, env=env) + assert rc == 0, f"stderr={err}" + assert out is not None, err + assert out["status_code"] == 200 + assert out["title"] == "Acme Demo" + assert out["entities_added"] == 1 # Domain + assert out["relations_added"] == 1 # BELONGS_TO + + # El nodo Url se promueve a Webpage. + e = get_entity(ops_db, "u1") + assert e["type_ref"] == "Webpage", e + assert e["metadata"]["title"] == "Acme Demo" + assert e["metadata"]["status_code"] == 200 + + # Cache existe. + html_path = Path(app_dir) / e["metadata"]["html_path"] + assert html_path.exists() + assert "Acme Demo" in html_path.read_text(encoding="utf-8") + + # Domain creado con relacion. + domains = list_entities(ops_db, type_ref="Domain") + assert any(d["name"] == "www.acme.example" for d in domains) + rels = list_relations(ops_db, name="BELONGS_TO") + assert len(rels) == 1 + + +def test_fetch_webpage_handles_http_error(ops_db, app_dir, registry_root, + tmp_path): + make_node(ops_db, node_id="u1", name="bad", + type_ref="Url", metadata={"url": "https://no.example/"}) + plan = {"default": {"text": "", "status": 404}} + env = stub_requests(tmp_path, plan) + + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="u1", node_name="bad", node_type="Url", + metadata={"url": "https://no.example/"}) + + rc, out, err = run_enricher("fetch_webpage", ctx, env=env) + # 404 es respuesta valida — exit 0 con status_code en el resumen. + assert rc == 0, err + assert out["status_code"] == 404 diff --git a/tests/test_manifests.py b/tests/test_manifests.py new file mode 100644 index 0000000..273f026 --- /dev/null +++ b/tests/test_manifests.py @@ -0,0 +1,72 @@ +"""Sanity check de los manifests YAML de todos los enrichers. + +Confirma que el set actual cubre los tipos esperados y que cada manifest +tiene los campos que `enrichers.cpp` necesita parsear (id, applies_to). +""" +from __future__ import annotations + +from pathlib import Path + +from conftest import ENRICHERS_DIR + + +EXPECTED_IDS = { + "extract_domain", + "extract_links", + "extract_text_entities", + "fetch_webpage", + "web_search", +} + + +def _parse_simple_yaml(text: str) -> dict: + """Parser ad-hoc que replica lo que hace enrichers.cpp.""" + out: dict = {} + in_skip = False + for raw in text.splitlines(): + line = raw.rstrip("\r") + s = line.strip() + if not s or s.startswith("#"): + continue + indented = line and line[0].isspace() + if not indented: + in_skip = False + if in_skip: + continue + if ":" not in s: + continue + key, _, val = s.partition(":") + key = key.strip() + val = val.strip() + if val and val[0] in ('"', "'") and val[-1] == val[0]: + val = val[1:-1] + if key == "params" and not val: + in_skip = True + out[key] = val + return out + + +def test_all_expected_enrichers_present(): + found = {p.name for p in ENRICHERS_DIR.iterdir() if p.is_dir()} + missing = EXPECTED_IDS - found + assert not missing, f"faltan enrichers: {missing}" + + +def test_each_manifest_has_required_fields(): + for d in ENRICHERS_DIR.iterdir(): + if not d.is_dir(): + continue + manifest = d / "manifest.yaml" + runpy = d / "run.py" + assert manifest.exists(), f"falta manifest: {d.name}" + assert runpy.exists(), f"falta run.py: {d.name}" + m = _parse_simple_yaml(manifest.read_text(encoding="utf-8")) + assert m.get("id") == d.name, f"id no coincide con dir: {d.name}" + assert m.get("applies_to"), f"sin applies_to: {d.name}" + assert m.get("description"), f"sin description: {d.name}" + + +def test_web_search_applies_to_text(): + m = _parse_simple_yaml( + (ENRICHERS_DIR / "web_search" / "manifest.yaml").read_text()) + assert "text" in m["applies_to"].lower() diff --git a/tests/test_web_search.py b/tests/test_web_search.py new file mode 100644 index 0000000..778098e --- /dev/null +++ b/tests/test_web_search.py @@ -0,0 +1,97 @@ +"""Tests del enricher web_search (DuckDuckGo HTML).""" +from __future__ import annotations + +from pathlib import Path + +from conftest import ( + base_ctx, list_entities, list_relations, make_node, run_enricher, + stub_requests, TESTS_DIR, +) + + +DDG_FIXTURE = TESTS_DIR / "fixtures" / "ddg_results.html" + + +def test_web_search_creates_url_results_for_text_node( + ops_db, app_dir, registry_root, tmp_path): + make_node(ops_db, node_id="t1", name="tomate", + type_ref="text", metadata={}) + plan = { + "match": [ + {"contains": "duckduckgo.com", + "text": DDG_FIXTURE.read_text(encoding="utf-8"), + "status": 200}, + ], + "default": {"text": "", "status": 404}, + } + env = stub_requests(tmp_path, plan) + + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="t1", node_name="tomate", node_type="text", + params={"limit": 5}) + + rc, out, err = run_enricher("web_search", ctx, env=env) + assert rc == 0, f"stderr={err}" + assert out is not None, err + assert out["engine"] == "duckduckgo" + assert out["results"] == 3, out + assert out["entities_added"] == 3 + assert out["relations_added"] == 3 + + urls = list_entities(ops_db, type_ref="Url") + targets = {e["metadata"].get("url") for e in urls} + assert "https://es.wikipedia.org/wiki/Tomate" in targets + assert "https://www.botanical-online.com/alimentos/tomate-propiedades" in targets + + rels = list_relations(ops_db, name="SEARCH_RESULT_OF") + assert len(rels) == 3 + assert all(r["to_entity"] == "t1" for r in rels) + + # Metadata enriquecida. + wiki = next(e for e in urls + if e["metadata"].get("url") == "https://es.wikipedia.org/wiki/Tomate") + assert wiki["metadata"]["query"] == "tomate" + assert wiki["metadata"]["rank"] == 1 + assert "Wikipedia" in wiki["metadata"]["title"] + + +def test_web_search_uses_metadata_query_over_name(ops_db, app_dir, + registry_root, tmp_path): + """metadata.query debe ganar prioridad sobre node_name.""" + make_node(ops_db, node_id="t1", name="placeholder", + type_ref="text", metadata={"query": "tomate"}) + plan = {"match": [{"contains": "duckduckgo.com", + "text": DDG_FIXTURE.read_text(encoding="utf-8")}]} + env = stub_requests(tmp_path, plan) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="t1", node_name="placeholder", node_type="text", + metadata={"query": "tomate"}) + rc, out, err = run_enricher("web_search", ctx, env=env) + assert rc == 0, err + assert out["query"] == "tomate" + + +def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root, + tmp_path): + make_node(ops_db, node_id="t1", name="tomate", type_ref="text") + plan = {"match": [{"contains": "duckduckgo.com", + "text": DDG_FIXTURE.read_text(encoding="utf-8")}]} + env = stub_requests(tmp_path, plan) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="t1", node_name="tomate", node_type="text", + params={"limit": 1}) + rc, out, err = run_enricher("web_search", ctx, env=env) + assert rc == 0, err + assert out["results"] == 1 + assert out["entities_added"] == 1 + + +def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root, + tmp_path): + make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={}) + env = stub_requests(tmp_path, {"default": {"text": "", "status": 200}}) + ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, + node_id="t1", node_name="", node_type="text") + rc, out, err = run_enricher("web_search", ctx, env=env) + assert rc == 2 + assert "sin query" in err