From 55dcdd1164240671e74794bfc2fcf1971dc1f23a Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:24:11 +0200
Subject: [PATCH 1/9] feat(cybersecurity): 8 IoC regex extractors +
 extract_iocs pipeline puro

Extractores nuevos en python/functions/cybersecurity/:
- extract_ip_addresses (IPv4 + IPv6 con validacion ipaddress)
- extract_emails (RFC 5322 simplificado)
- extract_domains (FQDNs con TLD valido, lista estatica)
- extract_file_hashes (MD5/SHA1/SHA256/SHA512, algoritmo por longitud)
- extract_crypto_wallets (BTC legacy + bech32, ETH 0x+40hex)
- extract_cve_ids (CVE-YYYY-NNNN+)
- extract_mac_addresses (xx:xx:xx + xx-xx-xx, separador uniforme)
- extract_phone_numbers (E.164 + ES local 9 digitos)

Pipeline:
- extract_iocs corre todos, deduplica spans contenidos. Mantiene
  purity:pure (kind:function con uses_functions no vacio) porque la
  regla del registry exige que los pipelines sean impuros.

Todas devuelven list[dict] con value/start/end/type para que el
caller (issues 0038-0040) pueda reconciliar offsets con spans NER
sin reparsing.

Refs #0037

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 python/functions/cybersecurity/__init__.py    | 18 +++++
 .../cybersecurity/extract_crypto_wallets.md   | 40 ++++++++++
 .../cybersecurity/extract_crypto_wallets.py   | 44 +++++++++++
 .../cybersecurity/extract_cve_ids.md          | 40 ++++++++++
 .../cybersecurity/extract_cve_ids.py          | 27 +++++++
 .../cybersecurity/extract_domains.md          | 40 ++++++++++
 .../cybersecurity/extract_domains.py          | 58 +++++++++++++++
 .../functions/cybersecurity/extract_emails.md | 40 ++++++++++
 .../functions/cybersecurity/extract_emails.py | 30 ++++++++
 .../cybersecurity/extract_file_hashes.md      | 42 +++++++++++
 .../cybersecurity/extract_file_hashes.py      | 40 ++++++++++
 .../functions/cybersecurity/extract_iocs.md   | 59 +++++++++++++++
 .../functions/cybersecurity/extract_iocs.py   | 73 +++++++++++++++++++
 .../cybersecurity/extract_ip_addresses.md     | 45 ++++++++++++
 .../cybersecurity/extract_ip_addresses.py     | 53 ++++++++++++++
 .../cybersecurity/extract_mac_addresses.md    | 40 ++++++++++
 .../cybersecurity/extract_mac_addresses.py    | 31 ++++++++
 .../cybersecurity/extract_phone_numbers.md    | 40 ++++++++++
 .../cybersecurity/extract_phone_numbers.py    | 63 ++++++++++++++++
 19 files changed, 823 insertions(+)
 create mode 100644 python/functions/cybersecurity/extract_crypto_wallets.md
 create mode 100644 python/functions/cybersecurity/extract_crypto_wallets.py
 create mode 100644 python/functions/cybersecurity/extract_cve_ids.md
 create mode 100644 python/functions/cybersecurity/extract_cve_ids.py
 create mode 100644 python/functions/cybersecurity/extract_domains.md
 create mode 100644 python/functions/cybersecurity/extract_domains.py
 create mode 100644 python/functions/cybersecurity/extract_emails.md
 create mode 100644 python/functions/cybersecurity/extract_emails.py
 create mode 100644 python/functions/cybersecurity/extract_file_hashes.md
 create mode 100644 python/functions/cybersecurity/extract_file_hashes.py
 create mode 100644 python/functions/cybersecurity/extract_iocs.md
 create mode 100644 python/functions/cybersecurity/extract_iocs.py
 create mode 100644 python/functions/cybersecurity/extract_ip_addresses.md
 create mode 100644 python/functions/cybersecurity/extract_ip_addresses.py
 create mode 100644 python/functions/cybersecurity/extract_mac_addresses.md
 create mode 100644 python/functions/cybersecurity/extract_mac_addresses.py
 create mode 100644 python/functions/cybersecurity/extract_phone_numbers.md
 create mode 100644 python/functions/cybersecurity/extract_phone_numbers.py

diff --git a/python/functions/cybersecurity/__init__.py b/python/functions/cybersecurity/__init__.py
index caddb4be..5b3eb4a1 100644
--- a/python/functions/cybersecurity/__init__.py
+++ b/python/functions/cybersecurity/__init__.py
@@ -12,6 +12,15 @@ from .cybersecurity import (
     envelope_encrypt,
     envelope_decrypt,
 )
+from .extract_ip_addresses import extract_ip_addresses
+from .extract_emails import extract_emails
+from .extract_domains import extract_domains
+from .extract_file_hashes import extract_file_hashes
+from .extract_crypto_wallets import extract_crypto_wallets
+from .extract_cve_ids import extract_cve_ids
+from .extract_mac_addresses import extract_mac_addresses
+from .extract_phone_numbers import extract_phone_numbers
+from .extract_iocs import extract_iocs
 
 __all__ = [
     "hash_sha256",
@@ -26,4 +35,13 @@ __all__ = [
     "normalize_url",
     "envelope_encrypt",
     "envelope_decrypt",
+    "extract_ip_addresses",
+    "extract_emails",
+    "extract_domains",
+    "extract_file_hashes",
+    "extract_crypto_wallets",
+    "extract_cve_ids",
+    "extract_mac_addresses",
+    "extract_phone_numbers",
+    "extract_iocs",
 ]
diff --git a/python/functions/cybersecurity/extract_crypto_wallets.md b/python/functions/cybersecurity/extract_crypto_wallets.md
new file mode 100644
index 00000000..4b08a424
--- /dev/null
+++ b/python/functions/cybersecurity/extract_crypto_wallets.md
@@ -0,0 +1,40 @@
+---
+name: extract_crypto_wallets
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_crypto_wallets(text: str) -> list[dict]"
+description: "Extrae direcciones BTC (legacy P2PKH/P2SH + bech32) y ETH (0x + 40 hex) de un texto, con offsets y `asset` indicando la moneda. Validacion estructural por regex — no checksum."
+tags: [ioc, crypto, btc, eth, wallet, bitcoin, ethereum, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer wallets"
+output: "lista de dicts con {value, start, end, type='crypto_wallet', asset} por cada direccion encontrada"
+tested: true
+tests:
+  - "BTC legacy (P2PKH y P2SH)"
+  - "BTC bech32 (segwit)"
+  - "ETH 0x + 40 hex"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_crypto_wallets.py"
+---
+
+## Ejemplo
+
+```python
+extract_crypto_wallets("Send to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa or 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1")
+# [{"value": "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa", ..., "asset": "btc"},
+#  {"value": "0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1", ..., "asset": "eth"}]
+```
+
+## Notas
+
+BTC legacy: empieza por `1` o `3`, base58 (sin 0/O/I/l), 26-35 chars. BTC bech32: prefijo `bc1`, alfabeto bech32. ETH: `0x` + 40 hex case-insensitive. No se valida checksum — un agente que requiera validacion completa debe correr base58check / EIP-55 sobre los `value` retornados.
diff --git a/python/functions/cybersecurity/extract_crypto_wallets.py b/python/functions/cybersecurity/extract_crypto_wallets.py
new file mode 100644
index 00000000..b5ce1cd2
--- /dev/null
+++ b/python/functions/cybersecurity/extract_crypto_wallets.py
@@ -0,0 +1,44 @@
+"""Extrae wallets BTC y ETH de un texto, con offsets."""
+
+import re
+
+_BTC_LEGACY = re.compile(
+    r"(?<![A-Za-z0-9])"
+    r"[13][1-9A-HJ-NP-Za-km-z]{25,34}"
+    r"(?![A-Za-z0-9])"
+)
+_BTC_BECH32 = re.compile(
+    r"(?<![A-Za-z0-9])"
+    r"bc1[02-9ac-hj-np-z]{6,87}"
+    r"(?![A-Za-z0-9])"
+)
+_ETH_REGEX = re.compile(
+    r"(?<![A-Za-z0-9])"
+    r"0x[a-fA-F0-9]{40}"
+    r"(?![A-Za-z0-9])"
+)
+
+
+def extract_crypto_wallets(text: str) -> list[dict]:
+    """Return BTC and ETH wallet addresses found in ``text``, with offsets.
+
+    Matching is structural only (no checksum is verified): BTC legacy
+    starts with ``1`` or ``3``, BTC bech32 with ``bc1``, and ETH is ``0x``
+    plus 40 hex characters. Results are ordered by ``start``.
+    """
+    patterns = (
+        (_BTC_LEGACY, "btc"), (_BTC_BECH32, "btc"), (_ETH_REGEX, "eth"),
+    )
+    found = [
+        {
+            "value": hit.group(0),
+            "start": hit.start(),
+            "end": hit.end(),
+            "type": "crypto_wallet",
+            "asset": coin,
+        }
+        for pattern, coin in patterns
+        for hit in pattern.finditer(text)
+    ]
+    # sorted() is stable: equal starts keep the pattern order above.
+    return sorted(found, key=lambda item: item["start"])
diff --git a/python/functions/cybersecurity/extract_cve_ids.md b/python/functions/cybersecurity/extract_cve_ids.md
new file mode 100644
index 00000000..2d9463df
--- /dev/null
+++ b/python/functions/cybersecurity/extract_cve_ids.md
@@ -0,0 +1,40 @@
+---
+name: extract_cve_ids
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_cve_ids(text: str) -> list[dict]"
+description: "Extrae IDs CVE en formato `CVE-YYYY-NNNN+` de un texto, con offsets. No valida que el CVE exista en NVD."
+tags: [ioc, cve, vulnerability, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer CVEs"
+output: "lista de dicts con {value, start, end, type='cve_id'} por cada CVE encontrado"
+tested: true
+tests:
+  - "CVE basico (4 digitos)"
+  - "CVE con 5+ digitos (post-2014)"
+  - "Multiples CVEs en mismo texto"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_cve_ids.py"
+---
+
+## Ejemplo
+
+```python
+extract_cve_ids("Patches CVE-2021-44228 and CVE-2024-1234567")
+# [{"value": "CVE-2021-44228", "start": 8, "end": 22, "type": "cve_id"},
+#  {"value": "CVE-2024-1234567", "start": 27, "end": 43, "type": "cve_id"}]
+```
+
+## Notas
+
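+Acepta la sintaxis de ID del programa CVE: año de 4 digitos seguido de una secuencia de 4 a 7 digitos (la sintaxis oficial no fija un maximo; 7 es el limite de este extractor). No valida que el CVE exista en NVD — solo estructura. La secuencia de longitud variable permite CVEs grandes (desde el cambio de sintaxis de 2014, cuando el programa CVE de MITRE elimino el limite de 4 digitos).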
diff --git a/python/functions/cybersecurity/extract_cve_ids.py b/python/functions/cybersecurity/extract_cve_ids.py
new file mode 100644
index 00000000..09768b54
--- /dev/null
+++ b/python/functions/cybersecurity/extract_cve_ids.py
@@ -0,0 +1,27 @@
+"""Extrae identificadores CVE de un texto, con offsets."""
+
+import re
+
+_CVE_REGEX = re.compile(
+    r"(?<![A-Za-z0-9])"
+    r"CVE-\d{4}-\d{4,7}"
+    r"(?![A-Za-z0-9])"
+)
+
+
+def extract_cve_ids(text: str) -> list[dict]:
+    """Return every ``CVE-YYYY-NNNN+`` identifier in ``text`` with offsets.
+
+    The pattern takes a 4-digit year followed by 4 to 7 digits and must
+    not touch other alphanumerics. Whether the CVE exists is not checked.
+    """
+    hits = []
+    for found in _CVE_REGEX.finditer(text):
+        begin, stop = found.span()
+        hits.append({
+            "value": found.group(0),
+            "start": begin,
+            "end": stop,
+            "type": "cve_id",
+        })
+    return hits
diff --git a/python/functions/cybersecurity/extract_domains.md b/python/functions/cybersecurity/extract_domains.md
new file mode 100644
index 00000000..92cf9e38
--- /dev/null
+++ b/python/functions/cybersecurity/extract_domains.md
@@ -0,0 +1,40 @@
+---
+name: extract_domains
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_domains(text: str) -> list[dict]"
+description: "Extrae FQDNs (dominios con TLD valido) de un texto, con offsets start/end. Usa lista estatica de TLDs comunes (gTLD + ccTLD frecuentes). No valida DNS."
+tags: [ioc, domain, fqdn, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer dominios"
+output: "lista de dicts con {value, start, end, type='domain'} por cada FQDN reconocido"
+tested: true
+tests:
+  - "Dominios con TLD valido se extraen"
+  - "TLD desconocido se descarta"
+  - "Subdominios profundos"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_domains.py"
+---
+
+## Ejemplo
+
+```python
+extract_domains("visit example.com or sub.test.io for info")
+# [{"value": "example.com", "start": 6, "end": 17, "type": "domain"},
+#  {"value": "sub.test.io", "start": 21, "end": 32, "type": "domain"}]
+```
+
+## Notas
+
+Lista de TLDs estatica (no IANA completa). Cubre los gTLD originales, los nuevos populares (app, dev, io, ai, cloud, xyz, ...) y ccTLDs frecuentes. Si necesitas un TLD nuevo, ampliar `_VALID_TLDS` en el .py. No usa publicsuffix (dependencia externa). Si el dominio aparece dentro de un email, se extrae igual — el pipeline `extract_iocs` deduplica por offsets.
diff --git a/python/functions/cybersecurity/extract_domains.py b/python/functions/cybersecurity/extract_domains.py
new file mode 100644
index 00000000..4f137cbe
--- /dev/null
+++ b/python/functions/cybersecurity/extract_domains.py
@@ -0,0 +1,58 @@
+"""Extrae FQDNs validos de un texto, con offsets."""
+
+import re
+
+# Static list of common TLDs (not exhaustive: IANA lists more than 1500).
+# Covers the original gTLDs, popular new gTLDs and frequent ccTLDs.
+_VALID_TLDS = frozenset({
+    # original gTLDs
+    "com", "org", "net", "edu", "gov", "mil", "int",
+    # common gTLDs
+    "info", "biz", "name", "pro", "mobi", "asia", "jobs", "tel", "travel",
+    "xxx", "post",
+    # popular new gTLDs (plus ccTLDs commonly used as generic ones)
+    "app", "dev", "io", "ai", "tech", "cloud", "online", "site", "store",
+    "xyz", "top", "shop", "club", "fun", "live", "blog", "page", "news",
+    "media", "design", "studio", "agency", "me", "tv",
+    # frequent ccTLDs
+    "us", "uk", "de", "fr", "es", "it", "nl", "be", "se", "no", "fi", "dk",
+    "ru", "ua", "pl", "cz", "ch", "at", "pt", "gr", "ie", "tr",
+    "ca", "mx", "br", "ar", "cl", "co", "pe", "ve", "uy",
+    "cn", "jp", "kr", "in", "id", "th", "vn", "my", "sg", "ph", "tw", "hk",
+    "au", "nz",
+    "za", "eg", "ma", "ng", "ke",
+    "il", "ae", "sa", "qa",
+    "eu",
+})
+
+# One label: letters/digits, inner hyphens allowed, no leading/trailing hyphen.
+_LABEL = r"[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?"
+_DOMAIN_REGEX = re.compile(
+    rf"(?<![A-Za-z0-9.-])"
+    rf"(?:{_LABEL}\.)+"
+    rf"[A-Za-z]{{2,63}}"
+    # A dot blocks the match only when more name follows it.
+    rf"(?![A-Za-z0-9-]|\.[A-Za-z0-9-])"
+)
+
+
+def extract_domains(text: str) -> list[dict]:
+    """Return FQDNs in ``text`` whose TLD is in the static allow-list.
+
+    A candidate needs at least one dot and a known TLD (case-insensitive).
+    A dot that merely ends a sentence is left out of the match. Full URLs
+    are out of scope (see `extract_urls`). A domain inside an email is
+    still reported; the caller can deduplicate by offsets if needed.
+    """
+    results = []
+    for m in _DOMAIN_REGEX.finditer(text):
+        candidate = m.group(0)
+        if candidate.rsplit(".", 1)[-1].lower() not in _VALID_TLDS:
+            continue
+        results.append({
+            "value": candidate,
+            "start": m.start(),
+            "end": m.end(),
+            "type": "domain",
+        })
+    return results
diff --git a/python/functions/cybersecurity/extract_emails.md b/python/functions/cybersecurity/extract_emails.md
new file mode 100644
index 00000000..264b4272
--- /dev/null
+++ b/python/functions/cybersecurity/extract_emails.md
@@ -0,0 +1,40 @@
+---
+name: extract_emails
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_emails(text: str) -> list[dict]"
+description: "Extrae direcciones de email (RFC 5322 simplificado) de un texto, con offsets start/end. No valida MX ni que el TLD exista — solo estructura sintactica."
+tags: [ioc, email, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer emails"
+output: "lista de dicts con {value, start, end, type='email'} por cada email encontrado"
+tested: true
+tests:
+  - "Email simple"
+  - "Multiples emails con caracteres validos en local part"
+  - "No matchea texto sin @"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_emails.py"
+---
+
+## Ejemplo
+
+```python
+extract_emails("Contact: alice@example.com or bob+work@sub.test.org")
+# [{"value": "alice@example.com", "start": 9, "end": 26, "type": "email"},
+#  {"value": "bob+work@sub.test.org", "start": 30, "end": 51, "type": "email"}]
+```
+
+## Notas
+
+Acepta `._%+-` en parte local. El dominio exige al menos un punto y termina en componente alfanumerico de 1+ chars. No valida MX ni que el TLD aparezca en lista de TLDs validos — para extraer dominios independientemente, ver `extract_domains_py_cybersecurity`.
diff --git a/python/functions/cybersecurity/extract_emails.py b/python/functions/cybersecurity/extract_emails.py
new file mode 100644
index 00000000..e6119a4f
--- /dev/null
+++ b/python/functions/cybersecurity/extract_emails.py
@@ -0,0 +1,30 @@
+"""Extrae direcciones de email de un texto, con offsets."""
+
+import re
+
+_EMAIL_REGEX = re.compile(
+    r"(?<![A-Za-z0-9._%+-])"
+    r"[A-Za-z0-9._%+-]+"
+    r"@"
+    r"[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?"
+    r"(?:\.[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?)+"
+    r"(?![A-Za-z0-9_%+-]|\.[A-Za-z0-9_%+-])"
+)
+
+
+def extract_emails(text: str) -> list[dict]:
+    """Return simplified-RFC-5322 email addresses in ``text`` with offsets.
+
+    Purely syntactic (no MX or TLD check): the local part is letters,
+    digits and `._%+-`; the domain needs at least one dot. A dot that only
+    ends a sentence ("mail bob@example.com.") is left out of the match.
+    """
+    return [
+        {
+            "value": m.group(0),
+            "start": m.start(),
+            "end": m.end(),
+            "type": "email",
+        }
+        for m in _EMAIL_REGEX.finditer(text)
+    ]
diff --git a/python/functions/cybersecurity/extract_file_hashes.md b/python/functions/cybersecurity/extract_file_hashes.md
new file mode 100644
index 00000000..2a2b751c
--- /dev/null
+++ b/python/functions/cybersecurity/extract_file_hashes.md
@@ -0,0 +1,42 @@
+---
+name: extract_file_hashes
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_file_hashes(text: str) -> list[dict]"
+description: "Extrae hashes MD5/SHA1/SHA256/SHA512 de un texto, con offsets y algoritmo deducido por longitud (32, 40, 64 o 128 hex). Util para extraer IoCs de reportes de threat intelligence."
+tags: [ioc, hash, md5, sha1, sha256, sha512, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer hashes hex"
+output: "lista de dicts con {value, start, end, type='file_hash', algorithm} por cada hash encontrado"
+tested: true
+tests:
+  - "MD5 (32 hex), SHA1 (40), SHA256 (64), SHA512 (128)"
+  - "Longitudes intermedias se ignoran"
+  - "Insensible a mayusculas en hex"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_file_hashes.py"
+---
+
+## Ejemplo
+
+```python
+extract_file_hashes("MD5: 5d41402abc4b2a76b9719d911017c592 SHA1: aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d")
+# [{"value": "5d41402abc4b2a76b9719d911017c592", "start": 5, "end": 37,
+#   "type": "file_hash", "algorithm": "md5"},
+#  {"value": "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", "start": 44, "end": 84,
+#   "type": "file_hash", "algorithm": "sha1"}]
+```
+
+## Notas
+
+Detecta solo longitudes canonicas (32/40/64/128 hex). Una secuencia hex de 50 caracteres se ignora. Word-boundary `\b` evita matchear sub-strings de hex mas largo. ETH wallets (`0x` + 40 hex = 42 chars totales) NO matchean este extractor: la `x` del prefijo no es hex y, al ser caracter de palabra, impide un `\b` justo antes de los 40 hex. En cualquier caso, el pipeline `extract_iocs` descarta los spans completamente contenidos en otro.
diff --git a/python/functions/cybersecurity/extract_file_hashes.py b/python/functions/cybersecurity/extract_file_hashes.py
new file mode 100644
index 00000000..10d811a4
--- /dev/null
+++ b/python/functions/cybersecurity/extract_file_hashes.py
@@ -0,0 +1,40 @@
+"""Extrae hashes MD5/SHA1/SHA256/SHA512 de un texto, con offsets y algoritmo."""
+
+import re
+
+# Canonical digest sizes in hex characters, longest first.
+_HASH_LENGTHS = (
+    (128, "sha512"),
+    (64, "sha256"),
+    (40, "sha1"),
+    (32, "md5"),
+)
+
+_HASH_CANDIDATE = re.compile(r"\b[A-Fa-f0-9]{32,128}\b")
+
+
+def extract_file_hashes(text: str) -> list[dict]:
+    """Return hex digests found in ``text`` with the algorithm implied by length.
+
+    Word-bounded hex runs of 32, 40, 64 or 128 characters are reported as
+    MD5, SHA1, SHA256 and SHA512 respectively; runs of any other length
+    are skipped. Each result carries ``algorithm`` besides the usual
+    ``value``/``start``/``end``/``type`` keys.
+    """
+    algo_by_size = dict(_HASH_LENGTHS)
+    hashes = []
+    for hit in _HASH_CANDIDATE.finditer(text):
+        digest = hit.group(0)
+        name = algo_by_size.get(len(digest))
+        if name is None:
+            continue
+        begin, stop = hit.span()
+        hashes.append({
+            "value": digest,
+            "start": begin,
+            "end": stop,
+            "type": "file_hash",
+            "algorithm": name,
+        })
+    return hashes
diff --git a/python/functions/cybersecurity/extract_iocs.md b/python/functions/cybersecurity/extract_iocs.md
new file mode 100644
index 00000000..9e8bc301
--- /dev/null
+++ b/python/functions/cybersecurity/extract_iocs.md
@@ -0,0 +1,59 @@
+---
+name: extract_iocs
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_iocs(text: str, types: list[str] | None = None) -> list[dict]"
+description: "Pipeline puro que corre todos los extractores de IoC (IP, email, dominio, hash, wallet, CVE, MAC, telefono) y devuelve lista unificada con `type`. Deduplica spans contenidos. Si types se pasa, filtra los tipos a ejecutar."
+tags: [ioc, pipeline, regex, extract, cybersecurity, python]
+uses_functions:
+  - extract_ip_addresses_py_cybersecurity
+  - extract_emails_py_cybersecurity
+  - extract_domains_py_cybersecurity
+  - extract_file_hashes_py_cybersecurity
+  - extract_crypto_wallets_py_cybersecurity
+  - extract_cve_ids_py_cybersecurity
+  - extract_mac_addresses_py_cybersecurity
+  - extract_phone_numbers_py_cybersecurity
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: []
+params:
+  - name: text
+    desc: "string de texto del que extraer IoCs"
+  - name: types
+    desc: "lista opcional de tipos a extraer (email, ip_address, domain, file_hash, crypto_wallet, cve_id, mac_address, phone_number). None = todos."
+output: "lista de dicts {value, start, end, type, ...} ordenada por offset, sin spans contenidos"
+tested: true
+tests:
+  - "Pipeline corre todos los extractores"
+  - "Filtro por types subset"
+  - "Deduplica spans contenidos (dominio dentro de email)"
+  - "Tipos desconocidos se ignoran"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_iocs.py"
+---
+
+## Ejemplo
+
+```python
+extract_iocs("Reach alice@example.com from 10.0.0.5; CVE-2023-1234")
+# [{"value": "alice@example.com", "start": 6, "end": 23, "type": "email"},
+#  {"value": "10.0.0.5", "start": 29, "end": 37, "type": "ip_address"},
+#  {"value": "CVE-2023-1234", "start": 39, "end": 52, "type": "cve_id"}]
+
+extract_iocs("Only IPs: 8.8.8.8 here", types=["ip_address"])
+# [{"value": "8.8.8.8", ..., "type": "ip_address"}]
+```
+
+## Notas
+
+Es **funcion** y no `kind: pipeline` porque la regla del registry exige que pipelines sean impuros — esta no lo es: solo compone funciones puras y deduplica. Mantiene `purity: pure` con `uses_functions` no vacio.
+
+Deduplicacion: un match completamente contenido en otro (ej. `example.com` dentro de `alice@example.com`) se descarta. Empate exacto de span: gana el primero segun el orden de `_EXTRACTORS` en el modulo (email > ip > crypto_wallet > cve > mac > file_hash > phone > domain). Reordenar el dict cambia la prioridad si tienes overlaps habituales.
+
+Bench informal: ~50-80 ms por MB de texto sobre CPU moderna (depende del numero de matches).
diff --git a/python/functions/cybersecurity/extract_iocs.py b/python/functions/cybersecurity/extract_iocs.py
new file mode 100644
index 00000000..6d246ba5
--- /dev/null
+++ b/python/functions/cybersecurity/extract_iocs.py
@@ -0,0 +1,73 @@
+"""Pipeline puro: corre todos los extractores de IoC y unifica resultados."""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(__file__))
+
+from extract_ip_addresses import extract_ip_addresses
+from extract_emails import extract_emails
+from extract_domains import extract_domains
+from extract_file_hashes import extract_file_hashes
+from extract_crypto_wallets import extract_crypto_wallets
+from extract_cve_ids import extract_cve_ids
+from extract_mac_addresses import extract_mac_addresses
+from extract_phone_numbers import extract_phone_numbers
+
+
+# Extractors keyed by IoC type. Insertion order doubles as the priority
+# applied when two extractors report exactly the same span.
+_EXTRACTORS = {
+    "email": extract_emails,
+    "ip_address": extract_ip_addresses,
+    "crypto_wallet": extract_crypto_wallets,
+    "cve_id": extract_cve_ids,
+    "mac_address": extract_mac_addresses,
+    "file_hash": extract_file_hashes,
+    "phone_number": extract_phone_numbers,
+    "domain": extract_domains,
+}
+
+
+def extract_iocs(text: str, types: list[str] | None = None) -> list[dict]:
+    """Run the IoC extractors over ``text`` and merge their results.
+
+    Args:
+        text: Text to scan.
+        types: IoC types to run (keys of ``_EXTRACTORS``); ``None`` runs
+            all of them. Unknown or repeated names are ignored.
+
+    Returns:
+        Matches sorted by ``start``. A span fully contained in another one
+        is dropped (e.g. a domain inside an email); spans that only
+        partially overlap are all kept. When two extractors report the
+        same span, the one listed first in ``_EXTRACTORS`` wins,
+        whatever the order of ``types``.
+    """
+    wanted = None if types is None else set(types)
+
+    # Walk the registry (not ``types``) so priority never depends on the
+    # caller's ordering and a repeated type runs only once.
+    raw: list[dict] = []
+    for ioc_type, extractor in _EXTRACTORS.items():
+        if wanted is None or ioc_type in wanted:
+            raw.extend(extractor(text))
+
+    # Start ascending, then longest first: a containing span is always
+    # visited before the spans it contains. The sort is stable, so equal
+    # spans keep registry order.
+    raw.sort(key=lambda r: (r["start"], -(r["end"] - r["start"])))
+
+    deduped: list[dict] = []
+    furthest_end = -1
+    for match in raw:
+        # Every kept span starts at or before this one, so it is contained
+        # in (or equal to) a kept span exactly when it ends no later than
+        # the furthest end kept so far. This replaces a scan over all kept
+        # spans per match.
+        if match["end"] <= furthest_end:
+            continue
+        furthest_end = match["end"]
+        deduped.append(match)
+
+    return deduped
diff --git a/python/functions/cybersecurity/extract_ip_addresses.md b/python/functions/cybersecurity/extract_ip_addresses.md
new file mode 100644
index 00000000..dd5fc862
--- /dev/null
+++ b/python/functions/cybersecurity/extract_ip_addresses.md
@@ -0,0 +1,45 @@
+---
+name: extract_ip_addresses
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_ip_addresses(text: str) -> list[dict]"
+description: "Extrae direcciones IPv4 e IPv6 validas de un texto, con offsets start/end. Filtra candidatos invalidos via ipaddress (rechaza 999.999.999.999 y similares). No distingue privadas de publicas — el filtrado de relevancia es del caller."
+tags: [ioc, ip, ipv4, ipv6, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re, ipaddress]
+params:
+  - name: text
+    desc: "string de texto del que extraer IPs"
+output: "lista de dicts con {value, start, end, type='ip_address'} por cada IP encontrada"
+tested: true
+tests:
+  - "IPv4 valida y rangos limite"
+  - "IPv4 invalida (>255 octeto) descartada"
+  - "IPv6 forma completa y comprimida"
+  - "IPv6 invalida descartada"
+  - "Texto sin IPs"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_ip_addresses.py"
+---
+
+## Ejemplo
+
+```python
+extract_ip_addresses("Server 192.168.1.1 talks to 8.8.8.8")
+# [{"value": "192.168.1.1", "start": 7, "end": 18, "type": "ip_address"},
+#  {"value": "8.8.8.8", "start": 28, "end": 35, "type": "ip_address"}]
+
+extract_ip_addresses("not an IP: 999.999.999.999")
+# []
+```
+
+## Notas
+
+Usa `ipaddress.IPv4Address` / `IPv6Address` para validacion estructural — descarta `999.999.999.999` y otras combinaciones sintacticamente plausibles pero invalidas. IPs privadas (10/8, 172.16/12, 192.168/16) se extraen igual; el filtrado de relevancia es responsabilidad del caller. Pure — solo regex compilado y `ipaddress`, sin red ni disco.
diff --git a/python/functions/cybersecurity/extract_ip_addresses.py b/python/functions/cybersecurity/extract_ip_addresses.py
new file mode 100644
index 00000000..51cdfc26
--- /dev/null
+++ b/python/functions/cybersecurity/extract_ip_addresses.py
@@ -0,0 +1,53 @@
+"""Extrae IPv4 + IPv6 validas de un texto, con offsets."""
+
+import ipaddress
+import re
+
+_IPV4_CANDIDATE = re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b")
+_IPV6_CANDIDATE = re.compile(
+    # Reject any adjacent alphanumeric, not just hex digits: otherwise
+    # "std::string" and "foo::bar" yield the valid addresses "d::"/"::ba".
+    r"(?<![0-9A-Za-z:])"
+    r"(?:[0-9A-Fa-f]{0,4}:){2,7}[0-9A-Fa-f]{0,4}"
+    r"(?:%[0-9A-Za-z]+)?"
+    r"(?![0-9A-Za-z:])"
+)
+
+
+def extract_ip_addresses(text: str) -> list[dict]:
+    """Return the valid IPv4 and IPv6 addresses in ``text`` with offsets.
+
+    Regex candidates are kept only if ``ipaddress`` parses them, so
+    strings such as ``999.999.999.999`` are dropped. Private and public
+    addresses are treated alike. An IPv6 zone id (``%eth0``) stays in
+    ``value`` but is removed before validation. Sorted by ``start``.
+    """
+    results: list[dict] = []
+
+    for m in _IPV4_CANDIDATE.finditer(text):
+        candidate = m.group(0)
+        try:
+            ipaddress.IPv4Address(candidate)
+        except ValueError:
+            continue
+        results.append({
+            "value": candidate,
+            "start": m.start(),
+            "end": m.end(),
+            "type": "ip_address",
+        })
+
+    for m in _IPV6_CANDIDATE.finditer(text):
+        try:
+            ipaddress.IPv6Address(m.group(0).split("%", 1)[0])
+        except ValueError:
+            continue
+        results.append({
+            "value": m.group(0),
+            "start": m.start(),
+            "end": m.end(),
+            "type": "ip_address",
+        })
+
+    results.sort(key=lambda r: r["start"])
+    return results
diff --git a/python/functions/cybersecurity/extract_mac_addresses.md b/python/functions/cybersecurity/extract_mac_addresses.md
new file mode 100644
index 00000000..9dc86393
--- /dev/null
+++ b/python/functions/cybersecurity/extract_mac_addresses.md
@@ -0,0 +1,40 @@
+---
+name: extract_mac_addresses
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_mac_addresses(text: str) -> list[dict]"
+description: "Extrae direcciones MAC en formato `xx:xx:xx:xx:xx:xx` o con guiones (`-`) de un texto, con offsets. Acepta hex en cualquier caso. Rechaza separadores mezclados."
+tags: [ioc, mac, network, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer MAC addresses"
+output: "lista de dicts con {value, start, end, type='mac_address'} por cada MAC encontrada"
+tested: true
+tests:
+  - "MAC con dos puntos"
+  - "MAC con guiones"
+  - "Separadores mezclados se rechazan"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_mac_addresses.py"
+---
+
+## Ejemplo
+
+```python
+extract_mac_addresses("router 00:1A:2B:3C:4D:5E and AA-BB-CC-DD-EE-FF")
+# [{"value": "00:1A:2B:3C:4D:5E", ..., "type": "mac_address"},
+#  {"value": "AA-BB-CC-DD-EE-FF", ..., "type": "mac_address"}]
+```
+
+## Notas
+
+Cada direccion debe usar un solo separador (todos `:` o todos `-`). No se valida OUI ni se distingue unicast/multicast. Para extraer el OUI del fabricante: tomar los 3 primeros octetos del `value` (6 digitos hex, sin contar los separadores) y consultarlos en el registro IEEE.
diff --git a/python/functions/cybersecurity/extract_mac_addresses.py b/python/functions/cybersecurity/extract_mac_addresses.py
new file mode 100644
index 00000000..5d041a49
--- /dev/null
+++ b/python/functions/cybersecurity/extract_mac_addresses.py
@@ -0,0 +1,31 @@
+"""Extrae direcciones MAC de un texto, con offsets."""
+
+import re
+
+# Lookarounds keep a match from starting or ending inside a longer hex/separator run.
+_MAC_REGEX = re.compile(
+    r"(?<![A-Fa-f0-9:-])"
+    r"(?:[A-Fa-f0-9]{2}[:-]){5}[A-Fa-f0-9]{2}"
+    r"(?![A-Fa-f0-9:-])"
+)
+
+
+def extract_mac_addresses(text: str) -> list[dict]:
+    """Find MAC addresses written as six hex pairs joined by `:` or `-`.
+
+    Hex digits may be upper or lower case. A candidate that contains both
+    `:` and `-` is discarded. Every hit becomes a dict with the matched
+    text (`value`), its `start`/`end` offsets in `text`, and the constant
+    `type` "mac_address"; hits are in order of appearance.
+    """
+    return [
+        {
+            "value": hit.group(0),
+            "start": hit.start(),
+            "end": hit.end(),
+            "type": "mac_address",
+        }
+        for hit in _MAC_REGEX.finditer(text)
+        # The pattern allows either separator per pair; reject mixtures here.
+        if ":" not in hit.group(0) or "-" not in hit.group(0)
+    ]
diff --git a/python/functions/cybersecurity/extract_phone_numbers.md b/python/functions/cybersecurity/extract_phone_numbers.md
new file mode 100644
index 00000000..c8e23371
--- /dev/null
+++ b/python/functions/cybersecurity/extract_phone_numbers.md
@@ -0,0 +1,40 @@
+---
+name: extract_phone_numbers
+kind: function
+lang: py
+domain: cybersecurity
+version: "1.0.0"
+purity: pure
+signature: "def extract_phone_numbers(text: str) -> list[dict]"
+description: "Extrae numeros de telefono en formato E.164 (`+CC...`) y formato local ES (9 digitos empezando por 6/7/8/9), con offsets. Permite separadores `space` y `-` entre grupos."
+tags: [ioc, phone, e164, spain, regex, extract, cybersecurity, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: ""
+imports: [re]
+params:
+  - name: text
+    desc: "string de texto del que extraer telefonos"
+output: "lista de dicts con {value, start, end, type='phone_number'}"
+tested: true
+tests:
+  - "Numero E.164 con espacios"
+  - "Numero local ES de 9 digitos"
+  - "Numero demasiado corto se descarta"
+test_file_path: "python/functions/cybersecurity/tests/test_extract_iocs.py"
+file_path: "python/functions/cybersecurity/extract_phone_numbers.py"
+---
+
+## Ejemplo
+
+```python
+extract_phone_numbers("Llamar al +34 612 345 678 o al 912345678")
+# [{"value": "+34 612 345 678", "start": 10, "end": 25, "type": "phone_number"},
+#  {"value": "912345678", "start": 31, "end": 40, "type": "phone_number"}]
+```
+
+## Notas
+
+E.164 (ITU-T): maximo 15 digitos tras el `+`; este extractor exige ademas un minimo de 8. ES local: 9 digitos exactos, primero ∈ {6,7,8,9}. No se discrimina entre movil y fijo. No se normaliza el formato — el caller decide. Para parseo robusto multi-pais usar `phonenumbers` (port Python de libphonenumber de Google), pero esa dependencia no es necesaria para extraer candidatos como IoC.
diff --git a/python/functions/cybersecurity/extract_phone_numbers.py b/python/functions/cybersecurity/extract_phone_numbers.py
new file mode 100644
index 00000000..ec1265c3
--- /dev/null
+++ b/python/functions/cybersecurity/extract_phone_numbers.py
@@ -0,0 +1,63 @@
+"""Extrae numeros de telefono (E.164 + formatos comunes ES/EU) con offsets."""
+
+import re
+
+# E.164 candidate: "+" then 3 to 6 digit groups, each optionally split by one space/hyphen.
+_E164_REGEX = re.compile(
+    r"(?<![A-Za-z0-9])"
+    r"\+\d{1,3}[\s\-]?\d{1,4}(?:[\s\-]?\d{1,4}){1,4}"
+    r"(?![A-Za-z0-9])"
+)
+# Spanish local number: 9 digits starting with 6, 7, 8 or 9, grouped 3-3-3.
+_ES_LOCAL_REGEX = re.compile(
+    r"(?<![A-Za-z0-9+])"
+    r"[6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3}"
+    r"(?![A-Za-z0-9])"
+)
+
+
+def extract_phone_numbers(text: str) -> list[dict]:
+    """Extract E.164 and Spanish 9-digit phone numbers from `text`.
+
+    Groups may be split by one whitespace character or `-`. An E.164
+    candidate needs a leading `+` and 8 to 15 ASCII digits; a Spanish
+    local number needs exactly 9 digits starting with 6/7/8/9. A local
+    match lying inside an accepted E.164 match is dropped, so
+    `+34 612 345 678` is reported once instead of twice.
+
+    Returns dicts with `value`, `start`, `end` and
+    `type` "phone_number", ordered by `start`.
+    """
+    results = []
+    e164_spans = []
+
+    for m in _E164_REGEX.finditer(text):
+        candidate = m.group(0)
+        # Separators do not count towards the E.164 length limits.
+        digits = re.sub(r"[^0-9]", "", candidate)
+        if not (8 <= len(digits) <= 15):
+            continue
+        e164_spans.append(m.span())
+        results.append({
+            "value": candidate,
+            "start": m.start(),
+            "end": m.end(),
+            "type": "phone_number",
+        })
+
+    for m in _ES_LOCAL_REGEX.finditer(text):
+        candidate = m.group(0)
+        if len(re.sub(r"[^0-9]", "", candidate)) != 9:
+            continue
+        # The national part of an international number is not a new hit.
+        if any(lo <= m.start() and m.end() <= hi for lo, hi in e164_spans):
+            continue
+        results.append({
+            "value": candidate,
+            "start": m.start(),
+            "end": m.end(),
+            "type": "phone_number",
+        })
+
+    results.sort(key=lambda r: r["start"])
+    return results

From 2b82b4b9ceaf469a65fb417c672725bb9494d729 Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:24:18 +0200
Subject: [PATCH 2/9] test(cybersecurity): corpus para los 8 extractores +
 pipeline extract_iocs

30 tests cubriendo positivos y negativos por tipo:
- IPv4 valida/invalida + rangos limite
- IPv6 forma completa/comprimida
- Emails (caracteres validos en local part)
- Dominios con TLD valido vs desconocido
- Hashes MD5/SHA1/SHA256/SHA512 por longitud
- Wallets BTC legacy/bech32 y ETH
- CVEs 4 y 7 digitos
- MAC con `:` y `-` (separadores mezclados rechazados)
- Telefonos E.164 y ES local 9 digitos
- Pipeline filtrado por types y deduplicacion de spans contenidos

Refs #0037

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../functions/cybersecurity/tests/__init__.py |   0
 .../cybersecurity/tests/test_extract_iocs.py  | 289 ++++++++++++++++++
 2 files changed, 289 insertions(+)
 create mode 100644 python/functions/cybersecurity/tests/__init__.py
 create mode 100644 python/functions/cybersecurity/tests/test_extract_iocs.py

diff --git a/python/functions/cybersecurity/tests/__init__.py b/python/functions/cybersecurity/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/functions/cybersecurity/tests/test_extract_iocs.py b/python/functions/cybersecurity/tests/test_extract_iocs.py
new file mode 100644
index 00000000..d514757a
--- /dev/null
+++ b/python/functions/cybersecurity/tests/test_extract_iocs.py
@@ -0,0 +1,289 @@
+"""Tests para los extractores de IoC y el pipeline `extract_iocs`."""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from extract_ip_addresses import extract_ip_addresses
+from extract_emails import extract_emails
+from extract_domains import extract_domains
+from extract_file_hashes import extract_file_hashes
+from extract_crypto_wallets import extract_crypto_wallets
+from extract_cve_ids import extract_cve_ids
+from extract_mac_addresses import extract_mac_addresses
+from extract_phone_numbers import extract_phone_numbers
+from extract_iocs import extract_iocs
+
+
+# ---------- IP addresses ----------
+
+
+def test_ipv4_valida_y_rangos_limite():
+    """Valid IPv4 addresses, including both range limits, are extracted."""
+    found = extract_ip_addresses("valid 0.0.0.0 and 255.255.255.255 plus 10.0.0.1")
+    expected = ["0.0.0.0", "255.255.255.255", "10.0.0.1"]
+    assert [hit["value"] for hit in found] == expected
+    assert {hit["type"] for hit in found} == {"ip_address"}
+
+
+def test_ipv4_invalida_descartada():
+    """Octets above 255 and a three-octet fragment yield no matches."""
+    sample = "fake: 999.999.999.999 and 256.0.0.1 and 1.2.3"
+    found = extract_ip_addresses(sample)
+    assert found == []
+
+
+def test_ipv6_forma_completa_y_comprimida():
+    """IPv6 addresses using `::` compression, including `::1`, are extracted."""
+    sample = "v6: 2001:db8:85a3::8a2e:370:7334 and ::1"
+    hits = extract_ip_addresses(sample)
+    found = {hit["value"] for hit in hits}
+    for expected in ("2001:db8:85a3::8a2e:370:7334", "::1"):
+        assert expected in found
+
+
+def test_ipv6_invalida_descartada():
+    """An IPv6-looking string with nine groups is never returned as-is."""
+    # Nine colon-separated groups: one more than an IPv6 address can have.
+    text = "v6 fake: 1:2:3:4:5:6:7:8:9"
+    ips = extract_ip_addresses(text)
+    values = {r["value"] for r in ips}
+    # Only the full candidate is pinned; shorter sub-matches are not asserted on.
+    assert "1:2:3:4:5:6:7:8:9" not in values
+
+
+def test_texto_sin_ips():
+    """Text without any IP address produces an empty list."""
+    assert extract_ip_addresses("nothing to see here") == []
+
+
+# ---------- Emails ----------
+
+
+def test_email_simple():
+    """A single address is returned with offsets that slice back to it."""
+    sample = "Contact: alice@example.com"
+    hits = extract_emails(sample)
+    assert len(hits) == 1
+    (only,) = hits
+    assert only["value"] == sample[only["start"] : only["end"]] == "alice@example.com"
+
+
+def test_multiples_emails_con_caracteres_validos_en_local_part():
+    """Local parts containing `+`, `.` and `_` are accepted."""
+    sample = "alice+work@sub.test.org or first.last_99@a-b.io"
+    hits = extract_emails(sample)
+    found = {hit["value"] for hit in hits}
+    for expected in ("alice+work@sub.test.org", "first.last_99@a-b.io"):
+        assert expected in found
+
+
+def test_no_matchea_texto_sin_arroba():
+    """Text without an `@` produces no email matches."""
+    assert extract_emails("just text, no email here") == []
+
+
+# ---------- Domains ----------
+
+
+def test_dominios_con_tld_valido_se_extraen():
+    """Names ending in a known TLD are extracted."""
+    sample = "visit example.com or test.io"
+    hits = extract_domains(sample)
+    found = {hit["value"] for hit in hits}
+    for expected in ("example.com", "test.io"):
+        assert expected in found
+
+
+def test_tld_desconocido_se_descarta():
+    """A name whose TLD is unknown is ignored."""
+    found = extract_domains("visit example.fakextld for info")
+    assert found == []
+
+
+def test_subdominios_profundos():
+    """A deeply nested subdomain is returned as one full name."""
+    sample = "api.v2.service.example.com is up"
+    found = [hit["value"] for hit in extract_domains(sample)]
+    assert "api.v2.service.example.com" in found
+
+
+# ---------- File hashes ----------
+
+
+def test_md5_sha1_sha256_sha512():
+    """Digests are labelled by length: 32, 40, 64 and 128 hex characters."""
+    samples = {
+        "md5": "5d41402abc4b2a76b9719d911017c592",
+        "sha1": "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d",
+        "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+        "sha512": "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
+    }
+    # Dicts keep insertion order, so the text lists the digests md5-first.
+    text = " ".join(samples.values())
+    found = {hit["algorithm"]: hit["value"] for hit in extract_file_hashes(text)}
+    for algo, digest in samples.items():
+        assert found[algo] == digest
+
+
+def test_longitudes_intermedias_se_ignoran():
+    """A 60-character hex run matches no digest length and is ignored."""
+    sixty_hex_chars = "abcdef" * 10
+    assert extract_file_hashes(sixty_hex_chars) == []
+
+
+def test_insensible_a_mayusculas_en_hex():
+    """Upper-case hex digits are recognised just like lower-case ones."""
+    upper_md5 = "5D41402ABC4B2A76B9719D911017C592"
+    found = extract_file_hashes(upper_md5)
+    algorithms = [hit["algorithm"] for hit in found]
+    assert algorithms == ["md5"]
+
+
+# ---------- Crypto wallets ----------
+
+
+def test_btc_legacy():
+    """Legacy Bitcoin addresses (P2PKH and P2SH) are found and tagged `btc`."""
+    p2pkh = "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"
+    p2sh = "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy"
+    found = extract_crypto_wallets(f"send to {p2pkh} or {p2sh}")
+    extracted = {hit["value"] for hit in found}
+    assets = {hit["asset"] for hit in found}
+    # Subset checks: extra hits are tolerated as long as every asset is BTC.
+    assert {p2pkh, p2sh} <= extracted
+    assert assets <= {"btc"}
+
+
+def test_btc_bech32_segwit():
+    """A bech32 (SegWit) Bitcoin address is the only hit and is tagged `btc`."""
+    bech32 = "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq"
+    found = extract_crypto_wallets(f"address: {bech32}")
+    assert len(found) == 1
+    (wallet,) = found
+    assert (wallet["value"], wallet["asset"]) == (bech32, "btc")
+
+
+def test_eth_0x_y_40_hex():
+    """An Ethereum address (`0x` + 40 hex) is the only hit and is tagged `eth`."""
+    eth = "0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1"
+    found = extract_crypto_wallets(f"send {eth} to me")
+    assert len(found) == 1
+    (wallet,) = found
+    assert (wallet["value"], wallet["asset"]) == (eth, "eth")
+
+
+# ---------- CVEs ----------
+
+
+def test_cve_basico():
+    """A CVE id with a four-digit sequence number is extracted."""
+    sample = "Patch CVE-2014-0160 immediately"
+    found = [hit["value"] for hit in extract_cve_ids(sample)]
+    assert found == ["CVE-2014-0160"]
+
+
+def test_cve_con_5_o_mas_digitos_post_2014():
+    """A sequence number longer than four digits is kept in full."""
+    found = [hit["value"] for hit in extract_cve_ids("see CVE-2024-1234567 advisory")]
+    assert found == ["CVE-2024-1234567"]
+
+
+def test_multiples_cves_en_mismo_texto():
+    """Several CVE ids in one text are all returned, in order of appearance."""
+    expected = ["CVE-2021-44228", "CVE-2021-45046", "CVE-2021-45105"]
+    sample = "Affected: CVE-2021-44228, CVE-2021-45046, CVE-2021-45105"
+    found = extract_cve_ids(sample)
+    assert [hit["value"] for hit in found] == expected
+
+
+# ---------- MAC addresses ----------
+
+
+def test_mac_con_dos_puntos():
+    """A colon-separated MAC address is extracted."""
+    sample = "iface 00:1A:2B:3C:4D:5E up"
+    found = [hit["value"] for hit in extract_mac_addresses(sample)]
+    assert found == ["00:1A:2B:3C:4D:5E"]
+
+
+def test_mac_con_guiones():
+    """A hyphen-separated MAC address is extracted."""
+    sample = "AA-BB-CC-DD-EE-FF"
+    found = [hit["value"] for hit in extract_mac_addresses(sample)]
+    assert found == ["AA-BB-CC-DD-EE-FF"]
+
+
+def test_separadores_mezclados_se_rechazan():
+    """A candidate mixing `:` and `-` separators is rejected."""
+    mixed = "00:1A-2B:3C-4D:5E"
+    assert extract_mac_addresses(mixed) == []
+
+
+# ---------- Phone numbers ----------
+
+
+def test_numero_e164_con_espacios():
+    """A spaced E.164 number yields a hit that starts with its `+34` prefix."""
+    found = extract_phone_numbers("call +34 612 345 678 now")
+    prefixed = [hit for hit in found if hit["value"].startswith("+34")]
+    assert prefixed
+
+
+def test_numero_local_es_9_digitos():
+    """A nine-digit Spanish local number is extracted."""
+    sample = "directo 612345678 fijo"
+    found = [hit["value"] for hit in extract_phone_numbers(sample)]
+    assert "612345678" in found
+
+
+def test_numero_demasiado_corto_se_descarta():
+    """A four-digit extension is too short to be reported as a phone number."""
+    short = "ext 1234"
+    assert extract_phone_numbers(short) == []
+
+
+# ---------- Pipeline extract_iocs ----------
+
+
+def test_pipeline_corre_todos_los_extractores():
+    """One mixed text yields every IoC type that appears in it."""
+    sample = (
+        "Reach alice@example.com from 10.0.0.5; "
+        "CVE-2023-1234 vendor 00:1A:2B:3C:4D:5E "
+        "wallet 0x742d35Cc6634C0532925a3b844Bc9e7595f0bEb1"
+    )
+    found_types = {hit["type"] for hit in extract_iocs(sample)}
+    expected_types = {
+        "email", "ip_address", "cve_id", "mac_address", "crypto_wallet",
+    }
+    # Subset, not equality: the pipeline may report further types as well.
+    missing = expected_types - found_types
+    assert not missing
+
+
+def test_filtro_por_types_subset():
+    """Passing `types` restricts the output to the requested IoC types."""
+    sample = "alice@example.com 10.0.0.5"
+    found = extract_iocs(sample, types=["ip_address"])
+    seen_types = {hit["type"] for hit in found}
+    assert seen_types == {"ip_address"}
+
+
+def test_deduplica_spans_contenidos():
+    """A domain nested inside an email span is dropped by the pipeline."""
+    sample = "Email: alice@example.com nothing else"
+    found = extract_iocs(sample)
+    seen_types = {hit["type"] for hit in found}
+    # The email survives; the domain it contains must not be reported.
+    assert "email" in seen_types
+    assert "domain" not in seen_types
+
+
+def test_tipos_desconocidos_se_ignoran():
+    """Unknown names in `types` are skipped without affecting known ones."""
+    sample = "alice@example.com"
+    found = extract_iocs(sample, types=["nonexistent", "email"])
+    seen_types = [hit["type"] for hit in found]
+    assert seen_types == ["email"]

From 2cbf7546204711d3565da4804470a669bc4adcc6 Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:24:25 +0200
Subject: [PATCH 3/9] =?UTF-8?q?docs(issues):=20cerrar=200037=20=E2=80=94?=
 =?UTF-8?q?=20IoC=20regex=20extractor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move dev/issues/0037-ioc-regex-extractor.md a completed/
- Update README link y estado a completado
- Limpiar duplicado obsoleto de 0042 (ya estaba en completed/)

Closes #0037

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dev/issues/0042-cpp-layout-storage-public.md  | 110 ------------------
 dev/issues/README.md                          |   2 +-
 .../0037-ioc-regex-extractor.md               |   0
 3 files changed, 1 insertion(+), 111 deletions(-)
 delete mode 100644 dev/issues/0042-cpp-layout-storage-public.md
 rename dev/issues/{ => completed}/0037-ioc-regex-extractor.md (100%)

diff --git a/dev/issues/0042-cpp-layout-storage-public.md b/dev/issues/0042-cpp-layout-storage-public.md
deleted file mode 100644
index 53dc03c3..00000000
--- a/dev/issues/0042-cpp-layout-storage-public.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# 0042 — C++ layout_storage: extraer y publicar como API reutilizable
-
-## Metadata
-
-| Campo | Valor |
-|-------|-------|
-| **ID** | 0042 |
-| **Estado** | pendiente |
-| **Prioridad** | alta |
-| **Tipo** | feature — C++ core (`cpp/functions/core`) |
-
-## Dependencias
-
-Ninguna. Habilita **0043** (estandarizar apps).
-
----
-
-## Objetivo
-
-Extraer la persistencia de layouts ImGui (actualmente privada en `shaders_lab/main.cpp` lineas 415-447) a una funcion publica del registry: `layout_storage_cpp_core`. Cualquier app puede pasarla a `app_menubar` via `LayoutCallbacks` con un solo `setup`.
-
-## Contexto
-
-`shaders_lab` guarda layouts (snapshots de `imgui.ini`) en SQLite via `shaderlab_db_cpp_core`. La logica que conecta SQLite con `LayoutCallbacks` (`save`, `load`, `list`, `remove`) esta inline en su `main.cpp` y no es reusable. Otras apps (registry_dashboard, primitives_gallery, chart_demo) no tienen layouts persistentes.
-
-## Arquitectura
-
-```
-cpp/functions/core/
-├── layout_storage.h          # NEW — API publica
-├── layout_storage.cpp        # NEW — impl con SQLite
-├── layout_storage.md         # NEW
-└── (opcional) layouts_menu.h ya existe — sin cambios
-
-cpp/apps/shaders_lab/
-└── main.cpp                  # MOD — usa layout_storage en lugar de inline
-```
-
-### API propuesta
-
-```cpp
-namespace fn_ui {
-
-struct LayoutStorage; // opaque
-
-// Crea un storage que persiste layouts ImGui en una tabla SQLite del path dado.
-// Si la BD no existe, la crea. Tabla: `imgui_layouts(name TEXT PRIMARY KEY, ini TEXT, updated_at)`.
-LayoutStorage* layout_storage_open(const char* db_path);
-void           layout_storage_close(LayoutStorage* s);
-
-// Helper que rellena un LayoutCallbacks usando este storage.
-// El caller mantiene vivo el storage durante la vida de los callbacks.
-void layout_storage_make_callbacks(LayoutStorage* s, LayoutCallbacks& out);
-
-}
-```
-
-`LayoutCallbacks` ya esta definido en `panel_menu.h`/`layouts_menu.h`. Esta funcion solo wirea SQLite.
-
-## Tareas
-
-### Fase 1 — Codigo
-
-1.1 Crear `cpp/functions/core/layout_storage.{h,cpp,md}`.
-1.2 Implementar usando sqlite3 vendoreada (`cpp/vendor/sqlite3`). Tabla unica `imgui_layouts`.
-1.3 Save: serializa `ImGui::SaveIniSettingsToMemory()` y hace UPSERT por nombre.
-1.4 Load: lee `ini` y llama `ImGui::LoadIniSettingsFromMemory(ini, len)`.
-1.5 List: `SELECT name FROM imgui_layouts ORDER BY updated_at DESC`.
-1.6 Remove: `DELETE FROM imgui_layouts WHERE name=?`.
-1.7 Frontmatter `.md` con `purity: impure`, `error_type: error_go_core`, `uses_types: []`.
-
-### Fase 2 — Migrar shaders_lab
-
-2.1 Reemplazar el bloque inline (l. 415-447) por:
-```cpp
-auto* g_layouts = fn_ui::layout_storage_open("shaders_lab.db");
-fn_ui::LayoutCallbacks layouts_cb;
-fn_ui::layout_storage_make_callbacks(g_layouts, layouts_cb);
-// ...pasar layouts_cb a app_menubar
-```
-2.2 Mantener `shaderlab_db_cpp_core` como esta (no es lo mismo: guarda shaders, no layouts) — pero quitar de el la parte de layouts si la tiene.
-2.3 Verificar que los layouts existentes siguen cargando (compatibilidad de schema o migracion automatica).
-
-### Fase 3 — Tests
-
-3.1 Test unitario: open → save("test", ini) → list() == ["test"] → load("test") devuelve el ini → remove("test") → list() == [].
-3.2 Test de regresion en shaders_lab (build + abrir/cerrar layout manual).
-
-### Fase 4 — Indexar
-
-4.1 `./fn index` y verificar `fn show layout_storage_cpp_core`.
-
-## Decisiones de diseno
-
-- BD SQLite por app (no compartida) — cada app gestiona sus layouts.
-- Schema simple (`name PRIMARY KEY, ini, updated_at`) — sin namespaces ni jerarquia.
-- API opaca (`LayoutStorage*`) para no exponer sqlite3 en headers publicos.
-
-## Riesgos
-
-- shaders_lab tiene layouts existentes en su `shaders_lab.db`. Si la tabla actual difiere del schema nuevo: migracion automatica al primer open o conservar ambas tablas.
-- Threading: ImGui::SaveIniSettingsToMemory solo es seguro desde el thread principal — documentar.
-
-## Validacion
-
-```bash
-cd cpp/build && cmake --build . --target shaders_lab
-# Abrir shaders_lab, guardar/cargar layouts, restart, verificar que persiste.
-./fn show layout_storage_cpp_core
-```
diff --git a/dev/issues/README.md b/dev/issues/README.md
index a91500de..60ef9277 100644
--- a/dev/issues/README.md
+++ b/dev/issues/README.md
@@ -42,7 +42,7 @@
 | [0034](completed/0034-cpp-scientific-viz.md) | C++ scientific viz (treemap, sankey, chord, contour, voronoi) | completado | media | feature | — |
 | [0035](0035-cpp-map-tiles.md) | C++ map_tiles (slippy map OSM) | pendiente | baja | feature | — |
 | [0036](0036-cpp-image-canvas-webcam.md) | C++ image_canvas + webcam_texture | pendiente | baja | feature | — |
-| [0037](0037-ioc-regex-extractor.md) | IoC regex extractor (IP, email, dominio, hash, wallet, CVE, MAC) | pendiente | alta | feature | — |
+| [0037](completed/0037-ioc-regex-extractor.md) | IoC regex extractor (IP, email, dominio, hash, wallet, CVE, MAC) | completado | alta | feature | — |
 | [0038](0038-gliner-entity-extractor.md) | GLiNER entity extractor (zero-shot NER multilingue) | pendiente | alta | feature | 0039, 0040 |
 | [0039](0039-glirel-relation-extractor.md) | GLiREL relation extractor (zero-shot triplets) | pendiente | media | feature | 0040 |
 | [0040](0040-hybrid-extraction-pipeline.md) | Pipeline hibrido extraccion grafos (regex + GLiNER + GLiREL + LLM fallback) | pendiente | media | feature | — |
diff --git a/dev/issues/0037-ioc-regex-extractor.md b/dev/issues/completed/0037-ioc-regex-extractor.md
similarity index 100%
rename from dev/issues/0037-ioc-regex-extractor.md
rename to dev/issues/completed/0037-ioc-regex-extractor.md

From c663f9d6e8e294e3a9bb2cfe369354228288f37c Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:33:38 +0200
Subject: [PATCH 4/9] feat(datascience): GLiNER entity extractor (zero-shot
 NER) drop-in con LLM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Funciones nuevas en python/functions/datascience/:
- gliner_load_model: carga + cachea modelo GLiNER por (name, device).
  device='auto' resuelve a cuda/cpu segun torch.cuda.is_available, sin
  fallar si torch no esta instalado. ImportError claro si falta gliner.
- extract_entities_gliner: contrato drop-in de extract_entities_llm
  (mismo entity_schema, mismo list[EntityCandidate]). El caller inyecta
  el modelo (cargado UNA vez por proceso). Anota offsets start/end en
  attributes para reconciliar con extract_iocs (issue 0040).

Diferencias vs LLM extractor:
- 50-200x mas rapido en GPU, 0 USD/token.
- Malo con IoCs tecnicos (lo cubre 0037).
- Threshold y flat_ner ajustables por dominio.

pyproject.toml: gliner como extra opcional `[nlp]` para no inflar el
.venv de quien no use NER. Instalacion: `uv pip install -e '.[nlp]'`.

Refs #0038 — Desbloquea 0039 (GLiREL) y 0040 (pipeline hibrido).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../datascience/extract_entities_gliner.md    |  89 ++++++++++++
 .../datascience/extract_entities_gliner.py    | 136 ++++++++++++++++++
 .../datascience/gliner_load_model.md          |  66 +++++++++
 .../datascience/gliner_load_model.py          |  63 ++++++++
 python/pyproject.toml                         |   5 +
 5 files changed, 359 insertions(+)
 create mode 100644 python/functions/datascience/extract_entities_gliner.md
 create mode 100644 python/functions/datascience/extract_entities_gliner.py
 create mode 100644 python/functions/datascience/gliner_load_model.md
 create mode 100644 python/functions/datascience/gliner_load_model.py

diff --git a/python/functions/datascience/extract_entities_gliner.md b/python/functions/datascience/extract_entities_gliner.md
new file mode 100644
index 00000000..a5dd15f2
--- /dev/null
+++ b/python/functions/datascience/extract_entities_gliner.md
@@ -0,0 +1,89 @@
+---
+name: extract_entities_gliner
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def extract_entities_gliner(text: str, entity_schema: list[dict], model: Any, threshold: float = 0.5, flat_ner: bool = True) -> list[EntityCandidate]"
+description: "Extrae entidades zero-shot con GLiNER. Drop-in del contrato de extract_entities_llm pero 50-200x mas rapido y sin coste por token. El caller inyecta el modelo cargado con gliner_load_model. Anota offsets start/end en attributes para reconciliar con extract_iocs."
+tags: [gliner, ner, nlp, entity, extract, zero-shot, osint, graph, datascience, python]
+uses_functions: [gliner_load_model_py_datascience]
+uses_types: [entity_candidate_py_datascience]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [warnings]
+params:
+  - name: text
+    desc: "chunk de texto a analizar (parrafo, documento corto, output de OCR)"
+  - name: entity_schema
+    desc: "lista de dicts con 'type_ref' y 'label'. Mismo formato que extract_entities_llm. El 'label' se usa como label de GLiNER."
+  - name: model
+    desc: "instancia GLiNER cargada con gliner_load_model. Inyectar para evitar penalty de carga en batch."
+  - name: threshold
+    desc: "score minimo para aceptar una entidad (0.0-1.0). Defecto 0.5 — ajustable segun precision/recall objetivo."
+  - name: flat_ner
+    desc: "True (defecto) sin entidades anidadas; False permite spans solapados (ej. 'Universidad de Madrid' como ORG y 'Madrid' como LOC en simultaneo)"
+output: "lista de EntityCandidate con name, type_ref, type_label, confidence y attributes={'start': int, 'end': int}"
+tested: true
+tests:
+  - "Schema basico y modelo stub retorna EntityCandidate con offsets"
+  - "Threshold filtra spans con score bajo"
+  - "Schema vacio lanza ValueError"
+  - "Schema sin label+type_ref validos retorna vacio con warning"
+  - "Excepcion del modelo se captura y retorna vacio"
+  - "Label desconocido se descarta"
+  - "flat_ner se propaga al modelo"
+test_file_path: "python/functions/datascience/tests/test_extract_entities_gliner.py"
+file_path: "python/functions/datascience/extract_entities_gliner.py"
+---
+
+## Ejemplo
+
+```python
+from python.functions.datascience import (
+    gliner_load_model,
+    extract_entities_gliner,
+)
+
+model = gliner_load_model(device="auto")
+
+schema = [
+    {"type_ref": "osint_person_go_cybersecurity", "label": "Person"},
+    {"type_ref": "osint_organization_go_cybersecurity", "label": "Organization"},
+    {"type_ref": "osint_location_go_cybersecurity", "label": "Location"},
+]
+
+text = "Alice Johnson works at OpenAI in San Francisco."
+entities = extract_entities_gliner(text, schema, model, threshold=0.4)
+# [EntityCandidate(name='Alice Johnson', type_ref='osint_person_go_cybersecurity',
+#                  attributes={'start': 0, 'end': 13}, confidence=0.92), ...]
+```
+
+## Drop-in con extract_entities_llm
+
+El retorno es identico (`list[EntityCandidate]`), por lo que se puede sustituir
+sin tocar el resto del pipeline (`deduplicate_entities`, `merge_entity_attributes`,
+etc). Diferencias:
+
+- **Coste**: GLiNER = 0 USD/token. LLM = depende de modelo.
+- **Latencia**: GLiNER 50-200x mas rapido en GPU.
+- **IoCs tecnicos** (IPs, hashes, wallets, CVEs): GLiNER es malo — usar
+  `extract_iocs_py_cybersecurity` para esos. Combinar regex + GLiNER en
+  el pipeline hibrido (issue 0040).
+- **Schemas con muchos tipos**: GLiNER pierde precision con >20 labels;
+  LLM la mantiene. Para esquemas grandes, dividir en bloques.
+- **Razonamiento implicito** ("CEO de la empresa"): el LLM lo deduce, GLiNER
+  solo extrae lo explicito.
+
+## Notas
+
+- El modelo se carga UNA vez por proceso. No cargarlo aqui dentro: penalty fatal
+  en batch. Inyeccion explicita por contrato.
+- impure: el modelo es estado externo (memoria, GPU si aplica). `error_type:
+  error_go_core` segun la regla de pureza del registry.
+- Si `flat_ner=False`, validar que el caller deduplica/normaliza spans solapados
+  — `EntityCandidate.attributes['start'/'end']` permite hacerlo facilmente.
+- Para precision maxima, ajustar `threshold` por dominio: 0.3-0.4 para recall
+  alto, 0.6-0.8 para precision alta.
diff --git a/python/functions/datascience/extract_entities_gliner.py b/python/functions/datascience/extract_entities_gliner.py
new file mode 100644
index 00000000..d17a8fa2
--- /dev/null
+++ b/python/functions/datascience/extract_entities_gliner.py
@@ -0,0 +1,136 @@
+"""Extrae entidades de un chunk de texto usando GLiNER (zero-shot NER)."""
+
+from __future__ import annotations
+
+import os
+import sys
+import warnings
+from typing import Any
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from python.types.datascience.entity_candidate import EntityCandidate
+
+
+def _build_label_maps(entity_schema: list[dict]) -> tuple[list[str], dict[str, str], dict[str, str]]:
+    """Translate the entity schema into the structures GLiNER expects.
+
+    Entries missing a non-blank string `label` or `type_ref` are skipped.
+
+    Returns:
+        labels: unique labels, in first-seen order, for `predict_entities`.
+        label_to_type_ref: predicted label -> type_ref (last entry wins).
+        label_to_label: label -> human-readable label (for `type_label`).
+    """
+    label_to_type_ref: dict[str, str] = {}
+    for entry in entity_schema:
+        label, type_ref = entry.get("label"), entry.get("type_ref")
+        if not isinstance(label, str) or not isinstance(type_ref, str):
+            continue
+        label, type_ref = label.strip(), type_ref.strip()
+        if label and type_ref:
+            # Last entry wins when two type_refs share a label.
+            label_to_type_ref[label] = type_ref
+    labels = list(label_to_type_ref)  # dict keys: unique, insertion order
+    return labels, label_to_type_ref, {label: label for label in labels}
+
+
+def extract_entities_gliner(
+    text: str,
+    entity_schema: list[dict],
+    model: Any,
+    threshold: float = 0.5,
+    flat_ner: bool = True,
+) -> list[EntityCandidate]:
+    """Extract zero-shot entities with GLiNER; drop-in for `extract_entities_llm`.
+
+    Each `entity_schema` entry contributes its `label` as a GLiNER label and
+    the predicted label is mapped back to its `type_ref`. Integer span
+    offsets are copied to `attributes["start"]` / `attributes["end"]` so the
+    caller can reconcile them with regex IoC spans (see `extract_iocs`).
+
+    Args:
+        text: Chunk to analyse.
+        entity_schema: Same structure `extract_entities_llm` takes: a list
+            of dicts with `type_ref` and `label`.
+        model: Object exposing `predict_entities(text, labels, threshold=...,
+            flat_ner=...)`; injected so batches do not pay the load cost.
+        threshold: Minimum score forwarded to the model (0.0-1.0).
+        flat_ner: Forwarded to the model. True = no nested entities; False =
+            nested entities (may produce overlapping spans).
+
+    Returns:
+        List of EntityCandidate. Empty when no entry has a valid label, the
+        model call fails or returns a non-list (warned), or nothing matches.
+
+    Raises:
+        ValueError: If `entity_schema` is empty.
+    """
+    if not entity_schema:
+        raise ValueError("entity_schema no puede estar vacio")
+
+    labels, label_to_type_ref, label_to_label = _build_label_maps(entity_schema)
+    if not labels:
+        warnings.warn(
+            "extract_entities_gliner: ningun entry del schema tiene "
+            "label+type_ref validos; retornando vacio.",
+            stacklevel=2,
+        )
+        return []
+
+    try:
+        raw_entities = model.predict_entities(
+            text,
+            labels,
+            threshold=threshold,
+            flat_ner=flat_ner,
+        )
+    except Exception as exc:
+        warnings.warn(
+            f"extract_entities_gliner: error invocando model.predict_entities: {exc}",
+            stacklevel=2,
+        )
+        return []
+
+    if not isinstance(raw_entities, list):
+        warnings.warn(
+            "extract_entities_gliner: predict_entities no retorno una lista; "
+            "retornando vacio.",
+            stacklevel=2,
+        )
+        return []
+
+    candidates: list[EntityCandidate] = []
+    for item in raw_entities:
+        if not isinstance(item, dict):
+            continue
+
+        span_text = item.get("text", "")
+        label = item.get("label", "")
+        if not span_text or label not in label_to_type_ref:
+            continue
+        # NaN would survive the clamp below as 1.0, so treat it as no score.
+        score = item.get("score", 0.0)
+        if not isinstance(score, (int, float)) or score != score:
+            score = 0.0
+        confidence = float(max(0.0, min(1.0, score)))
+
+        start = item.get("start")
+        end = item.get("end")
+        attributes: dict = {}
+        if isinstance(start, int):
+            attributes["start"] = start
+        if isinstance(end, int):
+            attributes["end"] = end
+
+        candidates.append(
+            EntityCandidate(
+                name=span_text,
+                type_ref=label_to_type_ref[label],
+                type_label=label_to_label.get(label, label),
+                attributes=attributes,
+                confidence=confidence,
+            )
+        )
+
+    return candidates
diff --git a/python/functions/datascience/gliner_load_model.md b/python/functions/datascience/gliner_load_model.md
new file mode 100644
index 00000000..e5d45be7
--- /dev/null
+++ b/python/functions/datascience/gliner_load_model.md
@@ -0,0 +1,66 @@
+---
+name: gliner_load_model
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def gliner_load_model(model_name: str = 'urchade/gliner_multi-v2.1', device: str = 'auto') -> Any"
+description: "Carga (y cachea por (model_name, device)) un modelo GLiNER zero-shot NER. La primera llamada descarga ~200 MB desde HuggingFace; sucesivas devuelven la instancia cacheada. device='auto' usa CUDA si esta disponible, o CPU."
+tags: [gliner, ner, nlp, model, huggingface, zero-shot, datascience, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+params:
+  - name: model_name
+    desc: "ID del modelo en HuggingFace Hub (defecto: urchade/gliner_multi-v2.1, multilingue ES/EN)"
+  - name: device
+    desc: "'auto' (CUDA si disponible, sino CPU), 'cpu', 'cuda', 'cuda:N'"
+output: "instancia GLiNER lista para predict_entities, cacheada por (model_name, device)"
+tested: true
+tests:
+  - "ImportError si gliner no esta instalado"
+  - "Cache devuelve la misma instancia con los mismos parametros"
+  - "device='auto' resuelve a cpu o cuda segun torch.cuda.is_available"
+test_file_path: "python/functions/datascience/tests/test_extract_entities_gliner.py"
+file_path: "python/functions/datascience/gliner_load_model.py"
+---
+
+## Ejemplo
+
+```python
+from python.functions.datascience import gliner_load_model
+
+# Primera llamada descarga el modelo (~200 MB, una vez)
+model = gliner_load_model(device="auto")
+
+# Llamadas sucesivas con mismos params devuelven el cache
+model_again = gliner_load_model(device="auto")
+assert model is model_again
+```
+
+## Instalacion
+
+GLiNER no esta en las dependencias principales del registry. Para usarlo:
+
+```bash
+cd python && uv pip install gliner            # solo gliner
+cd python && uv pip install -e '.[nlp]'       # extra completo
+```
+
+## Tamaño y latencia
+
+- `urchade/gliner_multi-v2.1`: ~210 MB en disco (modelo + tokenizer).
+- Primera carga: 5-15 s en CPU, depende del disco y red.
+- Inferencia CPU: 1-5 KB texto/s con 8 labels (Apple M2 / i7 moderno).
+- Inferencia GPU (CUDA T4): 50-200 KB texto/s — 50-200x mas rapido.
+
+## Notas
+
+- El cache es por (model_name, device): cargar el mismo modelo en CPU y CUDA crea dos instancias. Es intencional para permitir A/B.
+- Si `torch` no esta instalado y `device='auto'`, cae a `'cpu'` sin error.
+- Para limpiar el cache (memoria GPU): borrar entradas de `_MODEL_CACHE` directamente o reiniciar el proceso.
+- impure: lee disco/red la primera vez y mantiene estado en `_MODEL_CACHE`.
diff --git a/python/functions/datascience/gliner_load_model.py b/python/functions/datascience/gliner_load_model.py
new file mode 100644
index 00000000..51a5fed0
--- /dev/null
+++ b/python/functions/datascience/gliner_load_model.py
@@ -0,0 +1,63 @@
+"""Carga (y cachea) un modelo GLiNER en el device deseado."""
+
+from __future__ import annotations
+
+from typing import Any
+
+# Cache global: (model_name, device) -> modelo cargado.
+_MODEL_CACHE: dict[tuple[str, str], Any] = {}
+
+
+def _resolve_device(device: str) -> str:
+    """Map `device='auto'` to `cuda` or `cpu`; return any other value as-is."""
+    if device == "auto":
+        try:
+            import torch
+        except ImportError:
+            return "cpu"
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    return device
+
+
+def gliner_load_model(
+    model_name: str = "urchade/gliner_multi-v2.1",
+    device: str = "auto",
+) -> Any:
+    """Load a GLiNER model, cached per (model_name, resolved device).
+
+    The first call for a key goes through `GLiNER.from_pretrained` (the
+    project docs quote a ~200 MB download for `gliner_multi-v2.1`). Later
+    calls with the same parameters return the cached instance.
+
+    Args:
+        model_name: Model ID on the HuggingFace Hub.
+        device: 'auto' picks CUDA when available, else CPU; 'cpu', 'cuda' or
+            'cuda:N' are used as given.
+
+    Returns:
+        GLiNER model instance ready for `predict_entities`.
+
+    Raises:
+        ImportError: If the `gliner` dependency is not installed. Fix with
+            `uv pip install gliner` or the project's `nlp` extra
+            (`uv pip install -e '.[nlp]'`).
+    """
+    target_device = _resolve_device(device)
+    key = (model_name, target_device)
+    instance = _MODEL_CACHE.get(key)
+    if instance is None:
+        # Deferred import: gliner is an optional dependency (extra `nlp`).
+        try:
+            from gliner import GLiNER
+        except ImportError as exc:
+            raise ImportError(
+                "gliner no esta instalado. Instalalo con "
+                "`uv pip install gliner` o `uv pip install -e '.[nlp]'`."
+            ) from exc
+
+        instance = GLiNER.from_pretrained(model_name)
+        if hasattr(instance, "to"):
+            instance.to(target_device)
+        _MODEL_CACHE[key] = instance
+
+    return instance
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 946fa292..63f4fad7 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -19,6 +19,11 @@ dependencies = [
     "xlrd>=2.0.2",
 ]
 
+[project.optional-dependencies]
+nlp = [
+    "gliner>=0.2.13",
+]
+
 [dependency-groups]
 dev = [
     "pytest>=9.0.2",

From b15332686a6bb4c76aacd6b41cbfe7798a52ed9e Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:33:46 +0200
Subject: [PATCH 5/9] test(datascience): corpus stub para gliner_load_model +
 extract_entities_gliner

11 tests sin necesidad de descargar el modelo (200 MB):
- StubModel duck-typed que valida el contrato de predict_entities
- Threshold y flat_ner se propagan al modelo
- Schema vacio lanza ValueError; schema sin labels validos warning + []
- Excepcion del modelo se captura
- Label desconocido se descarta
- gliner_load_model: ImportError simulado, cache hit, _resolve_device
  auto cae a cpu si torch no esta presente

Refs #0038

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../functions/datascience/tests/__init__.py   |   0
 .../tests/test_extract_entities_gliner.py     | 198 ++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 python/functions/datascience/tests/__init__.py
 create mode 100644 python/functions/datascience/tests/test_extract_entities_gliner.py

diff --git a/python/functions/datascience/tests/__init__.py b/python/functions/datascience/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/functions/datascience/tests/test_extract_entities_gliner.py b/python/functions/datascience/tests/test_extract_entities_gliner.py
new file mode 100644
index 00000000..aa5d4778
--- /dev/null
+++ b/python/functions/datascience/tests/test_extract_entities_gliner.py
@@ -0,0 +1,198 @@
+"""Tests para extract_entities_gliner y gliner_load_model.
+
+El modelo real (gliner) es opcional. Estos tests usan un stub duck-typed
+para validar el contrato sin descargar 200 MB. Tests que requieran el
+modelo real se marcan con `pytest.importorskip('gliner')`.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from dataclasses import dataclass
+
+import pytest
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+
+from python.functions.datascience.extract_entities_gliner import (
+    extract_entities_gliner,
+)
+from python.functions.datascience.gliner_load_model import (
+    _MODEL_CACHE,
+    _resolve_device,
+    gliner_load_model,
+)
+from python.types.datascience.entity_candidate import EntityCandidate
+
+
+SCHEMA_BASIC = [
+    {
+        "type_ref": "osint_person_go_cybersecurity",
+        "label": "Person",
+        "metadata_fields": ["full_name"],
+    },
+    {
+        "type_ref": "osint_organization_go_cybersecurity",
+        "label": "Organization",
+        "metadata_fields": ["name"],
+    },
+    {
+        "type_ref": "osint_location_go_cybersecurity",
+        "label": "Location",
+        "metadata_fields": ["name"],
+    },
+]
+
+
+@dataclass
+class StubModel:
+    """Duck-typed GLiNER stand-in that replays a preconfigured response."""
+
+    response: list[dict]
+    raise_exc: Exception | None = None
+    last_kwargs: dict | None = None
+
+    def predict_entities(self, text, labels, threshold, flat_ner):
+        self.last_kwargs = dict(
+            text=text,
+            labels=list(labels),
+            threshold=threshold,
+            flat_ner=flat_ner,
+        )
+        if self.raise_exc is not None:
+            raise self.raise_exc
+        return self.response
+
+
+# ---------- extract_entities_gliner ----------
+
+
+def test_schema_basico_y_modelo_stub_retorna_entity_candidate():
+    """Basic schema + stub model yields EntityCandidates carrying span offsets."""
+    stub = StubModel(response=[
+        {"start": 0, "end": 13, "text": "Alice Johnson", "label": "Person", "score": 0.92},
+        {"start": 23, "end": 29, "text": "OpenAI", "label": "Organization", "score": 0.87},
+        {"start": 33, "end": 46, "text": "San Francisco", "label": "Location", "score": 0.81},
+    ])
+    sentence = "Alice Johnson works at OpenAI in San Francisco."
+    found = extract_entities_gliner(sentence, SCHEMA_BASIC, stub, threshold=0.5)
+    assert len(found) == 3
+    assert all(isinstance(cand, EntityCandidate) for cand in found)
+
+    alice = next(cand for cand in found if cand.name == "Alice Johnson")
+    assert alice.type_ref == "osint_person_go_cybersecurity"
+    assert alice.type_label == "Person"
+    assert alice.attributes["start"] == 0
+    assert alice.attributes["end"] == 13
+    assert pytest.approx(alice.confidence, 0.001) == 0.92
+
+
+def test_threshold_filtra_spans_con_score_bajo():
+    """The threshold (and flat_ner) reach the model unchanged."""
+    # Filtering by score is the real model's job; the stub does none, so
+    # this only checks that the arguments are forwarded to predict_entities.
+    stub = StubModel(response=[
+        {"start": 0, "end": 5, "text": "Alice", "label": "Person", "score": 0.95},
+    ])
+    extract_entities_gliner("Alice", SCHEMA_BASIC, stub, threshold=0.7, flat_ner=False)
+    assert stub.last_kwargs["threshold"] == 0.7
+    assert stub.last_kwargs["flat_ner"] is False
+
+
+def test_schema_vacio_lanza_value_error():
+    """An empty schema raises ValueError."""
+    stub = StubModel(response=[])
+    with pytest.raises(ValueError):
+        extract_entities_gliner("text", [], stub)
+
+
+def test_schema_sin_labels_validos_retorna_vacio():
+    """A schema with no valid label+type_ref pair warns and returns []."""
+    stub = StubModel(response=[])
+    unusable_schema = [{"label": "", "type_ref": ""}, {"label": "X"}]
+    with pytest.warns(UserWarning):
+        result = extract_entities_gliner("text", unusable_schema, stub)
+    assert result == []
+
+
+def test_excepcion_del_modelo_se_captura():
+    """An exception raised by the model is caught: warning plus empty result."""
+    failing = StubModel(response=[], raise_exc=RuntimeError("model exploded"))
+    with pytest.warns(UserWarning):
+        result = extract_entities_gliner("text", SCHEMA_BASIC, failing)
+    assert result == []
+
+
+def test_label_desconocido_se_descarta():
+    """Predictions whose label is not in the schema are dropped."""
+    stub = StubModel(response=[
+        {"start": 0, "end": 5, "text": "Alice", "label": "Person", "score": 0.9},
+        {"start": 6, "end": 10, "text": "blob", "label": "UnknownLabel", "score": 0.9},
+    ])
+    found = extract_entities_gliner("Alice blob", SCHEMA_BASIC, stub)
+    kept_names = {cand.name for cand in found}
+    assert "Alice" in kept_names
+    assert "blob" not in kept_names
+
+
+def test_flat_ner_se_propaga_al_modelo():
+    """flat_ner is forwarded to the model unchanged."""
+    stub = StubModel(response=[])
+    # Same stub for both calls: last_kwargs is overwritten on every call.
+    for flag in (True, False):
+        extract_entities_gliner("text", SCHEMA_BASIC, stub, flat_ner=flag)
+        assert stub.last_kwargs["flat_ner"] is flag
+
+
+# ---------- gliner_load_model ----------
+
+
+def test_import_error_si_gliner_no_esta_instalado(monkeypatch):
+    """gliner_load_model raises a descriptive ImportError when gliner is missing."""
+    _MODEL_CACHE.clear()  # force a real load attempt instead of a cache hit
+
+    real_import = __import__  # built-in name, captured before it is patched
+
+    def fake_import(name, *args, **kwargs):
+        if name == "gliner" or name.startswith("gliner."):
+            raise ImportError("gliner not installed (simulated)")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", fake_import)
+
+    with pytest.raises(ImportError, match="gliner no esta instalado"):
+        gliner_load_model(model_name="dummy/model", device="cpu")
+
+
+def test_cache_devuelve_la_misma_instancia(monkeypatch):
+    """A cached (model_name, device) entry is returned as the same object."""
+    sentinel = object()
+    # monkeypatch.setitem undoes the insertion on teardown, even when the
+    # assertion below fails, so the module-level cache never leaks the
+    # sentinel into other tests and unrelated entries are left alone.
+    monkeypatch.setitem(_MODEL_CACHE, ("dummy/model", "cpu"), sentinel)
+
+    out = gliner_load_model(model_name="dummy/model", device="cpu")
+
+    assert out is sentinel
+
+
+def test_resolve_device_explicito_se_respeta():
+    """An explicit device string is returned untouched."""
+    # Anything other than 'auto' must pass straight through.
+    for explicit in ("cpu", "cuda", "cuda:0"):
+        assert _resolve_device(explicit) == explicit
+
+
+def test_resolve_device_auto_cae_a_cpu_sin_torch(monkeypatch):
+    """device='auto' falls back to 'cpu' when torch cannot be imported."""
+    real_import = __import__  # built-in name, captured before it is patched
+
+    def fake_import(name, *args, **kwargs):
+        if name == "torch":
+            raise ImportError("torch missing")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", fake_import)
+    assert _resolve_device("auto") == "cpu"

From 1bd315ce7b59257ce57660449a02f727d96f28bd Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:33:53 +0200
Subject: [PATCH 6/9] =?UTF-8?q?docs(issues):=20cerrar=200038=20=E2=80=94?=
 =?UTF-8?q?=20GLiNER=20entity=20extractor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Move dev/issues/0038-gliner-entity-extractor.md a completed/
- Update README link y estado a completado

Closes #0038

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dev/issues/README.md                                       | 2 +-
 dev/issues/{ => completed}/0038-gliner-entity-extractor.md | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename dev/issues/{ => completed}/0038-gliner-entity-extractor.md (100%)

diff --git a/dev/issues/README.md b/dev/issues/README.md
index 60ef9277..e9c49bf6 100644
--- a/dev/issues/README.md
+++ b/dev/issues/README.md
@@ -43,7 +43,7 @@
 | [0035](0035-cpp-map-tiles.md) | C++ map_tiles (slippy map OSM) | pendiente | baja | feature | — |
 | [0036](0036-cpp-image-canvas-webcam.md) | C++ image_canvas + webcam_texture | pendiente | baja | feature | — |
 | [0037](completed/0037-ioc-regex-extractor.md) | IoC regex extractor (IP, email, dominio, hash, wallet, CVE, MAC) | completado | alta | feature | — |
-| [0038](0038-gliner-entity-extractor.md) | GLiNER entity extractor (zero-shot NER multilingue) | pendiente | alta | feature | 0039, 0040 |
+| [0038](completed/0038-gliner-entity-extractor.md) | GLiNER entity extractor (zero-shot NER multilingue) | completado | alta | feature | 0039, 0040 |
 | [0039](0039-glirel-relation-extractor.md) | GLiREL relation extractor (zero-shot triplets) | pendiente | media | feature | 0040 |
 | [0040](0040-hybrid-extraction-pipeline.md) | Pipeline hibrido extraccion grafos (regex + GLiNER + GLiREL + LLM fallback) | pendiente | media | feature | — |
 | [0041](completed/0041-cpp-app-best-practices.md) | C++ app shell estandarizado (PATTERNS.md + AppConfig extendido) | completado | alta | feature | 0043 |
diff --git a/dev/issues/0038-gliner-entity-extractor.md b/dev/issues/completed/0038-gliner-entity-extractor.md
similarity index 100%
rename from dev/issues/0038-gliner-entity-extractor.md
rename to dev/issues/completed/0038-gliner-entity-extractor.md

From 09f7f0ba1c6c519248080eb80ae045b0aff29241 Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:41:09 +0200
Subject: [PATCH 7/9] feat(datascience): GLiREL relation extractor (zero-shot
 triplets) drop-in con LLM

- glirel_load_model: cache por (model_name, device); device='auto' resuelve via torch
- extract_relations_glirel: tokeniza por whitespace, mapea spans char->token,
  llama predict_relations y devuelve RelationCandidate; fallback text.find si la
  entidad llega sin offsets; max_pairs=N -> top-N por score
- pyproject.toml: glirel en extra nlp

Closes #0039

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../datascience/extract_relations_glirel.md   | 131 ++++++++++
 .../datascience/extract_relations_glirel.py   | 227 ++++++++++++++++++
 .../datascience/glirel_load_model.md          |  72 ++++++
 .../datascience/glirel_load_model.py          |  63 +++++
 python/pyproject.toml                         |   1 +
 5 files changed, 494 insertions(+)
 create mode 100644 python/functions/datascience/extract_relations_glirel.md
 create mode 100644 python/functions/datascience/extract_relations_glirel.py
 create mode 100644 python/functions/datascience/glirel_load_model.md
 create mode 100644 python/functions/datascience/glirel_load_model.py

diff --git a/python/functions/datascience/extract_relations_glirel.md b/python/functions/datascience/extract_relations_glirel.md
new file mode 100644
index 00000000..242fc3de
--- /dev/null
+++ b/python/functions/datascience/extract_relations_glirel.md
@@ -0,0 +1,131 @@
+---
+name: extract_relations_glirel
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def extract_relations_glirel(text: str, entities: list[EntityCandidate], relation_types: list[str], model: Any, threshold: float = 0.5, max_pairs: int = 0) -> list[RelationCandidate]"
+description: "Extrae relaciones zero-shot con GLiREL. Drop-in del contrato de extract_relations_llm pero sin coste por token y mas rapido para corpus grandes. Tokeniza por whitespace, mapea spans de entidades (de attributes['start'/'end'] o fallback text.find) a indices de tokens, y devuelve RelationCandidate cuyos from_name/to_name siempre coinciden con entidades del input."
+tags: [glirel, relation, nlp, extract, zero-shot, knowledge-graph, fuzzygraph, graph, datascience, python]
+uses_functions: [glirel_load_model_py_datascience]
+uses_types:
+  - entity_candidate_py_datascience
+  - relation_candidate_py_datascience
+returns:
+  - relation_candidate_py_datascience
+returns_optional: false
+error_type: "error_go_core"
+imports: [warnings, re]
+params:
+  - name: text
+    desc: "mismo chunk de texto que se uso para extraer las entidades (parrafo, doc corto)"
+  - name: entities
+    desc: "lista de EntityCandidate ya extraidas (de extract_entities_gliner, extract_entities_llm o regex). Si tienen attributes['start'/'end'] se usan; si no, fallback a text.find(name) con warning."
+  - name: relation_types
+    desc: "tipos de relacion permitidos, ej: ['works_for','owns','communicated_with']. Vacio lanza ValueError."
+  - name: model
+    desc: "instancia GLiREL cargada con glirel_load_model. Inyectar para evitar penalty de carga en batch."
+  - name: threshold
+    desc: "score minimo para aceptar una relacion (0.0-1.0). Defecto 0.5."
+  - name: max_pairs
+    desc: "0 = todas las relaciones encontradas. >0 = top N por score (descarta el resto)."
+output: "lista de RelationCandidate(from_name, to_name, relation_type, description='', confidence). from_name/to_name siempre coinciden con entidades del input."
+tested: true
+tests:
+  - "Schema basico y modelo stub retorna RelationCandidate triplets validos"
+  - "Threshold se propaga al modelo"
+  - "relation_types vacio lanza ValueError"
+  - "Menos de 2 entidades retorna vacio"
+  - "Entidad sin offsets usa fallback text.find con warning"
+  - "Entidad cuyo nombre no aparece en el texto se descarta"
+  - "Excepcion del modelo se captura y retorna vacio"
+  - "Relation_type fuera del set permitido se descarta"
+  - "max_pairs=N limita el output a top N por score"
+  - "head_pos/tail_pos resuelven entidades por posicion de token"
+  - "Fallback por head_text/tail_text si head_pos no esta presente"
+test_file_path: "python/functions/datascience/tests/test_extract_relations_glirel.py"
+file_path: "python/functions/datascience/extract_relations_glirel.py"
+---
+
+## Ejemplo
+
+```python
+from python.functions.datascience import (
+    glirel_load_model,
+    extract_relations_glirel,
+)
+from python.types.datascience.entity_candidate import EntityCandidate
+
+model = glirel_load_model(device="auto")
+
+text = "Alice Johnson works at OpenAI in San Francisco."
+entities = [
+    EntityCandidate(name="Alice Johnson", type_label="Person",
+                    attributes={"start": 0, "end": 13}, confidence=0.92),
+    EntityCandidate(name="OpenAI", type_label="Organization",
+                    attributes={"start": 23, "end": 29}, confidence=0.87),
+    EntityCandidate(name="San Francisco", type_label="Location",
+                    attributes={"start": 33, "end": 46}, confidence=0.81),
+]
+
+relations = extract_relations_glirel(
+    text=text,
+    entities=entities,
+    relation_types=["works_for", "located_in", "owns"],
+    model=model,
+    threshold=0.5,
+)
+# [RelationCandidate(from_name='Alice Johnson', to_name='OpenAI',
+#                    relation_type='works_for', confidence=0.91), ...]
+```
+
+## Drop-in con extract_relations_llm
+
+El retorno es identico (`list[RelationCandidate]`) y `from_name`/`to_name` siempre
+coinciden con entidades del input — `deduplicate_relations_py_datascience` lo
+acepta sin cambios. Diferencias:
+
+- **Coste**: GLiREL = 0 USD/token. LLM = depende del modelo.
+- **Latencia**: GLiREL es mucho mas rapido en GPU; en CPU depende del numero de
+  pares (entidades x relation_types).
+- **Razonamiento implicito**: el LLM lo deduce ("CEO de la empresa" -> persona
+  works_for empresa); GLiREL solo extrae lo explicito en el texto.
+- **Esquemas grandes**: GLiREL escala bien con muchos relation_types; el LLM
+  pierde foco con esquemas muy largos.
+- **Idiomas**: GLiREL-large-v0 esta entrenado principalmente en ingles. Para ES
+  evaluar precision/recall caso a caso o caer al LLM.
+
+## Spans de entidades
+
+GLiREL necesita los spans (token indices) de cada entidad en el texto. Esta funcion:
+
+1. Lee `attributes["start"]` y `attributes["end"]` (offsets de caracteres) si
+   existen — el output natural de `extract_entities_gliner` y `extract_iocs`.
+2. Si faltan, usa `text.find(entity.name)` como fallback (con warning).
+3. Tokeniza por whitespace y mapea cada char span a un span de tokens
+   (`[start_token, end_token]`).
+4. Pasa todo a `model.predict_relations(tokens, labels=..., ner=...)`.
+
+Si la entidad no se puede localizar en el texto, se descarta (no se le pueden
+buscar relaciones sin saber donde esta).
+
+## Notas
+
+- impure: el modelo es estado externo. `error_type: error_go_core` segun la regla
+  de pureza del registry.
+- Si dos entidades tienen el mismo nombre, GLiREL podria mezclarlas; el matcheo
+  por `head_pos`/`tail_pos` (token start) las distingue mejor que `head_text`.
+- Una `relation_type` que no aparece en el output NO es un error — solo significa
+  que GLiREL no encontro evidencia.
+- Combinar con LLM para razonamiento implicito: ver issue 0040 (pipeline hibrido).
+- Para precision maxima, ajustar `threshold` por dominio: 0.3-0.4 = recall alto;
+  0.6-0.8 = precision alta.
+
+## Limitacion
+
+GLiREL es bueno para relaciones explicitas en el texto (`X trabaja en Y`,
+`A llamo a B`), malo para razonamiento implicito (`la nueva CEO`, `su empresa`).
+Para razonamiento implicito seguir usando `extract_relations_llm`. El pipeline
+hibrido (issue 0040) compone GLiREL para extraccion masiva + LLM para los casos
+implicitos que GLiREL no cubre.
diff --git a/python/functions/datascience/extract_relations_glirel.py b/python/functions/datascience/extract_relations_glirel.py
new file mode 100644
index 00000000..59c3111d
--- /dev/null
+++ b/python/functions/datascience/extract_relations_glirel.py
@@ -0,0 +1,227 @@
+"""Extrae relaciones entre entidades usando GLiREL (zero-shot relation extraction)."""
+
+from __future__ import annotations
+
+import os
+import re
+import sys
+import warnings
+from typing import Any
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from python.types.datascience.entity_candidate import EntityCandidate
+from python.types.datascience.relation_candidate import RelationCandidate
+
+
+_TOKEN_RE = re.compile(r"\S+")  # One token = one maximal run of non-whitespace characters.
+
+
+def _tokenize_with_offsets(text: str) -> list[tuple[str, int, int]]:
+    """Split on whitespace and return [(token, char_start, char_end)], end exclusive."""
+    return [(m.group(), m.start(), m.end()) for m in _TOKEN_RE.finditer(text)]
+
+
+def _char_span_to_token_span(
+    char_start: int,
+    char_end: int,
+    tokens_with_offsets: list[tuple[str, int, int]],
+) -> tuple[int, int] | None:
+    """Map a half-open character span to inclusive token indices (first_token, last_token).
+
+    Returns None when no token overlaps the span.
+    """
+    start_idx: int | None = None
+    end_idx: int | None = None
+    for i, (_tok, ts, te) in enumerate(tokens_with_offsets):
+        # A token overlaps [char_start, char_end) when the two ranges intersect.
+        if ts < char_end and te > char_start:
+            if start_idx is None:
+                start_idx = i  # first overlapping token
+            end_idx = i  # keeps advancing to the last overlapping token
+    if start_idx is None or end_idx is None:
+        return None
+    return (start_idx, end_idx)
+
+
+def _resolve_entity_char_span(
+    entity: EntityCandidate,
+    text: str,
+) -> tuple[int, int] | None:
+    """Return (start, end) for an entity from its attributes, else via text.find; None if not found."""
+    start = entity.attributes.get("start") if entity.attributes else None
+    end = entity.attributes.get("end") if entity.attributes else None
+    if isinstance(start, int) and isinstance(end, int) and 0 <= start < end <= len(text):
+        return (start, end)
+
+    # Fallback: look for the first occurrence of the entity name in the text.
+    if not entity.name:
+        return None
+    found = text.find(entity.name)
+    if found < 0:
+        warnings.warn(
+            f"extract_relations_glirel: entidad '{entity.name}' sin offsets y no se "
+            f"encuentra en text.find — descartando.",
+            stacklevel=3,
+        )
+        return None
+    warnings.warn(
+        f"extract_relations_glirel: entidad '{entity.name}' sin offsets en attributes; "
+        f"usando text.find como fallback.",
+        stacklevel=3,
+    )
+    return (found, found + len(entity.name))
+
+
+def extract_relations_glirel(
+    text: str,
+    entities: list[EntityCandidate],
+    relation_types: list[str],
+    model: Any,
+    threshold: float = 0.5,
+    max_pairs: int = 0,
+) -> list[RelationCandidate]:
+    """Extract zero-shot relations with GLiREL; meant as a drop-in for `extract_relations_llm`.
+
+    GLiREL takes tokens plus entity spans expressed as token indices. This function
+    tokenizes the text on whitespace, maps the character span of each
+    `EntityCandidate` (from `attributes['start'/'end']`, or `text.find(name)` as a
+    fallback) to a token span, and converts the model output to `RelationCandidate`.
+
+    Args:
+        text: The same chunk the entities were extracted from.
+        entities: Already-extracted entities (from GLiNER, an LLM or regex). When
+            `attributes['start']` and `['end']` are present they are used; otherwise
+            `text.find(name)` is used and a warning is emitted.
+        relation_types: Allowed relation types, e.g. `["works_for", "owns"]`.
+        model: Object exposing `predict_relations` (e.g. from `glirel_load_model`).
+            Supplied by the caller so the model is not loaded on every call.
+        threshold: Minimum score, forwarded as-is to `model.predict_relations`.
+        max_pairs: 0 = keep every relation; >0 = keep the top N by confidence.
+
+    Returns:
+        List of RelationCandidate whose from_name/to_name are names of input
+        entities. Empty when fewer than 2 entities can be located, when the model
+        raises (a warning is emitted) or returns a non-list, or nothing matches.
+
+    Raises:
+        ValueError: If `relation_types` is empty.
+    """
+    if not relation_types:
+        raise ValueError("relation_types no puede estar vacio")
+    if len(entities) < 2:
+        return []
+
+    tokens_with_offsets = _tokenize_with_offsets(text)
+    if not tokens_with_offsets:
+        return []
+    tokens = [tok for tok, _s, _e in tokens_with_offsets]
+
+    # Map token_start_idx -> EntityCandidate (used to resolve model output by position).
+    token_start_to_entity: dict[int, EntityCandidate] = {}
+    ner_spans: list[list] = []
+    entity_names_set = {e.name for e in entities if e.name}
+
+    for ent in entities:
+        char_span = _resolve_entity_char_span(ent, text)
+        if char_span is None:
+            continue
+        token_span = _char_span_to_token_span(char_span[0], char_span[1], tokens_with_offsets)
+        if token_span is None:
+            continue
+        start_tok, end_tok = token_span
+        # GLiREL expects ner as [start_idx, end_idx, type_label] (token-level).
+        ner_spans.append([start_tok, end_tok, ent.type_label or ent.type_ref or "Entity"])
+        # Last one wins if two entities share token_start (unlikely).
+        token_start_to_entity[start_tok] = ent
+
+    if len(ner_spans) < 2:
+        return []
+
+    try:
+        raw = model.predict_relations(
+            tokens,
+            labels=list(relation_types),
+            threshold=threshold,
+            ner=ner_spans,
+            top_k=1,
+        )
+    except Exception as exc:
+        warnings.warn(
+            f"extract_relations_glirel: error invocando model.predict_relations: {exc}",
+            stacklevel=2,
+        )
+        return []
+
+    if not isinstance(raw, list):
+        warnings.warn(
+            "extract_relations_glirel: predict_relations no retorno una lista; "
+            "retornando vacio.",
+            stacklevel=2,
+        )
+        return []
+
+    relation_types_set = set(relation_types)
+    candidates: list[RelationCandidate] = []
+    for item in raw:
+        if not isinstance(item, dict):
+            continue
+
+        relation_type = item.get("label", "")
+        if relation_type not in relation_types_set:
+            continue
+
+        score = item.get("score", 0.0)
+        if not isinstance(score, (int, float)):
+            score = 0.0
+        confidence = float(max(0.0, min(1.0, score)))  # clamp to [0.0, 1.0]
+
+        head_pos = item.get("head_pos")
+        tail_pos = item.get("tail_pos")
+        head_entity: EntityCandidate | None = None
+        tail_entity: EntityCandidate | None = None
+
+        if isinstance(head_pos, (list, tuple)) and head_pos:
+            head_entity = token_start_to_entity.get(int(head_pos[0]))
+        if isinstance(tail_pos, (list, tuple)) and tail_pos:
+            tail_entity = token_start_to_entity.get(int(tail_pos[0]))
+
+        # Fallback: match by text when head_pos/tail_pos did not resolve to an entity.
+        if head_entity is None:
+            head_text = _stringify_span(item.get("head_text"))
+            if head_text in entity_names_set:
+                head_entity = next((e for e in entities if e.name == head_text), None)
+        if tail_entity is None:
+            tail_text = _stringify_span(item.get("tail_text"))
+            if tail_text in entity_names_set:
+                tail_entity = next((e for e in entities if e.name == tail_text), None)
+
+        if head_entity is None or tail_entity is None:
+            continue
+        if head_entity.name == tail_entity.name:  # self-loop check is by name, not position
+            continue
+
+        candidates.append(
+            RelationCandidate(
+                from_name=head_entity.name,
+                to_name=tail_entity.name,
+                relation_type=relation_type,
+                description="",
+                confidence=confidence,
+            )
+        )
+
+    if max_pairs > 0 and len(candidates) > max_pairs:  # sorted only when truncating
+        candidates.sort(key=lambda r: r.confidence, reverse=True)
+        candidates = candidates[:max_pairs]
+
+    return candidates
+
+
+def _stringify_span(value: Any) -> str:
+    """Flatten a head_text/tail_text value (str or list/tuple of parts) into one string."""
+    if isinstance(value, str):
+        return value
+    if isinstance(value, (list, tuple)):
+        return " ".join(str(v) for v in value)  # parts are joined with single spaces
+    return ""
diff --git a/python/functions/datascience/glirel_load_model.md b/python/functions/datascience/glirel_load_model.md
new file mode 100644
index 00000000..cf7d9c95
--- /dev/null
+++ b/python/functions/datascience/glirel_load_model.md
@@ -0,0 +1,72 @@
+---
+name: glirel_load_model
+kind: function
+lang: py
+domain: datascience
+version: "1.0.0"
+purity: impure
+signature: "def glirel_load_model(model_name: str = 'jackboyla/glirel-large-v0', device: str = 'auto') -> Any"
+description: "Carga (y cachea por (model_name, device)) un modelo GLiREL zero-shot relation extraction. La primera llamada descarga ~500 MB desde HuggingFace; sucesivas devuelven la instancia cacheada. device='auto' usa CUDA si esta disponible, o CPU."
+tags: [glirel, relation, nlp, model, huggingface, zero-shot, datascience, python]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: []
+params:
+  - name: model_name
+    desc: "ID del modelo en HuggingFace Hub (defecto: jackboyla/glirel-large-v0)"
+  - name: device
+    desc: "'auto' (CUDA si disponible, sino CPU), 'cpu', 'cuda', 'cuda:N'"
+output: "instancia GLiREL lista para predict_relations, cacheada por (model_name, device)"
+tested: true
+tests:
+  - "ImportError si glirel no esta instalado"
+  - "Cache devuelve la misma instancia con los mismos parametros"
+  - "device='auto' resuelve a cpu o cuda segun torch.cuda.is_available"
+test_file_path: "python/functions/datascience/tests/test_extract_relations_glirel.py"
+file_path: "python/functions/datascience/glirel_load_model.py"
+---
+
+## Ejemplo
+
+```python
+from python.functions.datascience import glirel_load_model
+
+# Primera llamada descarga el modelo (~500 MB, una vez)
+model = glirel_load_model(device="auto")
+
+# Llamadas sucesivas con mismos params devuelven el cache
+model_again = glirel_load_model(device="auto")
+assert model is model_again
+```
+
+## Instalacion
+
+GLiREL no esta en las dependencias principales del registry. Para usarlo:
+
+```bash
+cd python && uv pip install glirel             # solo glirel
+cd python && uv pip install -e '.[nlp]'        # extra completo (gliner + glirel)
+```
+
+## Tamaño y latencia
+
+- `jackboyla/glirel-large-v0`: ~500 MB en disco (modelo + tokenizer).
+- Primera carga: 8-20 s en CPU, depende del disco y red.
+- Inferencia CPU: depende del numero de pares entidad x relation_types. 5-20 pares/s
+  con esquema pequeño (5 relation types).
+- Inferencia GPU (CUDA T4): 50-200x mas rapido que CPU.
+
+## Notas
+
+- El cache es por (model_name, device ya resuelto): `device='auto'` comparte entrada con
+  `'cpu'`/`'cuda'`; cargar el mismo modelo en CPU y CUDA crea dos instancias (intencional, para A/B).
+- Si `torch` no esta instalado y `device='auto'`, cae a `'cpu'` sin error.
+- Para limpiar el cache (memoria GPU): borrar entradas de `_MODEL_CACHE` directamente
+  o reiniciar el proceso.
+- impure: lee disco/red la primera vez y mantiene estado en `_MODEL_CACHE`.
+- GLiREL es bueno para relaciones explicitas en el texto (`X trabaja en Y`, `A llamo a B`),
+  malo para razonamiento implicito ("CEO de la empresa"). Para razonamiento implicito
+  seguir usando `extract_relations_llm`.
diff --git a/python/functions/datascience/glirel_load_model.py b/python/functions/datascience/glirel_load_model.py
new file mode 100644
index 00000000..8f83ae74
--- /dev/null
+++ b/python/functions/datascience/glirel_load_model.py
@@ -0,0 +1,63 @@
+"""Carga (y cachea) un modelo GLiREL en el device deseado."""
+
+from __future__ import annotations
+
+from typing import Any
+
+# Module-level cache: (model_name, resolved device) -> loaded model.
+_MODEL_CACHE: dict[tuple[str, str], Any] = {}
+
+
+def _resolve_device(device: str) -> str:
+    """Resolve `device='auto'` to `cuda` or `cpu`; any other value is returned unchanged."""
+    if device != "auto":
+        return device
+    try:
+        import torch
+    except ImportError:
+        return "cpu"  # torch missing: fall back to CPU instead of failing
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def glirel_load_model(
+    model_name: str = "jackboyla/glirel-large-v0",
+    device: str = "auto",
+) -> Any:
+    """Load a GLiREL model, cached by (model_name, resolved device).
+
+    The first call loads the model via `GLiREL.from_pretrained` (a download of
+    roughly 500 MB for `glirel-large-v0`, per the project docs). Later calls
+    with the same parameters return the cached instance.
+
+    Args:
+        model_name: Model ID on the HuggingFace Hub.
+        device: 'auto' picks CUDA when available; 'cpu'/'cuda'/'cuda:N' are
+            used exactly as given.
+
+    Returns:
+        The GLiREL model instance, ready for `predict_relations`.
+
+    Raises:
+        ImportError: if the `glirel` dependency is not installed.
+            Fix: `uv pip install glirel`, or install the project's `nlp`
+            extra (`uv pip install -e '.[nlp]'`).
+    """
+    resolved_device = _resolve_device(device)
+    cache_key = (model_name, resolved_device)
+    cached = _MODEL_CACHE.get(cache_key)
+    if cached is not None:
+        return cached
+
+    try:
+        from glirel import GLiREL
+    except ImportError as exc:
+        raise ImportError(
+            "glirel no esta instalado. Instalalo con "
+            "`uv pip install glirel` o `uv pip install -e '.[nlp]'`."
+        ) from exc
+
+    model = GLiREL.from_pretrained(model_name)
+    if hasattr(model, "to"):  # only move the model when it exposes `.to`
+        model.to(resolved_device)
+    _MODEL_CACHE[cache_key] = model
+    return model
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 63f4fad7..ec166b3c 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
 [project.optional-dependencies]
 nlp = [
     "gliner>=0.2.13",
+    "glirel>=1.0.0",
 ]
 
 [dependency-groups]

From 3bf2ed6a5b55cc7b9e951fba9f3ba11015a4f839 Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:41:14 +0200
Subject: [PATCH 8/9] test(datascience): corpus stub para glirel_load_model +
 extract_relations_glirel

17 casos: helpers de tokenizacion/mapeo, schema basico con head_pos/tail_pos,
fallback por head_text, threshold, max_pairs, self-loops, ImportError, cache,
device='auto'.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../tests/test_extract_relations_glirel.py    | 314 ++++++++++++++++++
 1 file changed, 314 insertions(+)
 create mode 100644 python/functions/datascience/tests/test_extract_relations_glirel.py

diff --git a/python/functions/datascience/tests/test_extract_relations_glirel.py b/python/functions/datascience/tests/test_extract_relations_glirel.py
new file mode 100644
index 00000000..43a23b97
--- /dev/null
+++ b/python/functions/datascience/tests/test_extract_relations_glirel.py
@@ -0,0 +1,314 @@
+"""Tests para extract_relations_glirel y glirel_load_model.
+
+El modelo real (glirel) es opcional y pesa ~500 MB. Estos tests usan un stub
+duck-typed para validar el contrato sin descargar el modelo.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from dataclasses import dataclass
+
+import pytest
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "..", ".."))
+
+from python.functions.datascience.extract_relations_glirel import (
+    _char_span_to_token_span,
+    _tokenize_with_offsets,
+    extract_relations_glirel,
+)
+from python.functions.datascience.glirel_load_model import (
+    _MODEL_CACHE,
+    _resolve_device,
+    glirel_load_model,
+)
+from python.types.datascience.entity_candidate import EntityCandidate
+from python.types.datascience.relation_candidate import RelationCandidate
+
+
+def _ent(name: str, type_label: str, start: int, end: int) -> EntityCandidate:  # entity with offsets
+    return EntityCandidate(
+        name=name,
+        type_label=type_label,
+        type_ref=f"{type_label.lower()}_ref",
+        attributes={"start": start, "end": end},
+        confidence=0.9,
+    )
+
+
+@dataclass
+class StubModel:
+    """Stub model that records its call arguments and returns a preset list (or raises)."""
+
+    response: list[dict]  # value returned by predict_relations
+    raise_exc: Exception | None = None  # raised by predict_relations when set
+    last_kwargs: dict | None = None  # arguments of the most recent call
+
+    def predict_relations(self, tokens, labels, threshold, ner, top_k):
+        self.last_kwargs = {
+            "tokens": list(tokens),
+            "labels": list(labels),
+            "threshold": threshold,
+            "ner": [list(s) for s in ner],
+            "top_k": top_k,
+        }
+        if self.raise_exc is not None:
+            raise self.raise_exc
+        return self.response
+
+
+# ---------- helpers ----------
+
+
+def test_tokenize_with_offsets_devuelve_indices_correctos():  # tokens and their char offsets
+    text = "Alice Johnson works at OpenAI."
+    out = _tokenize_with_offsets(text)
+    assert [t for t, _, _ in out] == ["Alice", "Johnson", "works", "at", "OpenAI."]
+    assert out[0][1:] == (0, 5)
+    assert out[1][1:] == (6, 13)
+    assert out[4][1:] == (23, 30)
+
+
+def test_char_span_to_token_span_solapa_correctamente():
+    tokens = _tokenize_with_offsets("Alice Johnson works at OpenAI.")
+    # "Alice Johnson" (0..13) -> tokens 0..1
+    assert _char_span_to_token_span(0, 13, tokens) == (0, 1)
+    # "OpenAI" (23..29) -> token 4
+    assert _char_span_to_token_span(23, 29, tokens) == (4, 4)
+    # span outside the text -> None
+    assert _char_span_to_token_span(100, 200, tokens) is None
+
+
+# ---------- extract_relations_glirel ----------
+
+
+def test_schema_basico_y_modelo_stub_retorna_relation_candidate():
+    text = "Alice Johnson works at OpenAI in San Francisco."
+    entities = [
+        _ent("Alice Johnson", "Person", 0, 13),
+        _ent("OpenAI", "Organization", 23, 29),
+        _ent("San Francisco", "Location", 33, 46),
+    ]
+    relation_types = ["works_for", "located_in", "owns"]
+
+    # Tokens: [Alice, Johnson, works, at, OpenAI, in, San, Francisco.]
+    # Alice Johnson -> tokens 0..1, OpenAI -> token 4, San Francisco. -> tokens 6..7
+    model = StubModel(response=[
+        {"head_pos": [0, 1], "tail_pos": [4, 4],
+         "head_text": ["Alice", "Johnson"], "tail_text": ["OpenAI"],
+         "label": "works_for", "score": 0.91},
+        {"head_pos": [4, 4], "tail_pos": [6, 7],
+         "head_text": ["OpenAI"], "tail_text": ["San", "Francisco."],
+         "label": "located_in", "score": 0.78},
+    ])
+
+    out = extract_relations_glirel(text, entities, relation_types, model)
+    assert len(out) == 2
+    assert all(isinstance(r, RelationCandidate) for r in out)
+
+    works = next(r for r in out if r.relation_type == "works_for")
+    assert works.from_name == "Alice Johnson"
+    assert works.to_name == "OpenAI"
+    assert pytest.approx(works.confidence, 0.001) == 0.91
+
+    located = next(r for r in out if r.relation_type == "located_in")
+    assert located.from_name == "OpenAI"
+    # Entity name "San Francisco" vs tokens "San Francisco." (trailing period attached).
+    # Matching goes through head_pos/tail_pos (token start = 6), so it must resolve
+    # to the entity EntityCandidate("San Francisco", start=33).
+    assert located.to_name == "San Francisco"
+
+
+def test_threshold_se_propaga_al_modelo():  # threshold, labels and top_k=1 reach the model
+    text = "Alice works at OpenAI."
+    entities = [
+        _ent("Alice", "Person", 0, 5),
+        _ent("OpenAI", "Organization", 15, 21),
+    ]
+    model = StubModel(response=[])
+    extract_relations_glirel(text, entities, ["works_for"], model, threshold=0.7)
+    assert model.last_kwargs["threshold"] == 0.7
+    assert model.last_kwargs["labels"] == ["works_for"]
+    assert model.last_kwargs["top_k"] == 1
+
+
+def test_relation_types_vacio_lanza_value_error():  # empty relation_types is rejected
+    entities = [_ent("Alice", "Person", 0, 5), _ent("Bob", "Person", 6, 9)]
+    with pytest.raises(ValueError):
+        extract_relations_glirel("Alice y Bob", entities, [], StubModel(response=[]))
+
+
+def test_menos_de_dos_entidades_retorna_vacio():  # a single entity yields no relations
+    entities = [_ent("Alice", "Person", 0, 5)]
+    out = extract_relations_glirel("Alice", entities, ["works_for"], StubModel(response=[]))
+    assert out == []
+
+
+def test_entidad_sin_offsets_usa_fallback_text_find_con_warning():  # located via text.find
+    text = "Alice works at OpenAI."
+    entities = [
+        EntityCandidate(name="Alice", type_label="Person", confidence=0.9),
+        EntityCandidate(name="OpenAI", type_label="Organization", confidence=0.9),
+    ]
+    model = StubModel(response=[
+        {"head_pos": [0, 0], "tail_pos": [3, 3],
+         "head_text": ["Alice"], "tail_text": ["OpenAI."],
+         "label": "works_for", "score": 0.85},
+    ])
+    with pytest.warns(UserWarning, match="sin offsets"):
+        out = extract_relations_glirel(text, entities, ["works_for"], model)
+    assert len(out) == 1
+    assert out[0].from_name == "Alice"
+    assert out[0].to_name == "OpenAI"
+
+
+def test_entidad_no_encontrada_en_texto_se_descarta():
+    text = "Alice y Bob hablan."
+    entities = [
+        EntityCandidate(name="Alice", type_label="Person", confidence=0.9),
+        EntityCandidate(name="Carmen", type_label="Person", confidence=0.9),  # not in the text
+        EntityCandidate(name="Bob", type_label="Person", confidence=0.9),
+    ]
+    model = StubModel(response=[
+        {"head_pos": [0, 0], "tail_pos": [2, 2],
+         "head_text": ["Alice"], "tail_text": ["Bob"],
+         "label": "communicated_with", "score": 0.8},
+    ])
+    with pytest.warns(UserWarning):
+        out = extract_relations_glirel(text, entities, ["communicated_with"], model)
+    # Carmen is dropped while ner_spans is built, but the other two entities remain.
+    # The model receives only the 2 valid spans.
+    assert len(out) == 1
+    assert out[0].from_name == "Alice"
+    assert out[0].to_name == "Bob"
+
+
+def test_excepcion_del_modelo_se_captura():  # a model exception becomes a warning plus []
+    entities = [_ent("Alice", "Person", 0, 5), _ent("Bob", "Person", 8, 11)]
+    model = StubModel(response=[], raise_exc=RuntimeError("model exploded"))
+    with pytest.warns(UserWarning):
+        out = extract_relations_glirel("Alice y Bob.", entities, ["works_for"], model)
+    assert out == []
+
+
+def test_relation_type_fuera_del_set_se_descarta():  # labels outside relation_types are dropped
+    text = "Alice works at OpenAI."
+    entities = [
+        _ent("Alice", "Person", 0, 5),
+        _ent("OpenAI", "Organization", 15, 21),
+    ]
+    model = StubModel(response=[
+        {"head_pos": [0, 0], "tail_pos": [3, 3],
+         "head_text": ["Alice"], "tail_text": ["OpenAI."],
+         "label": "unknown_relation", "score": 0.95},
+    ])
+    out = extract_relations_glirel(text, entities, ["works_for"], model)
+    assert out == []
+
+
+def test_max_pairs_limita_top_n():
+    text = "Alice works at OpenAI in San Francisco."
+    entities = [
+        _ent("Alice", "Person", 0, 5),
+        _ent("OpenAI", "Organization", 15, 21),
+        _ent("San Francisco", "Location", 25, 38),
+    ]
+    relation_types = ["works_for", "located_in", "lived_in"]
+    model = StubModel(response=[
+        {"head_pos": [0, 0], "tail_pos": [3, 3], "label": "works_for", "score": 0.55,
+         "head_text": ["Alice"], "tail_text": ["OpenAI"]},
+        {"head_pos": [3, 3], "tail_pos": [5, 6], "label": "located_in", "score": 0.92,
+         "head_text": ["OpenAI"], "tail_text": ["San", "Francisco."]},
+        {"head_pos": [0, 0], "tail_pos": [5, 6], "label": "lived_in", "score": 0.71,
+         "head_text": ["Alice"], "tail_text": ["San", "Francisco."]},
+    ])
+    out = extract_relations_glirel(text, entities, relation_types, model, max_pairs=2)
+    assert len(out) == 2
+    confidences = [r.confidence for r in out]
+    # Top 2 by score: 0.92 and 0.71
+    assert confidences == sorted(confidences, reverse=True)
+    assert max(confidences) == pytest.approx(0.92, 0.001)
+    assert min(confidences) == pytest.approx(0.71, 0.001)
+
+
+def test_fallback_por_head_text_si_head_pos_no_esta():
+    text = "Alice works at OpenAI."
+    entities = [
+        _ent("Alice", "Person", 0, 5),
+        _ent("OpenAI", "Organization", 15, 21),
+    ]
+    model = StubModel(response=[
+        # No head_pos/tail_pos, so matching falls back to text.
+        {"head_text": "Alice", "tail_text": "OpenAI",
+         "label": "works_for", "score": 0.8},
+    ])
+    out = extract_relations_glirel(text, entities, ["works_for"], model)
+    assert len(out) == 1
+    assert out[0].from_name == "Alice"
+    assert out[0].to_name == "OpenAI"
+
+
+def test_self_loops_se_descartan():
+    """head and tail resolve to the same entity -> the relation is dropped."""
+    text = "Alice talks to Alice."
+    entities = [_ent("Alice", "Person", 0, 5), _ent("Alice", "Person", 15, 20)]
+    model = StubModel(response=[
+        {"head_pos": [0, 0], "tail_pos": [0, 0],
+         "head_text": ["Alice"], "tail_text": ["Alice"],
+         "label": "communicated_with", "score": 0.9},
+    ])
+    out = extract_relations_glirel(text, entities, ["communicated_with"], model)
+    assert out == []
+
+
+# ---------- glirel_load_model ----------
+
+
+def test_import_error_si_glirel_no_esta_instalado(monkeypatch):
+    """ImportError is raised when glirel is not installed."""
+    _MODEL_CACHE.clear()
+
+    real_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name == "glirel" or name.startswith("glirel."):
+            raise ImportError("glirel not installed (simulated)")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", fake_import)
+
+    with pytest.raises(ImportError, match="glirel no esta instalado"):
+        glirel_load_model(model_name="dummy/model", device="cpu")
+
+
+def test_cache_devuelve_la_misma_instancia():
+    """The cache returns the same instance for the same parameters."""
+    _MODEL_CACHE.clear()
+    sentinel = object()
+    _MODEL_CACHE[("dummy/model", "cpu")] = sentinel
+
+    out = glirel_load_model(model_name="dummy/model", device="cpu")
+    assert out is sentinel
+
+    _MODEL_CACHE.clear()
+
+
+def test_resolve_device_explicito_se_respeta():  # explicit devices pass through unchanged
+    assert _resolve_device("cpu") == "cpu"
+    assert _resolve_device("cuda") == "cuda"
+    assert _resolve_device("cuda:0") == "cuda:0"
+
+
+def test_resolve_device_auto_cae_a_cpu_sin_torch(monkeypatch):
+    """device='auto' resolves to cpu when torch cannot be imported."""
+    real_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name == "torch":
+            raise ImportError("torch missing")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", fake_import)
+    assert _resolve_device("auto") == "cpu"

From 7f598e17a181d1d2bbca1e16237103b26f942c7e Mon Sep 17 00:00:00 2001
From: egutierrez <egutierrez@dead.dd>
Date: Thu, 30 Apr 2026 16:41:18 +0200
Subject: [PATCH 9/9] =?UTF-8?q?docs(issues):=20cerrar=200039=20=E2=80=94?=
 =?UTF-8?q?=20GLiREL=20relation=20extractor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dev/issues/README.md                                         | 2 +-
 dev/issues/{ => completed}/0039-glirel-relation-extractor.md | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename dev/issues/{ => completed}/0039-glirel-relation-extractor.md (100%)

diff --git a/dev/issues/README.md b/dev/issues/README.md
index e9c49bf6..5b8ffd00 100644
--- a/dev/issues/README.md
+++ b/dev/issues/README.md
@@ -44,7 +44,7 @@
 | [0036](0036-cpp-image-canvas-webcam.md) | C++ image_canvas + webcam_texture | pendiente | baja | feature | — |
 | [0037](completed/0037-ioc-regex-extractor.md) | IoC regex extractor (IP, email, dominio, hash, wallet, CVE, MAC) | completado | alta | feature | — |
 | [0038](completed/0038-gliner-entity-extractor.md) | GLiNER entity extractor (zero-shot NER multilingue) | completado | alta | feature | 0039, 0040 |
-| [0039](0039-glirel-relation-extractor.md) | GLiREL relation extractor (zero-shot triplets) | pendiente | media | feature | 0040 |
+| [0039](completed/0039-glirel-relation-extractor.md) | GLiREL relation extractor (zero-shot triplets) | completado | media | feature | 0040 |
 | [0040](0040-hybrid-extraction-pipeline.md) | Pipeline hibrido extraccion grafos (regex + GLiNER + GLiREL + LLM fallback) | pendiente | media | feature | — |
 | [0041](completed/0041-cpp-app-best-practices.md) | C++ app shell estandarizado (PATTERNS.md + AppConfig extendido) | completado | alta | feature | 0043 |
 | [0042](completed/0042-cpp-layout-storage-public.md) | C++ layout_storage publico (extraer de shaders_lab) | completado | alta | feature | 0043 |
diff --git a/dev/issues/0039-glirel-relation-extractor.md b/dev/issues/completed/0039-glirel-relation-extractor.md
similarity index 100%
rename from dev/issues/0039-glirel-relation-extractor.md
rename to dev/issues/completed/0039-glirel-relation-extractor.md